xref: /xnu-11417.140.69/osfmk/vm/vm_map.c (revision 43a90889846e00bfb5cf1d255cdc0a701a1e05a4)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_map.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Virtual memory mapping module.
64  */
65 
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68 
69 #include <vm/vm_options.h>
70 
71 #include <libkern/OSAtomic.h>
72 
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm_server.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83 
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 #include <kern/telemetry.h>
91 #include <kern/trap_telemetry.h>
92 
93 #include <vm/cpm_internal.h>
94 #include <vm/memory_types.h>
95 #include <vm/vm_compressor_xnu.h>
96 #include <vm/vm_compressor_pager_internal.h>
97 #include <vm/vm_init_xnu.h>
98 #include <vm/vm_fault_internal.h>
99 #include <vm/vm_map_internal.h>
100 #include <vm/vm_object_internal.h>
101 #include <vm/vm_page_internal.h>
102 #include <vm/vm_pageout.h>
103 #include <vm/pmap.h>
104 #include <vm/vm_kern_internal.h>
105 #include <ipc/ipc_port.h>
106 #include <kern/sched_prim.h>
107 #include <kern/misc_protos.h>
108 
109 #include <mach/vm_map_server.h>
110 #include <mach/mach_host_server.h>
111 #include <vm/vm_memtag.h>
112 #include <vm/vm_protos_internal.h>
113 #include <vm/vm_purgeable_internal.h>
114 
115 #include <vm/vm_iokit.h>
116 #include <vm/vm_shared_region_internal.h>
117 #include <vm/vm_map_store_internal.h>
118 #include <vm/vm_memory_entry_xnu.h>
119 #include <vm/memory_object_internal.h>
120 #include <vm/vm_memory_entry.h>
121 #include <vm/vm_sanitize_internal.h>
122 #include <vm/vm_reclaim_xnu.h>
123 #if DEVELOPMENT || DEBUG
124 #include <vm/vm_compressor_info.h>
125 #endif /* DEVELOPMENT || DEBUG */
126 #include <san/kasan.h>
127 
128 #include <sys/resource.h>
129 #include <sys/random.h>
130 #include <sys/codesign.h>
131 #include <sys/code_signing.h>
132 #include <sys/mman.h>
133 #include <sys/reboot.h>
134 #include <sys/kdebug_triage.h>
135 #include <sys/reason.h>
136 
137 #include <os/log.h>
138 
139 #include <libkern/section_keywords.h>
140 
141 #include <os/hash.h>
142 
143 #if DEVELOPMENT || DEBUG
144 extern int proc_selfcsflags(void);
145 int vm_log_xnu_user_debug = 0;
146 int panic_on_unsigned_execute = 0;
147 int panic_on_mlock_failure = 0;
148 #endif /* DEVELOPMENT || DEBUG */
149 
150 #if DEVELOPMENT || DEBUG
151 int debug4k_filter = 0;
152 char debug4k_proc_name[1024] = "";
153 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
154 int debug4k_panic_on_misaligned_sharing = 0;
155 const char *debug4k_category_name[] = {
156 	"error",        /* 0 */
157 	"life",         /* 1 */
158 	"load",         /* 2 */
159 	"fault",        /* 3 */
160 	"copy",         /* 4 */
161 	"share",        /* 5 */
162 	"adjust",       /* 6 */
163 	"pmap",         /* 7 */
164 	"mementry",     /* 8 */
165 	"iokit",        /* 9 */
166 	"upl",          /* 10 */
167 	"exc",          /* 11 */
168 	"vfs"           /* 12 */
169 };
170 #endif /* DEVELOPMENT || DEBUG */
171 int debug4k_no_cow_copyin = 0;
172 
173 
174 #if __arm64__
175 extern const int fourk_binary_compatibility_unsafe;
176 #endif /* __arm64__ */
177 extern int proc_selfpid(void);
178 extern char *proc_name_address(void *p);
179 extern const char *proc_best_name(struct proc *p);
180 
181 #if VM_MAP_DEBUG_APPLE_PROTECT
182 int vm_map_debug_apple_protect = 0;
183 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
184 #if VM_MAP_DEBUG_FOURK
185 int vm_map_debug_fourk = 0;
186 #endif /* VM_MAP_DEBUG_FOURK */
187 
188 #if DEBUG || DEVELOPMENT
189 static TUNABLE(bool, vm_map_executable_immutable,
190     "vm_map_executable_immutable", true);
191 #else
192 #define vm_map_executable_immutable true
193 #endif
194 
195 /** Do not enforce the kernel allocation size limit */
196 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS (0)
197 /** Enforce the kernel allocation limit by refusing too large requests */
198 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_REJECT (1)
199 /** Enforce the kernel allocation limit by panicking on any too large request */
200 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_PANIC (2)
201 /** Do not enforce the kernel allocation limit but generate a telemetry trap */
202 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP (3)
203 
204 #if DEVELOPMENT || DEBUG
205 static TUNABLE(int, vm_map_kernel_alloc_limit_mode,
206     "vm_map_kernel_alloc_limit_mode", VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP);
207 #else
208 #define vm_map_kernel_alloc_limit_mode VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS
209 #endif /* DEVELOPMENT || DEBUG */
210 
211 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
212 
213 extern u_int32_t random(void);  /* from <libkern/libkern.h> */
214 /* Internal prototypes
215  */
216 
217 typedef struct vm_map_zap {
218 	vm_map_entry_t          vmz_head;
219 	vm_map_entry_t         *vmz_tail;
220 } *vm_map_zap_t;
221 
222 #define VM_MAP_ZAP_DECLARE(zap) \
223 	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
224 
225 extern kern_return_t vm_map_wire_external(
226 	vm_map_t                map,
227 	vm_map_offset_ut        start_u,
228 	vm_map_offset_ut        end_u,
229 	vm_prot_ut              prot_u,
230 	boolean_t               user_wire) __exported;
231 
232 #if XNU_PLATFORM_MacOSX
233 extern /* exported via Private.<arch>.MacOSX.exports on macOS */
234 #else
235 static
236 #endif
237 kern_return_t vm_map_copyin_common(
238 	vm_map_t                src_map,
239 	vm_map_address_ut       src_addr,
240 	vm_map_size_ut          len,
241 	boolean_t               src_destroy,
242 	boolean_t               src_volatile,
243 	vm_map_copy_t          *copy_result,                           /* OUT */
244 	boolean_t               use_maxprot);
245 
246 static vm_map_entry_t   vm_map_entry_insert(
247 	vm_map_t                map,
248 	vm_map_entry_t          insp_entry,
249 	vm_map_offset_t         start,
250 	vm_map_offset_t         end,
251 	vm_object_t             object,
252 	vm_object_offset_t      offset,
253 	vm_map_kernel_flags_t   vmk_flags,
254 	boolean_t               needs_copy,
255 	vm_prot_t               cur_protection,
256 	vm_prot_t               max_protection,
257 	vm_inherit_t            inheritance,
258 	boolean_t               clear_map_aligned);
259 
260 static void vm_map_simplify_range(
261 	vm_map_t        map,
262 	vm_map_offset_t start,
263 	vm_map_offset_t end);   /* forward */
264 
265 static boolean_t        vm_map_range_check(
266 	vm_map_t        map,
267 	vm_map_offset_t start,
268 	vm_map_offset_t end,
269 	vm_map_entry_t  *entry);
270 
271 static void vm_map_submap_pmap_clean(
272 	vm_map_t        map,
273 	vm_map_offset_t start,
274 	vm_map_offset_t end,
275 	vm_map_t        sub_map,
276 	vm_map_offset_t offset);
277 
278 static void             vm_map_pmap_enter(
279 	vm_map_t                map,
280 	vm_map_offset_t         addr,
281 	vm_map_offset_t         end_addr,
282 	vm_object_t             object,
283 	vm_object_offset_t      offset,
284 	vm_prot_t               protection);
285 
286 static void             _vm_map_clip_end(
287 	struct vm_map_header    *map_header,
288 	vm_map_entry_t          entry,
289 	vm_map_offset_t         end);
290 
291 static void             _vm_map_clip_start(
292 	struct vm_map_header    *map_header,
293 	vm_map_entry_t          entry,
294 	vm_map_offset_t         start);
295 
296 static kmem_return_t vm_map_delete(
297 	vm_map_t        map,
298 	vm_map_offset_t start,
299 	vm_map_offset_t end,
300 	vmr_flags_t     flags,
301 	kmem_guard_t    guard,
302 	vm_map_zap_t    zap);
303 
304 static void             vm_map_copy_insert(
305 	vm_map_t        map,
306 	vm_map_entry_t  after_where,
307 	vm_map_copy_t   copy);
308 
309 static kern_return_t    vm_map_copy_overwrite_unaligned(
310 	vm_map_t        dst_map,
311 	vm_map_entry_t  entry,
312 	vm_map_copy_t   copy,
313 	vm_map_address_t start,
314 	boolean_t       discard_on_success);
315 
316 static kern_return_t    vm_map_copy_overwrite_aligned(
317 	vm_map_t        dst_map,
318 	vm_map_entry_t  tmp_entry,
319 	vm_map_copy_t   copy,
320 	vm_map_offset_t start,
321 	pmap_t          pmap);
322 
323 static kern_return_t    vm_map_copyin_kernel_buffer(
324 	vm_map_t        src_map,
325 	vm_map_address_t src_addr,
326 	vm_map_size_t   len,
327 	boolean_t       src_destroy,
328 	vm_map_copy_t   *copy_result);  /* OUT */
329 
330 static kern_return_t    vm_map_copyout_kernel_buffer(
331 	vm_map_t        map,
332 	vm_map_address_t *addr, /* IN/OUT */
333 	vm_map_copy_t   copy,
334 	vm_map_size_t   copy_size,
335 	boolean_t       overwrite,
336 	boolean_t       consume_on_success);
337 
338 static void             vm_map_fork_share(
339 	vm_map_t        old_map,
340 	vm_map_entry_t  old_entry,
341 	vm_map_t        new_map);
342 
343 static boolean_t        vm_map_fork_copy(
344 	vm_map_t        old_map,
345 	vm_map_entry_t  *old_entry_p,
346 	vm_map_t        new_map,
347 	int             vm_map_copyin_flags);
348 
349 static kern_return_t    vm_map_wire_nested(
350 	vm_map_t                   map,
351 	vm_map_offset_t            start,
352 	vm_map_offset_t            end,
353 	vm_prot_t                  caller_prot,
354 	vm_tag_t                   tag,
355 	boolean_t                  user_wire,
356 	pmap_t                     map_pmap,
357 	vm_map_offset_t            pmap_addr,
358 	ppnum_t                   *physpage_p);
359 
360 static kern_return_t    vm_map_unwire_nested(
361 	vm_map_t                   map,
362 	vm_map_offset_t            start,
363 	vm_map_offset_t            end,
364 	boolean_t                  user_wire,
365 	pmap_t                     map_pmap,
366 	vm_map_offset_t            pmap_addr);
367 
368 static kern_return_t    vm_map_overwrite_submap_recurse(
369 	vm_map_t                   dst_map,
370 	vm_map_offset_t            dst_addr,
371 	vm_map_size_t              dst_size);
372 
373 static kern_return_t    vm_map_copy_overwrite_nested(
374 	vm_map_t                   dst_map,
375 	vm_map_offset_t            dst_addr,
376 	vm_map_copy_t              copy,
377 	boolean_t                  interruptible,
378 	pmap_t                     pmap,
379 	boolean_t                  discard_on_success);
380 
381 static kern_return_t    vm_map_remap_extract(
382 	vm_map_t                map,
383 	vm_map_offset_t         addr,
384 	vm_map_size_t           size,
385 	boolean_t               copy,
386 	vm_map_copy_t           map_copy,
387 	vm_prot_t               *cur_protection,
388 	vm_prot_t               *max_protection,
389 	vm_inherit_t            inheritance,
390 	vm_map_kernel_flags_t   vmk_flags);
391 
392 static void             vm_map_region_look_for_page(
393 	vm_map_t                   map,
394 	vm_map_offset_t            va,
395 	vm_object_t                object,
396 	vm_object_offset_t         offset,
397 	int                        max_refcnt,
398 	unsigned short             depth,
399 	vm_region_extended_info_t  extended,
400 	mach_msg_type_number_t count);
401 
402 static boolean_t        vm_map_region_has_obj_ref(
403 	vm_map_entry_t             entry,
404 	vm_object_t                object);
405 
406 
407 static kern_return_t    vm_map_willneed(
408 	vm_map_t        map,
409 	vm_map_offset_t start,
410 	vm_map_offset_t end);
411 
412 static kern_return_t    vm_map_reuse_pages(
413 	vm_map_t        map,
414 	vm_map_offset_t start,
415 	vm_map_offset_t end);
416 
417 static kern_return_t    vm_map_reusable_pages(
418 	vm_map_t        map,
419 	vm_map_offset_t start,
420 	vm_map_offset_t end);
421 
422 static kern_return_t    vm_map_can_reuse(
423 	vm_map_t        map,
424 	vm_map_offset_t start,
425 	vm_map_offset_t end);
426 
427 static kern_return_t    vm_map_zero(
428 	vm_map_t        map,
429 	vm_map_offset_t start,
430 	vm_map_offset_t end);
431 
432 static kern_return_t    vm_map_random_address_for_size(
433 	vm_map_t                map,
434 	vm_map_offset_t        *address,
435 	vm_map_size_t           size,
436 	vm_map_kernel_flags_t   vmk_flags);
437 
438 
439 #if CONFIG_MAP_RANGES
440 
441 static vm_map_range_id_t vm_map_user_range_resolve(
442 	vm_map_t                map,
443 	mach_vm_address_t       addr,
444 	mach_vm_address_t       size,
445 	mach_vm_range_t         range);
446 
447 #endif /* CONFIG_MAP_RANGES */
448 #if MACH_ASSERT
449 static kern_return_t    vm_map_pageout(
450 	vm_map_t        map,
451 	vm_map_offset_t start,
452 	vm_map_offset_t end);
453 #endif /* MACH_ASSERT */
454 
455 kern_return_t vm_map_corpse_footprint_collect(
456 	vm_map_t        old_map,
457 	vm_map_entry_t  old_entry,
458 	vm_map_t        new_map);
459 void vm_map_corpse_footprint_collect_done(
460 	vm_map_t        new_map);
461 void vm_map_corpse_footprint_destroy(
462 	vm_map_t        map);
463 kern_return_t vm_map_corpse_footprint_query_page_info(
464 	vm_map_t        map,
465 	vm_map_offset_t va,
466 	int             *disposition_p);
467 void vm_map_footprint_query_page_info(
468 	vm_map_t        map,
469 	vm_map_entry_t  map_entry,
470 	vm_map_offset_t curr_s_offset,
471 	int             *disposition_p);
472 
473 #if CONFIG_MAP_RANGES
474 static void vm_map_range_map_init(void);
475 #endif /* CONFIG_MAP_RANGES */
476 
477 pid_t find_largest_process_vm_map_entries(void);
478 
479 
480 __attribute__((always_inline))
481 int
vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)482 vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)
483 {
484 	int flags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
485 
486 	/* in vmk flags the meaning of fixed/anywhere is inverted */
487 	return flags ^ (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
488 }
489 
490 __attribute__((always_inline, overloadable))
491 void
vm_map_kernel_flags_set_vmflags(vm_map_kernel_flags_t * vmk_flags,int vm_flags,vm_tag_t vm_tag)492 vm_map_kernel_flags_set_vmflags(
493 	vm_map_kernel_flags_t  *vmk_flags,
494 	int                     vm_flags,
495 	vm_tag_t                vm_tag)
496 {
497 	vm_flags ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
498 	vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
499 	vmk_flags->__vm_flags |= (vm_flags & VM_FLAGS_ANY_MASK);
500 	vmk_flags->vm_tag = vm_tag;
501 }
502 
503 __attribute__((always_inline, overloadable))
504 void
vm_map_kernel_flags_set_vmflags(vm_map_kernel_flags_t * vmk_flags,int vm_flags_and_tag)505 vm_map_kernel_flags_set_vmflags(
506 	vm_map_kernel_flags_t  *vmk_flags,
507 	int                     vm_flags_and_tag)
508 {
509 	vm_flags_and_tag ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
510 	vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
511 	vmk_flags->__vm_flags |= (vm_flags_and_tag & VM_FLAGS_ANY_MASK);
512 	VM_GET_FLAGS_ALIAS(vm_flags_and_tag, vmk_flags->vm_tag);
513 }
514 
515 __attribute__((always_inline))
516 void
vm_map_kernel_flags_and_vmflags(vm_map_kernel_flags_t * vmk_flags,int vm_flags_mask)517 vm_map_kernel_flags_and_vmflags(
518 	vm_map_kernel_flags_t  *vmk_flags,
519 	int                     vm_flags_mask)
520 {
521 	/* this function doesn't handle the inverted FIXED/ANYWHERE */
522 	assert(vm_flags_mask & VM_FLAGS_ANYWHERE);
523 	vmk_flags->__vm_flags &= vm_flags_mask;
524 }
525 
526 __attribute__((always_inline))
527 bool
vm_map_kernel_flags_check_vm_and_kflags(vm_map_kernel_flags_t vmk_flags,int vm_flags_mask)528 vm_map_kernel_flags_check_vm_and_kflags(
529 	vm_map_kernel_flags_t   vmk_flags,
530 	int                     vm_flags_mask)
531 {
532 	return (vmk_flags.__vm_flags & ~vm_flags_mask) == 0;
533 }
534 
535 bool
vm_map_kernel_flags_check_vmflags(vm_map_kernel_flags_t vmk_flags,int vm_flags_mask)536 vm_map_kernel_flags_check_vmflags(
537 	vm_map_kernel_flags_t   vmk_flags,
538 	int                     vm_flags_mask)
539 {
540 	int vmflags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
541 
542 	/* Note: up to 16 still has good calling conventions */
543 	static_assert(sizeof(vm_map_kernel_flags_t) == 16);
544 
545 #if DEBUG || DEVELOPMENT
546 	/*
547 	 * All of this compiles to nothing if all checks pass.
548 	 */
549 #define check(field, value)  ({ \
550 	vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE; \
551 	fl.__vm_flags = (value); \
552 	fl.field = 0; \
553 	assert(fl.__vm_flags == 0); \
554 })
555 
556 	/* bits 0-7 */
557 	check(vmf_fixed, VM_FLAGS_ANYWHERE); // kind of a lie this is inverted
558 	check(vmf_purgeable, VM_FLAGS_PURGABLE);
559 	check(vmf_4gb_chunk, VM_FLAGS_4GB_CHUNK);
560 	check(vmf_random_addr, VM_FLAGS_RANDOM_ADDR);
561 	check(vmf_no_cache, VM_FLAGS_NO_CACHE);
562 	check(vmf_resilient_codesign, VM_FLAGS_RESILIENT_CODESIGN);
563 	check(vmf_resilient_media, VM_FLAGS_RESILIENT_MEDIA);
564 	check(vmf_permanent, VM_FLAGS_PERMANENT);
565 
566 	/* bits 8-15 */
567 	check(vmf_tpro, VM_FLAGS_TPRO);
568 	check(vmf_overwrite, VM_FLAGS_OVERWRITE);
569 
570 	/* bits 16-23 */
571 	check(vmf_superpage_size, VM_FLAGS_SUPERPAGE_MASK);
572 	check(vmf_return_data_addr, VM_FLAGS_RETURN_DATA_ADDR);
573 	check(vmf_return_4k_data_addr, VM_FLAGS_RETURN_4K_DATA_ADDR);
574 
575 	{
576 		vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE;
577 
578 		/* check user tags will never clip */
579 		fl.vm_tag = VM_MEMORY_COUNT - 1;
580 		assert(fl.vm_tag == VM_MEMORY_COUNT - 1);
581 
582 		/* check kernel tags will never clip */
583 		fl.vm_tag = VM_MAX_TAG_VALUE - 1;
584 		assert(fl.vm_tag == VM_MAX_TAG_VALUE - 1);
585 	}
586 
587 
588 #undef check
589 #endif /* DEBUG || DEVELOPMENT */
590 
591 	return (vmflags & ~vm_flags_mask) == 0;
592 }
593 
594 /*
595  * Macros to copy a vm_map_entry. We must be careful to correctly
596  * manage the wired page count. vm_map_entry_copy() creates a new
597  * map entry to the same memory - the wired count in the new entry
598  * must be set to zero. vm_map_entry_copy_full() creates a new
599  * entry that is identical to the old entry.  This preserves the
600  * wire count; it's used for map splitting and zone changing in
601  * vm_map_copyout.
602  */
603 
604 static inline void
vm_map_entry_copy_csm_assoc(vm_map_t map __unused,vm_map_entry_t new __unused,vm_map_entry_t old __unused)605 vm_map_entry_copy_csm_assoc(
606 	vm_map_t map __unused,
607 	vm_map_entry_t new __unused,
608 	vm_map_entry_t old __unused)
609 {
610 #if CODE_SIGNING_MONITOR
611 	/* when code signing monitor is enabled, we want to reset on copy */
612 	new->csm_associated = FALSE;
613 #else
614 	/* when code signing monitor is not enabled, assert as a sanity check */
615 	assert(new->csm_associated == FALSE);
616 #endif
617 #if DEVELOPMENT || DEBUG
618 	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
619 		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug\n",
620 		    proc_selfpid(),
621 		    (get_bsdtask_info(current_task())
622 		    ? proc_name_address(get_bsdtask_info(current_task()))
623 		    : "?"),
624 		    __FUNCTION__, __LINE__,
625 		    map, new, new->vme_start, new->vme_end);
626 	}
627 #endif /* DEVELOPMENT || DEBUG */
628 #if XNU_TARGET_OS_OSX
629 	/*
630 	 * On macOS, entries with "vme_xnu_user_debug" can be copied during fork()
631 	 * and we want the child's entry to keep its "vme_xnu_user_debug" to avoid
632 	 * trigggering CSM assertions when the child accesses its mapping.
633 	 */
634 #else /* XNU_TARGET_OS_OSX */
635 	new->vme_xnu_user_debug = FALSE;
636 #endif /* XNU_TARGET_OS_OSX */
637 }
638 
639 /*
640  * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
641  * But for security reasons on some platforms, we don't want the
642  * new mapping to be "used for jit", so we reset the flag here.
643  */
644 static inline void
vm_map_entry_copy_code_signing(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old __unused)645 vm_map_entry_copy_code_signing(
646 	vm_map_t map,
647 	vm_map_entry_t new,
648 	vm_map_entry_t old __unused)
649 {
650 	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
651 		assert(new->used_for_jit == old->used_for_jit);
652 	} else {
653 		if (old->used_for_jit) {
654 			DTRACE_VM3(cs_wx,
655 			    uint64_t, new->vme_start,
656 			    uint64_t, new->vme_end,
657 			    vm_prot_t, new->protection);
658 			printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
659 			    proc_selfpid(),
660 			    (get_bsdtask_info(current_task())
661 			    ? proc_name_address(get_bsdtask_info(current_task()))
662 			    : "?"),
663 			    __FUNCTION__,
664 			    "removing execute access");
665 			new->protection &= ~VM_PROT_EXECUTE;
666 			new->max_protection &= ~VM_PROT_EXECUTE;
667 		}
668 		new->used_for_jit = FALSE;
669 	}
670 }
671 
672 static inline void
vm_map_entry_copy_full(vm_map_entry_t new,vm_map_entry_t old)673 vm_map_entry_copy_full(
674 	vm_map_entry_t new,
675 	vm_map_entry_t old)
676 {
677 #if MAP_ENTRY_CREATION_DEBUG
678 	btref_put(new->vme_creation_bt);
679 	btref_retain(old->vme_creation_bt);
680 #endif
681 #if MAP_ENTRY_INSERTION_DEBUG
682 	btref_put(new->vme_insertion_bt);
683 	btref_retain(old->vme_insertion_bt);
684 #endif
685 #if VM_BTLOG_TAGS
686 	/* Discard the btref that might be in the new entry */
687 	if (new->vme_kernel_object) {
688 		btref_put(new->vme_tag_btref);
689 	}
690 	/* Retain the btref in the old entry to account for its copy */
691 	if (old->vme_kernel_object) {
692 		btref_retain(old->vme_tag_btref);
693 	}
694 #endif /* VM_BTLOG_TAGS */
695 	*new = *old;
696 }
697 
698 static inline void
vm_map_entry_copy(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old)699 vm_map_entry_copy(
700 	vm_map_t map,
701 	vm_map_entry_t new,
702 	vm_map_entry_t old)
703 {
704 	vm_map_entry_copy_full(new, old);
705 
706 	new->is_shared = FALSE;
707 	new->needs_wakeup = FALSE;
708 	new->in_transition = FALSE;
709 	new->wired_count = 0;
710 	new->user_wired_count = 0;
711 	new->vme_permanent = FALSE;
712 	vm_map_entry_copy_code_signing(map, new, old);
713 	vm_map_entry_copy_csm_assoc(map, new, old);
714 	if (new->iokit_acct) {
715 		assertf(!new->use_pmap, "old %p new %p\n", old, new);
716 		new->iokit_acct = FALSE;
717 		new->use_pmap = TRUE;
718 	}
719 	new->vme_resilient_codesign = FALSE;
720 	new->vme_resilient_media = FALSE;
721 	new->vme_atomic = FALSE;
722 	new->vme_no_copy_on_read = FALSE;
723 }
724 
725 /*
726  * Normal lock_read_to_write() returns FALSE/0 on failure.
727  * These functions evaluate to zero on success and non-zero value on failure.
728  */
729 __attribute__((always_inline))
730 int
vm_map_lock_read_to_write(vm_map_t map)731 vm_map_lock_read_to_write(vm_map_t map)
732 {
733 	if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
734 		DTRACE_VM(vm_map_lock_upgrade);
735 		return 0;
736 	}
737 	return 1;
738 }
739 
740 __attribute__((always_inline))
741 boolean_t
vm_map_try_lock(vm_map_t map)742 vm_map_try_lock(vm_map_t map)
743 {
744 	if (lck_rw_try_lock_exclusive(&(map)->lock)) {
745 		DTRACE_VM(vm_map_lock_w);
746 		return TRUE;
747 	}
748 	return FALSE;
749 }
750 
751 __attribute__((always_inline))
752 boolean_t
vm_map_try_lock_read(vm_map_t map)753 vm_map_try_lock_read(vm_map_t map)
754 {
755 	if (lck_rw_try_lock_shared(&(map)->lock)) {
756 		DTRACE_VM(vm_map_lock_r);
757 		return TRUE;
758 	}
759 	return FALSE;
760 }
761 
762 /*!
763  * @function kdp_vm_map_is_acquired_exclusive
764  *
765  * @abstract
766  * Checks if vm map is acquired exclusive.
767  *
768  * @discussion
769  * NOT SAFE: To be used only by kernel debugger.
770  *
771  * @param map map to check
772  *
773  * @returns TRUE if the map is acquired exclusively.
774  */
775 boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)776 kdp_vm_map_is_acquired_exclusive(vm_map_t map)
777 {
778 	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
779 }
780 
781 /*
782  * Routines to get the page size the caller should
783  * use while inspecting the target address space.
784  * Use the "_safely" variant if the caller is dealing with a user-provided
785  * array whose size depends on the page size, to avoid any overflow or
786  * underflow of a user-allocated buffer.
787  */
788 int
vm_self_region_page_shift_safely(vm_map_t target_map)789 vm_self_region_page_shift_safely(
790 	vm_map_t target_map)
791 {
792 	int effective_page_shift = 0;
793 
794 	if (PAGE_SIZE == (4096)) {
795 		/* x86_64 and 4k watches: always use 4k */
796 		return PAGE_SHIFT;
797 	}
798 	/* did caller provide an explicit page size for this thread to use? */
799 	effective_page_shift = thread_self_region_page_shift();
800 	if (effective_page_shift) {
801 		/* use the explicitly-provided page size */
802 		return effective_page_shift;
803 	}
804 	/* no explicit page size: use the caller's page size... */
805 	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
806 	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
807 		/* page size match: safe to use */
808 		return effective_page_shift;
809 	}
810 	/* page size mismatch */
811 	return -1;
812 }
813 int
vm_self_region_page_shift(vm_map_t target_map)814 vm_self_region_page_shift(
815 	vm_map_t target_map)
816 {
817 	int effective_page_shift;
818 
819 	effective_page_shift = vm_self_region_page_shift_safely(target_map);
820 	if (effective_page_shift == -1) {
821 		/* no safe value but OK to guess for caller */
822 		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
823 		    VM_MAP_PAGE_SHIFT(target_map));
824 	}
825 	return effective_page_shift;
826 }
827 
828 
829 /*
830  *	Decide if we want to allow processes to execute from their data or stack areas.
831  *	override_nx() returns true if we do.  Data/stack execution can be enabled independently
832  *	for 32 and 64 bit processes.  Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
833  *	or allow_stack_exec to enable data execution for that type of data area for that particular
834  *	ABI (or both by or'ing the flags together).  These are initialized in the architecture
835  *	specific pmap files since the default behavior varies according to architecture.  The
836  *	main reason it varies is because of the need to provide binary compatibility with old
837  *	applications that were written before these restrictions came into being.  In the old
838  *	days, an app could execute anything it could read, but this has slowly been tightened
839  *	up over time.  The default behavior is:
840  *
841  *	32-bit PPC apps		may execute from both stack and data areas
842  *	32-bit Intel apps	may exeucte from data areas but not stack
843  *	64-bit PPC/Intel apps	may not execute from either data or stack
844  *
845  *	An application on any architecture may override these defaults by explicitly
846  *	adding PROT_EXEC permission to the page in question with the mprotect(2)
847  *	system call.  This code here just determines what happens when an app tries to
848  *      execute from a page that lacks execute permission.
849  *
850  *	Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
851  *	default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
852  *	a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
853  *	execution from data areas for a particular binary even if the arch normally permits it. As
854  *	a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
855  *	to support some complicated use cases, notably browsers with out-of-process plugins that
856  *	are not all NX-safe.
857  */
858 
859 extern int allow_data_exec, allow_stack_exec;
860 
861 int
override_nx(vm_map_t map,uint32_t user_tag)862 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
863 {
864 	int current_abi;
865 
866 	if (map->pmap == kernel_pmap) {
867 		return FALSE;
868 	}
869 
870 	/*
871 	 * Determine if the app is running in 32 or 64 bit mode.
872 	 */
873 
874 	if (vm_map_is_64bit(map)) {
875 		current_abi = VM_ABI_64;
876 	} else {
877 		current_abi = VM_ABI_32;
878 	}
879 
880 	/*
881 	 * Determine if we should allow the execution based on whether it's a
882 	 * stack or data area and the current architecture.
883 	 */
884 
885 	if (user_tag == VM_MEMORY_STACK) {
886 		return allow_stack_exec & current_abi;
887 	}
888 
889 	return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
890 }
891 
892 
893 /*
894  *	Virtual memory maps provide for the mapping, protection,
895  *	and sharing of virtual memory objects.  In addition,
896  *	this module provides for an efficient virtual copy of
897  *	memory from one map to another.
898  *
899  *	Synchronization is required prior to most operations.
900  *
901  *	Maps consist of an ordered doubly-linked list of simple
902  *	entries; a single hint is used to speed up lookups.
903  *
904  *	Sharing maps have been deleted from this version of Mach.
905  *	All shared objects are now mapped directly into the respective
906  *	maps.  This requires a change in the copy on write strategy;
907  *	the asymmetric (delayed) strategy is used for shared temporary
908  *	objects instead of the symmetric (shadow) strategy.  All maps
909  *	are now "top level" maps (either task map, kernel map or submap
910  *	of the kernel map).
911  *
912  *	Since portions of maps are specified by start/end addreses,
913  *	which may not align with existing map entries, all
914  *	routines merely "clip" entries to these start/end values.
915  *	[That is, an entry is split into two, bordering at a
916  *	start or end value.]  Note that these clippings may not
917  *	always be necessary (as the two resulting entries are then
918  *	not changed); however, the clipping is done for convenience.
919  *	No attempt is currently made to "glue back together" two
920  *	abutting entries.
921  *
922  *	The symmetric (shadow) copy strategy implements virtual copy
923  *	by copying VM object references from one map to
924  *	another, and then marking both regions as copy-on-write.
925  *	It is important to note that only one writeable reference
926  *	to a VM object region exists in any map when this strategy
927  *	is used -- this means that shadow object creation can be
928  *	delayed until a write operation occurs.  The symmetric (delayed)
929  *	strategy allows multiple maps to have writeable references to
930  *	the same region of a vm object, and hence cannot delay creating
931  *	its copy objects.  See vm_object_copy_quickly() in vm_object.c.
932  *	Copying of permanent objects is completely different; see
933  *	vm_object_copy_strategically() in vm_object.c.
934  */
935 
936 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
937 
938 #define VM_MAP_ZONE_NAME        "maps"
939 #define VM_MAP_ZFLAGS           (ZC_NOENCRYPT | ZC_VM)
940 
941 #define VM_MAP_ENTRY_ZONE_NAME  "VM map entries"
942 #define VM_MAP_ENTRY_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
943 
944 #define VM_MAP_HOLES_ZONE_NAME  "VM map holes"
945 #define VM_MAP_HOLES_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
946 
947 /*
948  * Asserts that a vm_map_copy object is coming from the
949  * vm_map_copy_zone to ensure that it isn't a fake constructed
950  * anywhere else.
951  */
952 void
vm_map_copy_require(struct vm_map_copy * copy)953 vm_map_copy_require(struct vm_map_copy *copy)
954 {
955 	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
956 }
957 
958 /*
959  *	vm_map_require:
960  *
961  *	Ensures that the argument is memory allocated from the genuine
962  *	vm map zone. (See zone_id_require_allow_foreign).
963  */
964 void
vm_map_require(vm_map_t map)965 vm_map_require(vm_map_t map)
966 {
967 	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
968 }
969 
970 #define VM_MAP_EARLY_COUNT_MAX         16
971 static __startup_data vm_offset_t      map_data;
972 static __startup_data vm_size_t        map_data_size;
973 static __startup_data vm_offset_t      kentry_data;
974 static __startup_data vm_size_t        kentry_data_size;
975 static __startup_data vm_offset_t      map_holes_data;
976 static __startup_data vm_size_t        map_holes_data_size;
977 static __startup_data vm_map_t        *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
978 static __startup_data uint32_t         early_map_count;
979 
980 #if XNU_TARGET_OS_OSX
981 #define         NO_COALESCE_LIMIT  ((1024 * 128) - 1)
982 #else /* XNU_TARGET_OS_OSX */
983 #define         NO_COALESCE_LIMIT  0
984 #endif /* XNU_TARGET_OS_OSX */
985 
986 /* Skip acquiring locks if we're in the midst of a kernel core dump */
987 unsigned int not_in_kdp = 1;
988 
989 unsigned int vm_map_set_cache_attr_count = 0;
990 
991 kern_return_t
vm_map_set_cache_attr(vm_map_t map,vm_map_offset_t va)992 vm_map_set_cache_attr(
993 	vm_map_t        map,
994 	vm_map_offset_t va)
995 {
996 	vm_map_entry_t  map_entry;
997 	vm_object_t     object;
998 	kern_return_t   kr = KERN_SUCCESS;
999 
1000 	vm_map_lock_read(map);
1001 
1002 	if (!vm_map_lookup_entry(map, va, &map_entry) ||
1003 	    map_entry->is_sub_map) {
1004 		/*
1005 		 * that memory is not properly mapped
1006 		 */
1007 		kr = KERN_INVALID_ARGUMENT;
1008 		goto done;
1009 	}
1010 	object = VME_OBJECT(map_entry);
1011 
1012 	if (object == VM_OBJECT_NULL) {
1013 		/*
1014 		 * there should be a VM object here at this point
1015 		 */
1016 		kr = KERN_INVALID_ARGUMENT;
1017 		goto done;
1018 	}
1019 	vm_object_lock(object);
1020 	object->set_cache_attr = TRUE;
1021 	vm_object_unlock(object);
1022 
1023 	vm_map_set_cache_attr_count++;
1024 done:
1025 	vm_map_unlock_read(map);
1026 
1027 	return kr;
1028 }
1029 
1030 
1031 #if CONFIG_CODE_DECRYPTION
1032 /*
1033  * vm_map_apple_protected:
1034  * This remaps the requested part of the object with an object backed by
1035  * the decrypting pager.
1036  * crypt_info contains entry points and session data for the crypt module.
1037  * The crypt_info block will be copied by vm_map_apple_protected. The data structures
1038  * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
1039  */
1040 kern_return_t
vm_map_apple_protected(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_object_offset_t crypto_backing_offset,struct pager_crypt_info * crypt_info,uint32_t cryptid)1041 vm_map_apple_protected(
1042 	vm_map_t                map,
1043 	vm_map_offset_t         start,
1044 	vm_map_offset_t         end,
1045 	vm_object_offset_t      crypto_backing_offset,
1046 	struct pager_crypt_info *crypt_info,
1047 	uint32_t                cryptid)
1048 {
1049 	boolean_t       map_locked;
1050 	kern_return_t   kr;
1051 	vm_map_entry_t  map_entry;
1052 	struct vm_map_entry tmp_entry;
1053 	memory_object_t unprotected_mem_obj;
1054 	vm_object_t     protected_object;
1055 	vm_map_offset_t map_addr;
1056 	vm_map_offset_t start_aligned, end_aligned;
1057 	vm_object_offset_t      crypto_start, crypto_end;
1058 	boolean_t       cache_pager;
1059 
1060 	map_locked = FALSE;
1061 	unprotected_mem_obj = MEMORY_OBJECT_NULL;
1062 
1063 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
1064 		return KERN_INVALID_ADDRESS;
1065 	}
1066 	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
1067 	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
1068 	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
1069 	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
1070 
1071 #if __arm64__
1072 	/*
1073 	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
1074 	 * so we might have to loop and establish up to 3 mappings:
1075 	 *
1076 	 * + the first 16K-page, which might overlap with the previous
1077 	 *   4K-aligned mapping,
1078 	 * + the center,
1079 	 * + the last 16K-page, which might overlap with the next
1080 	 *   4K-aligned mapping.
1081 	 * Each of these mapping might be backed by a vnode pager (if
1082 	 * properly page-aligned) or a "fourk_pager", itself backed by a
1083 	 * vnode pager (if 4K-aligned but not page-aligned).
1084 	 */
1085 #endif /* __arm64__ */
1086 
1087 	map_addr = start_aligned;
1088 	for (map_addr = start_aligned;
1089 	    map_addr < end;
1090 	    map_addr = tmp_entry.vme_end) {
1091 		vm_map_lock(map);
1092 		map_locked = TRUE;
1093 
1094 		/* lookup the protected VM object */
1095 		if (!vm_map_lookup_entry(map,
1096 		    map_addr,
1097 		    &map_entry) ||
1098 		    map_entry->is_sub_map ||
1099 		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
1100 			/* that memory is not properly mapped */
1101 			kr = KERN_INVALID_ARGUMENT;
1102 			goto done;
1103 		}
1104 
1105 		/* ensure mapped memory is mapped as executable except
1106 		 *  except for model decryption flow */
1107 		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
1108 		    !(map_entry->protection & VM_PROT_EXECUTE)) {
1109 			kr = KERN_INVALID_ARGUMENT;
1110 			goto done;
1111 		}
1112 
1113 		/* get the protected object to be decrypted */
1114 		protected_object = VME_OBJECT(map_entry);
1115 		if (protected_object == VM_OBJECT_NULL) {
1116 			/* there should be a VM object here at this point */
1117 			kr = KERN_INVALID_ARGUMENT;
1118 			goto done;
1119 		}
1120 		/* ensure protected object stays alive while map is unlocked */
1121 		vm_object_reference(protected_object);
1122 
1123 		/* limit the map entry to the area we want to cover */
1124 		vm_map_clip_start(map, map_entry, start_aligned);
1125 		vm_map_clip_end(map, map_entry, end_aligned);
1126 
1127 		tmp_entry = *map_entry;
1128 		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
1129 		vm_map_unlock(map);
1130 		map_locked = FALSE;
1131 
1132 		/*
1133 		 * This map entry might be only partially encrypted
1134 		 * (if not fully "page-aligned").
1135 		 */
1136 		crypto_start = 0;
1137 		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
1138 		if (tmp_entry.vme_start < start) {
1139 			if (tmp_entry.vme_start != start_aligned) {
1140 				kr = KERN_INVALID_ADDRESS;
1141 				vm_object_deallocate(protected_object);
1142 				goto done;
1143 			}
1144 			crypto_start += (start - tmp_entry.vme_start);
1145 		}
1146 		if (tmp_entry.vme_end > end) {
1147 			if (tmp_entry.vme_end != end_aligned) {
1148 				kr = KERN_INVALID_ADDRESS;
1149 				vm_object_deallocate(protected_object);
1150 				goto done;
1151 			}
1152 			crypto_end -= (tmp_entry.vme_end - end);
1153 		}
1154 
1155 		/*
1156 		 * This "extra backing offset" is needed to get the decryption
1157 		 * routine to use the right key.  It adjusts for the possibly
1158 		 * relative offset of an interposed "4K" pager...
1159 		 */
1160 		if (crypto_backing_offset == (vm_object_offset_t) -1) {
1161 			crypto_backing_offset = VME_OFFSET(&tmp_entry);
1162 		}
1163 
1164 		cache_pager = TRUE;
1165 #if XNU_TARGET_OS_OSX
1166 		if (vm_map_is_alien(map)) {
1167 			cache_pager = FALSE;
1168 		}
1169 #endif /* XNU_TARGET_OS_OSX */
1170 
1171 		/*
1172 		 * Lookup (and create if necessary) the protected memory object
1173 		 * matching that VM object.
1174 		 * If successful, this also grabs a reference on the memory object,
1175 		 * to guarantee that it doesn't go away before we get a chance to map
1176 		 * it.
1177 		 */
1178 		unprotected_mem_obj = apple_protect_pager_setup(
1179 			protected_object,
1180 			VME_OFFSET(&tmp_entry),
1181 			crypto_backing_offset,
1182 			crypt_info,
1183 			crypto_start,
1184 			crypto_end,
1185 			cache_pager);
1186 
1187 		/* release extra ref on protected object */
1188 		vm_object_deallocate(protected_object);
1189 
1190 		if (unprotected_mem_obj == NULL) {
1191 			kr = KERN_FAILURE;
1192 			goto done;
1193 		}
1194 
1195 		/* can overwrite an immutable mapping */
1196 		vm_map_kernel_flags_t vmk_flags = {
1197 			.vmf_fixed = true,
1198 			.vmf_overwrite = true,
1199 			.vmkf_overwrite_immutable = true,
1200 		};
1201 		/* make the new mapping as "permanent" as the one it replaces */
1202 		vmk_flags.vmf_permanent = tmp_entry.vme_permanent;
1203 
1204 		/* map this memory object in place of the current one */
1205 		map_addr = tmp_entry.vme_start;
1206 		kr = mach_vm_map_kernel(map,
1207 		    vm_sanitize_wrap_addr_ref(&map_addr),
1208 		    (tmp_entry.vme_end -
1209 		    tmp_entry.vme_start),
1210 		    (mach_vm_offset_t) 0,
1211 		    vmk_flags,
1212 		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1213 		    0,
1214 		    TRUE,
1215 		    tmp_entry.protection,
1216 		    tmp_entry.max_protection,
1217 		    tmp_entry.inheritance);
1218 		assertf(kr == KERN_SUCCESS,
1219 		    "kr = 0x%x\n", kr);
1220 		assertf(map_addr == tmp_entry.vme_start,
1221 		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1222 		    (uint64_t)map_addr,
1223 		    (uint64_t) tmp_entry.vme_start,
1224 		    &tmp_entry);
1225 
1226 #if VM_MAP_DEBUG_APPLE_PROTECT
1227 		if (vm_map_debug_apple_protect) {
1228 			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1229 			    " backing:[object:%p,offset:0x%llx,"
1230 			    "crypto_backing_offset:0x%llx,"
1231 			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1232 			    map,
1233 			    (uint64_t) map_addr,
1234 			    (uint64_t) (map_addr + (tmp_entry.vme_end -
1235 			    tmp_entry.vme_start)),
1236 			    unprotected_mem_obj,
1237 			    protected_object,
1238 			    VME_OFFSET(&tmp_entry),
1239 			    crypto_backing_offset,
1240 			    crypto_start,
1241 			    crypto_end);
1242 		}
1243 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1244 
1245 		/*
1246 		 * Release the reference obtained by
1247 		 * apple_protect_pager_setup().
1248 		 * The mapping (if it succeeded) is now holding a reference on
1249 		 * the memory object.
1250 		 */
1251 		memory_object_deallocate(unprotected_mem_obj);
1252 		unprotected_mem_obj = MEMORY_OBJECT_NULL;
1253 
1254 		/* continue with next map entry */
1255 		crypto_backing_offset += (tmp_entry.vme_end -
1256 		    tmp_entry.vme_start);
1257 		crypto_backing_offset -= crypto_start;
1258 	}
1259 	kr = KERN_SUCCESS;
1260 
1261 done:
1262 	if (map_locked) {
1263 		vm_map_unlock(map);
1264 	}
1265 	return kr;
1266 }
1267 #endif  /* CONFIG_CODE_DECRYPTION */
1268 
1269 
1270 LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
1271 LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
1272 LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);
1273 
1274 #if XNU_TARGET_OS_OSX
1275 #define MALLOC_NO_COW_DEFAULT 1
1276 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
1277 #else /* XNU_TARGET_OS_OSX */
1278 #define MALLOC_NO_COW_DEFAULT 1
1279 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
1280 #endif /* XNU_TARGET_OS_OSX */
1281 TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
1282 TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
1283 uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
1284 #if DEBUG
1285 int vm_check_map_sanity = 0;
1286 #endif
1287 
1288 /*
1289  *	vm_map_init:
1290  *
1291  *	Initialize the vm_map module.  Must be called before
1292  *	any other vm_map routines.
1293  *
1294  *	Map and entry structures are allocated from zones -- we must
1295  *	initialize those zones.
1296  *
1297  *	There are three zones of interest:
1298  *
1299  *	vm_map_zone:		used to allocate maps.
1300  *	vm_map_entry_zone:	used to allocate map entries.
1301  *
1302  *	LP32:
1303  *	vm_map_entry_reserved_zone:     fallback zone for kernel map entries
1304  *
1305  *	The kernel allocates map entries from a special zone that is initially
1306  *	"crammed" with memory.  It would be difficult (perhaps impossible) for
1307  *	the kernel to allocate more memory to a entry zone when it became
1308  *	empty since the very act of allocating memory implies the creation
1309  *	of a new entry.
1310  */
1311 __startup_func
1312 void
vm_map_init(void)1313 vm_map_init(void)
1314 {
1315 
1316 #if MACH_ASSERT
1317 	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1318 	    sizeof(debug4k_filter));
1319 #endif /* MACH_ASSERT */
1320 
1321 	zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1322 	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1323 
1324 	/*
1325 	 * Don't quarantine because we always need elements available
1326 	 * Disallow GC on this zone... to aid the GC.
1327 	 */
1328 	zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1329 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1330 	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1331 		z->z_elems_rsv = (uint16_t)(32 *
1332 		(ml_early_cpu_max_number() + 1));
1333 	});
1334 
1335 	zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1336 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1337 	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1338 		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1339 	});
1340 
1341 	zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1342 	    ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1343 
1344 	/*
1345 	 * Add the stolen memory to zones, adjust zone size and stolen counts.
1346 	 */
1347 	zone_cram_early(vm_map_zone, map_data, map_data_size);
1348 	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1349 	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1350 	printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1351 	    zone_count_free(vm_map_zone),
1352 	    zone_count_free(vm_map_entry_zone),
1353 	    zone_count_free(vm_map_holes_zone));
1354 
1355 	/*
1356 	 * Since these are covered by zones, remove them from stolen page accounting.
1357 	 */
1358 	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1359 
1360 #if VM_MAP_DEBUG_APPLE_PROTECT
1361 	PE_parse_boot_argn("vm_map_debug_apple_protect",
1362 	    &vm_map_debug_apple_protect,
1363 	    sizeof(vm_map_debug_apple_protect));
1364 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1365 #if VM_MAP_DEBUG_APPLE_FOURK
1366 	PE_parse_boot_argn("vm_map_debug_fourk",
1367 	    &vm_map_debug_fourk,
1368 	    sizeof(vm_map_debug_fourk));
1369 #endif /* VM_MAP_DEBUG_FOURK */
1370 
1371 	if (malloc_no_cow) {
1372 		vm_memory_malloc_no_cow_mask = 0ULL;
1373 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1374 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1375 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1376 #if XNU_TARGET_OS_OSX
1377 		/*
1378 		 * On macOS, keep copy-on-write for MALLOC_LARGE because
1379 		 * realloc() may use vm_copy() to transfer the old contents
1380 		 * to the new location.
1381 		 */
1382 #else /* XNU_TARGET_OS_OSX */
1383 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1384 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1385 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1386 #endif /* XNU_TARGET_OS_OSX */
1387 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1388 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1389 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1390 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1391 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1392 		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1393 		    &vm_memory_malloc_no_cow_mask,
1394 		    sizeof(vm_memory_malloc_no_cow_mask));
1395 	}
1396 
1397 #if CONFIG_MAP_RANGES
1398 	vm_map_range_map_init();
1399 #endif /* CONFIG_MAP_RANGES */
1400 
1401 #if DEBUG
1402 	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1403 	if (vm_check_map_sanity) {
1404 		kprintf("VM sanity checking enabled\n");
1405 	} else {
1406 		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1407 	}
1408 #endif /* DEBUG */
1409 
1410 #if DEVELOPMENT || DEBUG
1411 	PE_parse_boot_argn("panic_on_unsigned_execute",
1412 	    &panic_on_unsigned_execute,
1413 	    sizeof(panic_on_unsigned_execute));
1414 	PE_parse_boot_argn("panic_on_mlock_failure",
1415 	    &panic_on_mlock_failure,
1416 	    sizeof(panic_on_mlock_failure));
1417 #endif /* DEVELOPMENT || DEBUG */
1418 }
1419 
1420 __startup_func
1421 static void
vm_map_steal_memory(void)1422 vm_map_steal_memory(void)
1423 {
1424 
1425 	/*
1426 	 * We need to reserve enough memory to support boostraping VM maps
1427 	 * and the zone subsystem.
1428 	 *
1429 	 * The VM Maps that need to function before zones can support them
1430 	 * are the ones registered with vm_map_will_allocate_early_map(),
1431 	 * which are:
1432 	 * - the kernel map
1433 	 * - the various submaps used by zones (pgz, meta, ...)
1434 	 *
1435 	 * We also need enough entries and holes to support them
1436 	 * until zone_metadata_init() is called, which is when
1437 	 * the zone allocator becomes capable of expanding dynamically.
1438 	 *
1439 	 * We need:
1440 	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
1441 	 * - To allow for 3-4 entries per map, but the kernel map
1442 	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
1443 	 *   to describe the submaps, so double it (and make it 8x too)
1444 	 * - To allow for holes between entries,
1445 	 *   hence needs the same budget as entries
1446 	 */
1447 	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
1448 	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
1449 	    VM_MAP_EARLY_COUNT_MAX);
1450 
1451 	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
1452 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1453 	    8 * VM_MAP_EARLY_COUNT_MAX);
1454 
1455 	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
1456 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1457 	    8 * VM_MAP_EARLY_COUNT_MAX);
1458 
1459 	/*
1460 	 * Steal a contiguous range of memory so that a simple range check
1461 	 * can validate early addresses being freed/crammed to these
1462 	 * zones
1463 	 */
1464 	map_data       = zone_early_mem_init(map_data_size + kentry_data_size +
1465 	    map_holes_data_size);
1466 	kentry_data    = map_data + map_data_size;
1467 	map_holes_data = kentry_data + kentry_data_size;
1468 }
1469 STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1470 
1471 __startup_func
1472 static void
vm_kernel_boostraped(void)1473 vm_kernel_boostraped(void)
1474 {
1475 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1476 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1477 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1478 
1479 	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1480 	    zone_count_free(vm_map_zone),
1481 	    zone_count_free(vm_map_entry_zone),
1482 	    zone_count_free(vm_map_holes_zone));
1483 }
1484 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1485 
1486 void
vm_map_disable_hole_optimization(vm_map_t map)1487 vm_map_disable_hole_optimization(vm_map_t map)
1488 {
1489 	vm_map_entry_t  head_entry, hole_entry, next_hole_entry;
1490 
1491 	if (map->holelistenabled) {
1492 		head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1493 
1494 		while (hole_entry != NULL) {
1495 			next_hole_entry = hole_entry->vme_next;
1496 
1497 			hole_entry->vme_next = NULL;
1498 			hole_entry->vme_prev = NULL;
1499 			zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1500 
1501 			if (next_hole_entry == head_entry) {
1502 				hole_entry = NULL;
1503 			} else {
1504 				hole_entry = next_hole_entry;
1505 			}
1506 		}
1507 
1508 		map->holes_list = NULL;
1509 		map->holelistenabled = FALSE;
1510 
1511 		map->first_free = vm_map_to_entry(map);
1512 		SAVE_HINT_HOLE_WRITE(map, NULL);
1513 	}
1514 }
1515 
1516 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1517 vm_kernel_map_is_kernel(vm_map_t map)
1518 {
1519 	return map->pmap == kernel_pmap;
1520 }
1521 
1522 /*
1523  *	vm_map_create:
1524  *
1525  *	Creates and returns a new empty VM map with
1526  *	the given physical map structure, and having
1527  *	the given lower and upper address bounds.
1528  */
1529 
1530 extern vm_map_t vm_map_create_external(
1531 	pmap_t                  pmap,
1532 	vm_map_offset_t         min_off,
1533 	vm_map_offset_t         max_off,
1534 	boolean_t               pageable);
1535 
1536 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1537 vm_map_create_external(
1538 	pmap_t                  pmap,
1539 	vm_map_offset_t         min,
1540 	vm_map_offset_t         max,
1541 	boolean_t               pageable)
1542 {
1543 	vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1544 
1545 	if (pageable) {
1546 		options |= VM_MAP_CREATE_PAGEABLE;
1547 	}
1548 	return vm_map_create_options(pmap, min, max, options);
1549 }
1550 
1551 __startup_func
1552 void
vm_map_will_allocate_early_map(vm_map_t * owner)1553 vm_map_will_allocate_early_map(vm_map_t *owner)
1554 {
1555 	if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1556 		panic("VM_MAP_EARLY_COUNT_MAX is too low");
1557 	}
1558 
1559 	early_map_owners[early_map_count++] = owner;
1560 }
1561 
1562 __startup_func
1563 void
vm_map_relocate_early_maps(vm_offset_t delta)1564 vm_map_relocate_early_maps(vm_offset_t delta)
1565 {
1566 	for (uint32_t i = 0; i < early_map_count; i++) {
1567 		vm_address_t addr = (vm_address_t)*early_map_owners[i];
1568 
1569 		*early_map_owners[i] = (vm_map_t)(addr + delta);
1570 	}
1571 
1572 	early_map_count = ~0u;
1573 }
1574 
1575 /*
1576  *	Routine:	vm_map_relocate_early_elem
1577  *
1578  *	Purpose:
1579  *		Early zone elements are allocated in a temporary part
1580  *		of the address space.
1581  *
1582  *		Once the zones live in their final place, the early
1583  *		VM maps, map entries and map holes need to be relocated.
1584  *
1585  *		It involves rewriting any vm_map_t, vm_map_entry_t or
1586  *		pointers to vm_map_links. Other pointers to other types
1587  *		are fine.
1588  *
1589  *		Fortunately, pointers to those types are self-contained
1590  *		in those zones, _except_ for pointers to VM maps,
1591  *		which are tracked during early boot and fixed with
1592  *		vm_map_relocate_early_maps().
1593  */
1594 __startup_func
1595 void
vm_map_relocate_early_elem(uint32_t zone_id,vm_offset_t new_addr,vm_offset_t delta)1596 vm_map_relocate_early_elem(
1597 	uint32_t                zone_id,
1598 	vm_offset_t             new_addr,
1599 	vm_offset_t             delta)
1600 {
1601 #define relocate(type_t, field)  ({ \
1602 	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field;   \
1603 	if (*__field) {                                                        \
1604 	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta);  \
1605 	}                                                                      \
1606 })
1607 
1608 	switch (zone_id) {
1609 	case ZONE_ID_VM_MAP:
1610 	case ZONE_ID_VM_MAP_ENTRY:
1611 	case ZONE_ID_VM_MAP_HOLES:
1612 		break;
1613 
1614 	default:
1615 		panic("Unexpected zone ID %d", zone_id);
1616 	}
1617 
1618 	if (zone_id == ZONE_ID_VM_MAP) {
1619 		relocate(vm_map_t, hdr.links.prev);
1620 		relocate(vm_map_t, hdr.links.next);
1621 		((vm_map_t)new_addr)->pmap = kernel_pmap;
1622 #ifdef VM_MAP_STORE_USE_RB
1623 		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
1624 #endif /* VM_MAP_STORE_USE_RB */
1625 		relocate(vm_map_t, hint);
1626 		relocate(vm_map_t, hole_hint);
1627 		relocate(vm_map_t, first_free);
1628 		return;
1629 	}
1630 
1631 	relocate(struct vm_map_links *, prev);
1632 	relocate(struct vm_map_links *, next);
1633 
1634 	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
1635 #ifdef VM_MAP_STORE_USE_RB
1636 		relocate(vm_map_entry_t, store.entry.rbe_left);
1637 		relocate(vm_map_entry_t, store.entry.rbe_right);
1638 		relocate(vm_map_entry_t, store.entry.rbe_parent);
1639 #endif /* VM_MAP_STORE_USE_RB */
1640 		if (((vm_map_entry_t)new_addr)->is_sub_map) {
1641 			/* no object to relocate because we haven't made any */
1642 			((vm_map_entry_t)new_addr)->vme_submap +=
1643 			    delta >> VME_SUBMAP_SHIFT;
1644 		}
1645 #if MAP_ENTRY_CREATION_DEBUG
1646 		relocate(vm_map_entry_t, vme_creation_maphdr);
1647 #endif /* MAP_ENTRY_CREATION_DEBUG */
1648 	}
1649 
1650 #undef relocate
1651 }
1652 
1653 /*
1654  * Generate a serial ID to identify a newly allocated vm_map
1655  */
1656 static uintptr_t vm_map_serial_current = 0;
1657 vm_map_serial_t vm_map_serial_generate(void);
1658 void vm_map_assign_serial(vm_map_t, vm_map_serial_t);
1659 
1660 vm_map_serial_t
vm_map_serial_generate(void)1661 vm_map_serial_generate(void)
1662 {
1663 	vm_map_serial_t serial = (void *)os_atomic_inc(&vm_map_serial_current, relaxed);
1664 	return serial;
1665 }
1666 
1667 void
vm_map_assign_serial(vm_map_t map,vm_map_serial_t serial)1668 vm_map_assign_serial(vm_map_t map, vm_map_serial_t serial)
1669 {
1670 	map->serial_id = serial;
1671 #if CONFIG_SPTM
1672 	/* Copy through our ID to the pmap (only available on SPTM systems) */
1673 	if (map->pmap) {
1674 		map->pmap->associated_vm_map_serial_id = map->serial_id;
1675 	}
1676 #endif /* CONFIG_SPTM */
1677 }
1678 
1679 vm_map_t
vm_map_create_options(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,vm_map_create_options_t options)1680 vm_map_create_options(
1681 	pmap_t                  pmap,
1682 	vm_map_offset_t         min,
1683 	vm_map_offset_t         max,
1684 	vm_map_create_options_t options)
1685 {
1686 	vm_map_t result;
1687 
1688 #if DEBUG || DEVELOPMENT
1689 	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
1690 		if (early_map_count != ~0u && early_map_count !=
1691 		    zone_count_allocated(vm_map_zone) + 1) {
1692 			panic("allocating %dth early map, owner not known",
1693 			    zone_count_allocated(vm_map_zone) + 1);
1694 		}
1695 		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
1696 			panic("allocating %dth early map for non kernel pmap",
1697 			    early_map_count);
1698 		}
1699 	}
1700 #endif /* DEBUG || DEVELOPMENT */
1701 
1702 	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);
1703 
1704 	vm_map_store_init(&result->hdr);
1705 	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
1706 	vm_map_set_page_shift(result, PAGE_SHIFT);
1707 
1708 	result->size_limit      = RLIM_INFINITY;        /* default unlimited */
1709 	result->data_limit      = RLIM_INFINITY;        /* default unlimited */
1710 	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
1711 	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
1712 
1713 	result->pmap = pmap;
1714 
1715 	/*
1716 	 * Immediately give ourselves an ID
1717 	 * Unless this map is being created as part of a fork, in which case
1718 	 * the caller will reassign the ID of the parent (so don't waste an
1719 	 *  increment here).
1720 	 */
1721 	if ((options & VM_MAP_CREATE_VIA_FORK) == 0) {
1722 		vm_map_assign_serial(result, vm_map_serial_generate());
1723 	}
1724 
1725 	result->min_offset = min;
1726 	result->max_offset = max;
1727 	result->first_free = vm_map_to_entry(result);
1728 	result->hint = vm_map_to_entry(result);
1729 
1730 	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
1731 		assert(pmap == kernel_pmap);
1732 		result->never_faults = true;
1733 	}
1734 
1735 	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
1736 	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
1737 		result->has_corpse_footprint = true;
1738 	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
1739 		struct vm_map_links *hole_entry;
1740 
1741 		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
1742 		hole_entry->start = min;
1743 		/*
1744 		 * Holes can be used to track ranges all the way up to
1745 		 * MACH_VM_MAX_ADDRESS or more (e.g. kernel map).
1746 		 */
1747 		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1748 		result->holes_list = result->hole_hint = hole_entry;
1749 		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
1750 		result->holelistenabled = true;
1751 	}
1752 
1753 	vm_map_lock_init(result);
1754 
1755 	return result;
1756 }
1757 
1758 /*
1759  * Adjusts a submap that was made by kmem_suballoc()
1760  * before it knew where it would be mapped,
1761  * so that it has the right min/max offsets.
1762  *
1763  * We do not need to hold any locks:
1764  * only the caller knows about this map,
1765  * and it is not published on any entry yet.
1766  */
1767 static void
vm_map_adjust_offsets(vm_map_t map,vm_map_offset_t min_off,vm_map_offset_t max_off)1768 vm_map_adjust_offsets(
1769 	vm_map_t                map,
1770 	vm_map_offset_t         min_off,
1771 	vm_map_offset_t         max_off)
1772 {
1773 	assert(map->min_offset == 0);
1774 	assert(map->max_offset == max_off - min_off);
1775 	assert(map->hdr.nentries == 0);
1776 	assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1777 
1778 	map->min_offset = min_off;
1779 	map->max_offset = max_off;
1780 
1781 	if (map->holelistenabled) {
1782 		struct vm_map_links *hole = map->holes_list;
1783 
1784 		hole->start = min_off;
1785 #if defined(__arm64__)
1786 		hole->end = max_off;
1787 #else
1788 		hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1789 #endif
1790 	}
1791 }
1792 
1793 
1794 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1795 vm_map_adjusted_size(vm_map_t map)
1796 {
1797 	const struct vm_reserved_region *regions = NULL;
1798 	size_t num_regions = 0;
1799 	mach_vm_size_t  reserved_size = 0, map_size = 0;
1800 
1801 	if (map == NULL || (map->size == 0)) {
1802 		return 0;
1803 	}
1804 
1805 	map_size = map->size;
1806 
1807 	if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1808 		/*
1809 		 * No special reserved regions or not an exotic map or the task
1810 		 * is terminating and these special regions might have already
1811 		 * been deallocated.
1812 		 */
1813 		return map_size;
1814 	}
1815 
1816 	num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1817 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1818 
1819 	while (num_regions) {
1820 		reserved_size += regions[--num_regions].vmrr_size;
1821 	}
1822 
1823 	/*
1824 	 * There are a few places where the map is being switched out due to
1825 	 * 'termination' without that bit being set (e.g. exec and corpse purging).
1826 	 * In those cases, we could have the map's regions being deallocated on
1827 	 * a core while some accounting process is trying to get the map's size.
1828 	 * So this assert can't be enabled till all those places are uniform in
1829 	 * their use of the 'map->terminated' bit.
1830 	 *
1831 	 * assert(map_size >= reserved_size);
1832 	 */
1833 
1834 	return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1835 }
1836 
1837 /*
1838  *	vm_map_entry_create:	[ internal use only ]
1839  *
1840  *	Allocates a VM map entry for insertion in the
1841  *	given map (or map copy).  No fields are filled.
1842  *
1843  *	The VM entry will be zero initialized, except for:
1844  *	- behavior set to VM_BEHAVIOR_DEFAULT
1845  *	- inheritance set to VM_INHERIT_DEFAULT
1846  */
1847 #define vm_map_entry_create(map)    _vm_map_entry_create(&(map)->hdr)
1848 
1849 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1850 
1851 static vm_map_entry_t
_vm_map_entry_create(struct vm_map_header * map_header __unused)1852 _vm_map_entry_create(
1853 	struct vm_map_header    *map_header __unused)
1854 {
1855 	vm_map_entry_t entry = NULL;
1856 
1857 	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);
1858 
1859 	/*
1860 	 * Help the compiler with what we know to be true,
1861 	 * so that the further bitfields inits have good codegen.
1862 	 *
1863 	 * See rdar://87041299
1864 	 */
1865 	__builtin_assume(entry->vme_object_value == 0);
1866 	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
1867 	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);
1868 
1869 	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
1870 	    "VME_ALIAS_MASK covers tags");
1871 
1872 	static_assert(VM_BEHAVIOR_DEFAULT == 0,
1873 	    "can skip zeroing of the behavior field");
1874 	entry->inheritance = VM_INHERIT_DEFAULT;
1875 
1876 #if MAP_ENTRY_CREATION_DEBUG
1877 	entry->vme_creation_maphdr = map_header;
1878 	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
1879 	    BTREF_GET_NOWAIT);
1880 #endif
1881 	return entry;
1882 }
1883 
1884 /*
1885  *	vm_map_entry_dispose:	[ internal use only ]
1886  *
1887  *	Inverse of vm_map_entry_create.
1888  *
1889  *      write map lock held so no need to
1890  *	do anything special to insure correctness
1891  *      of the stores
1892  */
1893 static void
vm_map_entry_dispose(vm_map_entry_t entry)1894 vm_map_entry_dispose(
1895 	vm_map_entry_t          entry)
1896 {
1897 #if VM_BTLOG_TAGS
1898 	if (entry->vme_kernel_object) {
1899 		btref_put(entry->vme_tag_btref);
1900 	}
1901 #endif /* VM_BTLOG_TAGS */
1902 #if MAP_ENTRY_CREATION_DEBUG
1903 	btref_put(entry->vme_creation_bt);
1904 #endif
1905 #if MAP_ENTRY_INSERTION_DEBUG
1906 	btref_put(entry->vme_insertion_bt);
1907 #endif
1908 	zfree(vm_map_entry_zone, entry);
1909 }
1910 
1911 #define vm_map_copy_entry_dispose(copy_entry) \
1912 	vm_map_entry_dispose(copy_entry)
1913 
1914 static vm_map_entry_t
vm_map_zap_first_entry(vm_map_zap_t list)1915 vm_map_zap_first_entry(
1916 	vm_map_zap_t            list)
1917 {
1918 	return list->vmz_head;
1919 }
1920 
1921 static vm_map_entry_t
vm_map_zap_last_entry(vm_map_zap_t list)1922 vm_map_zap_last_entry(
1923 	vm_map_zap_t            list)
1924 {
1925 	assert(vm_map_zap_first_entry(list));
1926 	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1927 }
1928 
1929 static void
vm_map_zap_append(vm_map_zap_t list,vm_map_entry_t entry)1930 vm_map_zap_append(
1931 	vm_map_zap_t            list,
1932 	vm_map_entry_t          entry)
1933 {
1934 	entry->vme_next = VM_MAP_ENTRY_NULL;
1935 	*list->vmz_tail = entry;
1936 	list->vmz_tail = &entry->vme_next;
1937 }
1938 
1939 static vm_map_entry_t
vm_map_zap_pop(vm_map_zap_t list)1940 vm_map_zap_pop(
1941 	vm_map_zap_t            list)
1942 {
1943 	vm_map_entry_t head = list->vmz_head;
1944 
1945 	if (head != VM_MAP_ENTRY_NULL &&
1946 	    (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1947 		list->vmz_tail = &list->vmz_head;
1948 	}
1949 
1950 	return head;
1951 }
1952 
1953 static void
vm_map_zap_dispose(vm_map_zap_t list)1954 vm_map_zap_dispose(
1955 	vm_map_zap_t            list)
1956 {
1957 	vm_map_entry_t          entry;
1958 
1959 	while ((entry = vm_map_zap_pop(list))) {
1960 		if (entry->is_sub_map) {
1961 			vm_map_deallocate(VME_SUBMAP(entry));
1962 		} else {
1963 			vm_object_deallocate(VME_OBJECT(entry));
1964 		}
1965 
1966 		vm_map_entry_dispose(entry);
1967 	}
1968 }
1969 
1970 #if MACH_ASSERT
1971 static boolean_t first_free_check = FALSE;
1972 boolean_t
first_free_is_valid(vm_map_t map)1973 first_free_is_valid(
1974 	vm_map_t        map)
1975 {
1976 	if (!first_free_check) {
1977 		return TRUE;
1978 	}
1979 
1980 	return first_free_is_valid_store( map );
1981 }
1982 #endif /* MACH_ASSERT */
1983 
1984 
1985 #define vm_map_copy_entry_link(copy, after_where, entry)                \
1986 	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))
1987 
1988 #define vm_map_copy_entry_unlink(copy, entry)                           \
1989 	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1990 
1991 /*
1992  *	vm_map_destroy:
1993  *
1994  *	Actually destroy a map.
1995  */
1996 void
vm_map_destroy(vm_map_t map)1997 vm_map_destroy(
1998 	vm_map_t        map)
1999 {
2000 	/* final cleanup: this is not allowed to fail */
2001 	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
2002 
2003 	VM_MAP_ZAP_DECLARE(zap);
2004 
2005 	vm_map_lock(map);
2006 
2007 	map->terminated = true;
2008 	/* clean up regular map entries */
2009 	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
2010 	    KMEM_GUARD_NONE, &zap);
2011 	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
2012 	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
2013 	    KMEM_GUARD_NONE, &zap);
2014 
2015 	vm_map_disable_hole_optimization(map);
2016 	vm_map_corpse_footprint_destroy(map);
2017 
2018 	vm_map_unlock(map);
2019 
2020 	vm_map_zap_dispose(&zap);
2021 
2022 	assert(map->hdr.nentries == 0);
2023 
2024 	if (map->pmap) {
2025 		pmap_destroy(map->pmap);
2026 	}
2027 
2028 	lck_rw_destroy(&map->lock, &vm_map_lck_grp);
2029 
2030 #if CONFIG_MAP_RANGES
2031 	kfree_data(map->extra_ranges,
2032 	    map->extra_ranges_count * sizeof(struct vm_map_user_range));
2033 #endif
2034 
2035 	zfree_id(ZONE_ID_VM_MAP, map);
2036 }
2037 
2038 /*
2039  * Returns pid of the task with the largest number of VM map entries.
2040  * Used in the zone-map-exhaustion jetsam path.
2041  */
2042 pid_t
find_largest_process_vm_map_entries(void)2043 find_largest_process_vm_map_entries(void)
2044 {
2045 	pid_t victim_pid = -1;
2046 	int max_vm_map_entries = 0;
2047 	task_t task = TASK_NULL;
2048 	queue_head_t *task_list = &tasks;
2049 
2050 	lck_mtx_lock(&tasks_threads_lock);
2051 	queue_iterate(task_list, task, task_t, tasks) {
2052 		if (task == kernel_task || !task->active) {
2053 			continue;
2054 		}
2055 
2056 		vm_map_t task_map = task->map;
2057 		if (task_map != VM_MAP_NULL) {
2058 			int task_vm_map_entries = task_map->hdr.nentries;
2059 			if (task_vm_map_entries > max_vm_map_entries) {
2060 				max_vm_map_entries = task_vm_map_entries;
2061 				victim_pid = pid_from_task(task);
2062 			}
2063 		}
2064 	}
2065 	lck_mtx_unlock(&tasks_threads_lock);
2066 
2067 	printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
2068 	return victim_pid;
2069 }
2070 
2071 
2072 /*
2073  *	vm_map_lookup_entry:	[ internal use only ]
2074  *
2075  *	Calls into the vm map store layer to find the map
2076  *	entry containing (or immediately preceding) the
2077  *	specified address in the given map; the entry is returned
2078  *	in the "entry" parameter.  The boolean
2079  *	result indicates whether the address is
2080  *	actually contained in the map.
2081  */
2082 boolean_t
vm_map_lookup_entry(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)2083 vm_map_lookup_entry(
2084 	vm_map_t        map,
2085 	vm_map_offset_t address,
2086 	vm_map_entry_t  *entry)         /* OUT */
2087 {
2088 	bool result = false;
2089 
2090 #if KASAN_TBI
2091 	if (VM_KERNEL_ADDRESS(address)) {
2092 		address = vm_memtag_canonicalize_kernel(address);
2093 	}
2094 #endif /* KASAN_TBI */
2095 
2096 
2097 #if CONFIG_PROB_GZALLOC
2098 	if (map->pmap == kernel_pmap) {
2099 		assertf(!pgz_owned(address),
2100 		    "it is the responsibility of callers to unguard PGZ addresses");
2101 	}
2102 #endif /* CONFIG_PROB_GZALLOC */
2103 	result = vm_map_store_lookup_entry( map, address, entry );
2104 
2105 	return result;
2106 }
2107 
2108 boolean_t
vm_map_lookup_entry_or_next(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)2109 vm_map_lookup_entry_or_next(
2110 	vm_map_t        map,
2111 	vm_map_offset_t address,
2112 	vm_map_entry_t  *entry)         /* OUT */
2113 {
2114 	if (vm_map_lookup_entry(map, address, entry)) {
2115 		return true;
2116 	}
2117 
2118 	*entry = (*entry)->vme_next;
2119 	return false;
2120 }
2121 
2122 #if CONFIG_PROB_GZALLOC
2123 boolean_t
vm_map_lookup_entry_allow_pgz(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)2124 vm_map_lookup_entry_allow_pgz(
2125 	vm_map_t        map,
2126 	vm_map_offset_t address,
2127 	vm_map_entry_t  *entry)         /* OUT */
2128 {
2129 	return vm_map_store_lookup_entry( map, address, entry );
2130 }
2131 #endif /* CONFIG_PROB_GZALLOC */
2132 
2133 /*
2134  *	Routine:	vm_map_range_invalid_panic
2135  *	Purpose:
2136  *			Panic on detection of an invalid range id.
2137  */
2138 __abortlike
2139 static void
vm_map_range_invalid_panic(vm_map_t map,vm_map_range_id_t range_id)2140 vm_map_range_invalid_panic(
2141 	vm_map_t                map,
2142 	vm_map_range_id_t       range_id)
2143 {
2144 	panic("invalid range ID (%u) for map %p", range_id, map);
2145 }
2146 
2147 /*
2148  *	Routine:	vm_map_get_range
2149  *	Purpose:
2150  *			Adjust bounds based on security policy.
2151  */
2152 static struct mach_vm_range
vm_map_get_range(vm_map_t map,vm_map_address_t * address,vm_map_kernel_flags_t * vmk_flags,vm_map_size_t size,bool * is_ptr)2153 vm_map_get_range(
2154 	vm_map_t                map,
2155 	vm_map_address_t       *address,
2156 	vm_map_kernel_flags_t  *vmk_flags,
2157 	vm_map_size_t           size,
2158 	bool                   *is_ptr)
2159 {
2160 	struct mach_vm_range effective_range = {};
2161 	vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
2162 
2163 	if (map == kernel_map) {
2164 		effective_range = kmem_ranges[range_id];
2165 
2166 		if (startup_phase >= STARTUP_SUB_KMEM) {
2167 			/*
2168 			 * Hint provided by caller is zeroed as the range is restricted to a
2169 			 * subset of the entire kernel_map VA, which could put the hint outside
2170 			 * the range, causing vm_map_store_find_space to fail.
2171 			 */
2172 			*address = 0ull;
2173 			/*
2174 			 * Ensure that range_id passed in by the caller is within meaningful
2175 			 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
2176 			 * to fail as the corresponding range is invalid. Range id larger than
2177 			 * KMEM_RANGE_ID_MAX will lead to an OOB access.
2178 			 */
2179 			if ((range_id == KMEM_RANGE_ID_NONE) ||
2180 			    (range_id > KMEM_RANGE_ID_MAX)) {
2181 				vm_map_range_invalid_panic(map, range_id);
2182 			}
2183 
2184 			/*
2185 			 * Pointer ranges use kmem_locate_space to do allocations.
2186 			 *
2187 			 * Non pointer fronts look like [ Small | Large | Permanent ]
2188 			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
2189 			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
2190 			 * use the entire range.
2191 			 */
2192 			if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
2193 				*is_ptr = true;
2194 			} else if (size >= KMEM_SMALLMAP_THRESHOLD) {
2195 				effective_range = kmem_large_ranges[range_id];
2196 			}
2197 		}
2198 #if CONFIG_MAP_RANGES
2199 	} else if (map->uses_user_ranges) {
2200 		switch (range_id) {
2201 		case UMEM_RANGE_ID_DEFAULT:
2202 			effective_range = map->default_range;
2203 			break;
2204 		case UMEM_RANGE_ID_HEAP:
2205 			effective_range = map->data_range;
2206 			break;
2207 		case UMEM_RANGE_ID_LARGE_FILE:
2208 			if (map->large_file_range.min_address != map->large_file_range.max_address) {
2209 				/* large file range is configured and should be used */
2210 				effective_range = map->large_file_range;
2211 			} else {
2212 				/*
2213 				 * the user asking for this user range might not have the
2214 				 * permissions to use the large file range (i.e., it doesn't
2215 				 * hold the correct entitlement), so we give it the data range
2216 				 * instead
2217 				 */
2218 				effective_range = map->data_range;
2219 			}
2220 			break;
2221 		case UMEM_RANGE_ID_FIXED:
2222 			/*
2223 			 * anywhere allocations with an address in "FIXED"
2224 			 * makes no sense, leave the range empty
2225 			 */
2226 			break;
2227 
2228 		default:
2229 			vm_map_range_invalid_panic(map, range_id);
2230 		}
2231 #endif /* CONFIG_MAP_RANGES */
2232 	} else {
2233 		/*
2234 		 * If minimum is 0, bump it up by PAGE_SIZE.  We want to limit
2235 		 * allocations of PAGEZERO to explicit requests since its
2236 		 * normal use is to catch dereferences of NULL and many
2237 		 * applications also treat pointers with a value of 0 as
2238 		 * special and suddenly having address 0 contain useable
2239 		 * memory would tend to confuse those applications.
2240 		 */
2241 		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
2242 		effective_range.max_address = map->max_offset;
2243 	}
2244 
2245 	return effective_range;
2246 }
2247 
2248 kern_return_t
vm_map_locate_space_anywhere(vm_map_t map,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_offset_t * start_inout,vm_map_entry_t * entry_out)2249 vm_map_locate_space_anywhere(
2250 	vm_map_t                map,
2251 	vm_map_size_t           size,
2252 	vm_map_offset_t         mask,
2253 	vm_map_kernel_flags_t   vmk_flags,
2254 	vm_map_offset_t        *start_inout,
2255 	vm_map_entry_t         *entry_out)
2256 {
2257 	struct mach_vm_range effective_range = {};
2258 	vm_map_size_t   guard_offset;
2259 	vm_map_offset_t hint, limit;
2260 	vm_map_entry_t  entry;
2261 	bool            is_kmem_ptr_range = false;
2262 
2263 	/*
2264 	 * Only supported by vm_map_enter() with a fixed address.
2265 	 */
2266 	assert(!vmk_flags.vmf_fixed);
2267 	assert(!vmk_flags.vmkf_beyond_max);
2268 
2269 	if (__improbable(map->wait_for_space)) {
2270 		/*
2271 		 * support for "wait_for_space" is minimal,
2272 		 * its only consumer is the ipc_kernel_copy_map.
2273 		 */
2274 		assert(!map->holelistenabled &&
2275 		    !vmk_flags.vmkf_last_free &&
2276 		    !vmk_flags.vmkf_keep_map_locked &&
2277 		    !vmk_flags.vmkf_map_jit &&
2278 		    !vmk_flags.vmf_random_addr &&
2279 		    *start_inout <= map->min_offset);
2280 	} else if (vmk_flags.vmkf_last_free) {
2281 		assert(!vmk_flags.vmkf_map_jit &&
2282 		    !vmk_flags.vmf_random_addr);
2283 	}
2284 
2285 	if (vmk_flags.vmkf_guard_before) {
2286 		guard_offset = VM_MAP_PAGE_SIZE(map);
2287 		assert(size > guard_offset);
2288 		size -= guard_offset;
2289 	} else {
2290 		assert(size != 0);
2291 		guard_offset = 0;
2292 	}
2293 
2294 	if (__improbable(!vm_map_is_map_size_valid(
2295 		    map, size, vmk_flags.vmkf_no_soft_limit))) {
2296 		return KERN_NO_SPACE;
2297 	}
2298 
2299 	/*
2300 	 * Validate range_id from flags and get associated range
2301 	 */
2302 	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
2303 	    &is_kmem_ptr_range);
2304 
2305 	if (is_kmem_ptr_range) {
2306 		return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
2307 		           vmk_flags.vmkf_last_free, start_inout, entry_out);
2308 	}
2309 
2310 #if XNU_TARGET_OS_OSX
2311 	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2312 		assert(map != kernel_map);
2313 		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2314 	}
2315 #endif /* XNU_TARGET_OS_OSX */
2316 
2317 again:
2318 	if (vmk_flags.vmkf_last_free) {
2319 		hint = *start_inout;
2320 
2321 		if (hint == 0 || hint > effective_range.max_address) {
2322 			hint = effective_range.max_address;
2323 		}
2324 		if (hint <= effective_range.min_address) {
2325 			return KERN_NO_SPACE;
2326 		}
2327 		limit = effective_range.min_address;
2328 	} else {
2329 		hint = *start_inout;
2330 
2331 		if (vmk_flags.vmkf_map_jit) {
2332 			if (map->jit_entry_exists &&
2333 			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2334 				return KERN_INVALID_ARGUMENT;
2335 			}
2336 			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2337 				vmk_flags.vmf_random_addr = true;
2338 			}
2339 		}
2340 
2341 		if (vmk_flags.vmf_random_addr) {
2342 			kern_return_t kr;
2343 
2344 			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2345 			if (kr != KERN_SUCCESS) {
2346 				return kr;
2347 			}
2348 		}
2349 #if __x86_64__
2350 		else if ((hint == 0 || hint == vm_map_min(map)) &&
2351 		    !map->disable_vmentry_reuse &&
2352 		    map->vmmap_high_start != 0) {
2353 			hint = map->vmmap_high_start;
2354 		}
2355 #endif /* __x86_64__ */
2356 
2357 		if (hint < effective_range.min_address) {
2358 			hint = effective_range.min_address;
2359 		}
2360 		if (effective_range.max_address <= hint) {
2361 			return KERN_NO_SPACE;
2362 		}
2363 
2364 		limit = effective_range.max_address;
2365 	}
2366 	entry = vm_map_store_find_space(map,
2367 	    hint, limit, vmk_flags.vmkf_last_free,
2368 	    guard_offset, size, mask,
2369 	    start_inout);
2370 
2371 	if (__improbable(entry == NULL)) {
2372 		if (map->wait_for_space &&
2373 		    guard_offset + size <=
2374 		    effective_range.max_address - effective_range.min_address) {
2375 			assert_wait((event_t)map, THREAD_ABORTSAFE);
2376 			vm_map_unlock(map);
2377 			thread_block(THREAD_CONTINUE_NULL);
2378 			vm_map_lock(map);
2379 			goto again;
2380 		}
2381 		return KERN_NO_SPACE;
2382 	}
2383 
2384 	if (entry_out) {
2385 		*entry_out = entry;
2386 	}
2387 	return KERN_SUCCESS;
2388 }
2389 
2390 /*!
2391  * @function vm_map_locate_space_fixed()
2392  *
2393  * @brief
2394  * Locate (no reservation) a range in the specified VM map at a fixed address.
2395  *
2396  * @param map           the map to scan for memory, must be locked.
2397  * @param start         the fixed address trying to be reserved
2398  * @param size          the size of the allocation to make.
2399  * @param mask          an alignment mask the allocation must respect,
2400  * @param vmk_flags     the vm map kernel flags to influence this call.
2401  *                      vmk_flags.vmf_anywhere must not be set.
2402  * @param entry_out     the entry right before the hole.
2403  * @param zap_list      a zap list of entries to clean up after the call.
2404  *
2405  * @returns
2406  * - KERN_SUCCESS in case of success and no conflicting entry is found,
2407  *   in which case entry_out is set to the entry before the hole.
2408  *
2409  * - KERN_MEMORY_PRESENT if a conflicting entry is found,
2410  *   in which case entry_out is set the conflicting entry,
2411  *   the callers MUST handle this error explicitly.
2412  *
2413  * - KERN_INVALID_ADDRESS if the specified @c start or @c size
2414  *   would result in a mapping outside of the map.
2415  *
2416  * - KERN_NO_SPACE for various cases of unrecoverable failures.
2417  */
2418 static kern_return_t
vm_map_locate_space_fixed(vm_map_t map,vm_map_offset_t start,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_entry_t * entry_out,vm_map_zap_t zap_list)2419 vm_map_locate_space_fixed(
2420 	vm_map_t                map,
2421 	vm_map_offset_t         start,
2422 	vm_map_size_t           size,
2423 	vm_map_offset_t         mask,
2424 	vm_map_kernel_flags_t   vmk_flags,
2425 	vm_map_entry_t         *entry_out,
2426 	vm_map_zap_t            zap_list)
2427 {
2428 	vm_map_offset_t effective_min_offset, effective_max_offset;
2429 	vm_map_entry_t  entry;
2430 	vm_map_offset_t end;
2431 
2432 	assert(vmk_flags.vmf_fixed);
2433 
2434 	effective_min_offset = map->min_offset;
2435 	effective_max_offset = map->max_offset;
2436 
2437 	if (vmk_flags.vmkf_beyond_max) {
2438 		/*
2439 		 * Allow an insertion beyond the map's max offset.
2440 		 */
2441 		effective_max_offset = 0x00000000FFFFF000ULL;
2442 		if (vm_map_is_64bit(map)) {
2443 			effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2444 		}
2445 #if XNU_TARGET_OS_OSX
2446 	} else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2447 		effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2448 #endif /* XNU_TARGET_OS_OSX */
2449 	}
2450 
2451 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2452 	    !vmk_flags.vmf_overwrite &&
2453 	    map->pmap == kernel_pmap &&
2454 	    vmk_flags.vm_tag == VM_MEMORY_REALLOC) {
2455 		/*
2456 		 * Force realloc() to switch to a new allocation,
2457 		 * to prevent 4k-fragmented virtual ranges.
2458 		 */
2459 //		DEBUG4K_ERROR("no realloc in place");
2460 		return KERN_NO_SPACE;
2461 	}
2462 
2463 	/*
2464 	 *	Verify that:
2465 	 *		the address doesn't itself violate
2466 	 *		the mask requirement.
2467 	 */
2468 
2469 	if ((start & mask) != 0) {
2470 		return KERN_NO_SPACE;
2471 	}
2472 
2473 	if (__improbable(!vm_map_is_map_size_valid(
2474 		    map, size, vmk_flags.vmkf_no_soft_limit))) {
2475 		return KERN_NO_SPACE;
2476 	}
2477 
2478 #if CONFIG_MAP_RANGES
2479 	if (map->uses_user_ranges) {
2480 		struct mach_vm_range r;
2481 
2482 		vm_map_user_range_resolve(map, start, 1, &r);
2483 		if (r.max_address == 0) {
2484 			return KERN_INVALID_ADDRESS;
2485 		}
2486 		effective_min_offset = r.min_address;
2487 		effective_max_offset = r.max_address;
2488 	}
2489 #endif /* CONFIG_MAP_RANGES */
2490 
2491 	if ((startup_phase >= STARTUP_SUB_KMEM) && !vmk_flags.vmkf_submap &&
2492 	    (map == kernel_map)) {
2493 		mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2494 		effective_min_offset = r->min_address;
2495 		effective_max_offset = r->max_address;
2496 	}
2497 
2498 	/*
2499 	 *	...	the address is within bounds
2500 	 */
2501 
2502 	end = start + size;
2503 
2504 	if ((start < effective_min_offset) ||
2505 	    (end > effective_max_offset) ||
2506 	    (start >= end)) {
2507 		return KERN_INVALID_ADDRESS;
2508 	}
2509 
2510 	if (vmk_flags.vmf_overwrite) {
2511 		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
2512 		kern_return_t remove_kr;
2513 
2514 		/*
2515 		 * Fixed mapping and "overwrite" flag: attempt to
2516 		 * remove all existing mappings in the specified
2517 		 * address range, saving them in our "zap_list".
2518 		 *
2519 		 * This avoids releasing the VM map lock in
2520 		 * vm_map_entry_delete() and allows atomicity
2521 		 * when we want to replace some mappings with a new one.
2522 		 * It also allows us to restore the old VM mappings if the
2523 		 * new mapping fails.
2524 		 */
2525 		remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2526 
2527 		if (vmk_flags.vmkf_overwrite_immutable) {
2528 			/* we can overwrite immutable mappings */
2529 			remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2530 		}
2531 		if (vmk_flags.vmkf_remap_prot_copy) {
2532 			remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2533 		}
2534 		remove_kr = vm_map_delete(map, start, end, remove_flags,
2535 		    KMEM_GUARD_NONE, zap_list).kmr_return;
2536 		if (remove_kr) {
2537 			/* XXX FBDP restore zap_list? */
2538 			return remove_kr;
2539 		}
2540 	}
2541 
2542 	/*
2543 	 *	...	the starting address isn't allocated
2544 	 */
2545 
2546 	if (vm_map_lookup_entry(map, start, &entry)) {
2547 		*entry_out = entry;
2548 		return KERN_MEMORY_PRESENT;
2549 	}
2550 
2551 	/*
2552 	 *	...	the next region doesn't overlap the
2553 	 *		end point.
2554 	 */
2555 
2556 	if ((entry->vme_next != vm_map_to_entry(map)) &&
2557 	    (entry->vme_next->vme_start < end)) {
2558 		return KERN_NO_SPACE;
2559 	}
2560 
2561 	*entry_out = entry;
2562 	return KERN_SUCCESS;
2563 }
2564 
2565 /*
2566  *	Routine:	vm_map_find_space
2567  *	Purpose:
2568  *		Allocate a range in the specified virtual address map,
2569  *		returning the entry allocated for that range.
2570  *		Used by kmem_alloc, etc.
2571  *
2572  *		The map must be NOT be locked. It will be returned locked
2573  *		on KERN_SUCCESS, unlocked on failure.
2574  *
2575  *		If an entry is allocated, the object/offset fields
2576  *		are initialized to zero.
2577  */
2578 kern_return_t
vm_map_find_space(vm_map_t map,vm_map_offset_t hint_address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_entry_t * o_entry)2579 vm_map_find_space(
2580 	vm_map_t                map,
2581 	vm_map_offset_t         hint_address,
2582 	vm_map_size_t           size,
2583 	vm_map_offset_t         mask,
2584 	vm_map_kernel_flags_t   vmk_flags,
2585 	vm_map_entry_t          *o_entry)       /* OUT */
2586 {
2587 	vm_map_entry_t          new_entry, entry;
2588 	kern_return_t           kr;
2589 
2590 	if (size == 0) {
2591 		return KERN_INVALID_ARGUMENT;
2592 	}
2593 
2594 	new_entry = vm_map_entry_create(map);
2595 	new_entry->use_pmap = true;
2596 	new_entry->protection = VM_PROT_DEFAULT;
2597 	new_entry->max_protection = VM_PROT_ALL;
2598 
2599 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2600 		new_entry->map_aligned = true;
2601 	}
2602 	if (vmk_flags.vmf_permanent) {
2603 		new_entry->vme_permanent = true;
2604 	}
2605 
2606 	vm_map_lock(map);
2607 
2608 	kr = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
2609 	    &hint_address, &entry);
2610 	if (kr != KERN_SUCCESS) {
2611 		vm_map_unlock(map);
2612 		vm_map_entry_dispose(new_entry);
2613 		return kr;
2614 	}
2615 	new_entry->vme_start = hint_address;
2616 	new_entry->vme_end = hint_address + size;
2617 
2618 	/*
2619 	 *	At this point,
2620 	 *
2621 	 *	- new_entry's "vme_start" and "vme_end" should define
2622 	 *	  the endpoints of the available new range,
2623 	 *
2624 	 *	- and "entry" should refer to the region before
2625 	 *	  the new range,
2626 	 *
2627 	 *	- and the map should still be locked.
2628 	 */
2629 
2630 	assert(page_aligned(new_entry->vme_start));
2631 	assert(page_aligned(new_entry->vme_end));
2632 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2633 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2634 
2635 
2636 	/*
2637 	 *	Insert the new entry into the list
2638 	 */
2639 
2640 	vm_map_store_entry_link(map, entry, new_entry,
2641 	    VM_MAP_KERNEL_FLAGS_NONE);
2642 	map->size += size;
2643 
2644 	/*
2645 	 *	Update the lookup hint
2646 	 */
2647 	SAVE_HINT_MAP_WRITE(map, new_entry);
2648 
2649 	*o_entry = new_entry;
2650 	return KERN_SUCCESS;
2651 }
2652 
2653 int vm_map_pmap_enter_print = FALSE;
2654 int vm_map_pmap_enter_enable = FALSE;
2655 
2656 /*
2657  *	Routine:	vm_map_pmap_enter [internal only]
2658  *
2659  *	Description:
2660  *		Force pages from the specified object to be entered into
2661  *		the pmap at the specified address if they are present.
2662  *		As soon as a page not found in the object the scan ends.
2663  *
2664  *	Returns:
2665  *		Nothing.
2666  *
2667  *	In/out conditions:
2668  *		The source map should not be locked on entry.
2669  */
2670 __unused static void
vm_map_pmap_enter(vm_map_t map,vm_map_offset_t addr,vm_map_offset_t end_addr,vm_object_t object,vm_object_offset_t offset,vm_prot_t protection)2671 vm_map_pmap_enter(
2672 	vm_map_t                map,
2673 	vm_map_offset_t         addr,
2674 	vm_map_offset_t         end_addr,
2675 	vm_object_t             object,
2676 	vm_object_offset_t      offset,
2677 	vm_prot_t               protection)
2678 {
2679 	int                     type_of_fault;
2680 	kern_return_t           kr;
2681 	uint8_t                 object_lock_type = 0;
2682 	struct vm_object_fault_info fault_info = {
2683 		.interruptible = THREAD_UNINT,
2684 	};
2685 
2686 	if (map->pmap == 0) {
2687 		return;
2688 	}
2689 
2690 	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2691 
2692 	while (addr < end_addr) {
2693 		vm_page_t       m;
2694 
2695 
2696 		/*
2697 		 * TODO:
2698 		 * From vm_map_enter(), we come into this function without the map
2699 		 * lock held or the object lock held.
2700 		 * We haven't taken a reference on the object either.
2701 		 * We should do a proper lookup on the map to make sure
2702 		 * that things are sane before we go locking objects that
2703 		 * could have been deallocated from under us.
2704 		 */
2705 
2706 		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2707 		vm_object_lock(object);
2708 
2709 		m = vm_page_lookup(object, offset);
2710 
2711 		if (m == VM_PAGE_NULL || m->vmp_busy || vm_page_is_fictitious(m) ||
2712 		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
2713 			vm_object_unlock(object);
2714 			return;
2715 		}
2716 
2717 		if (vm_map_pmap_enter_print) {
2718 			printf("vm_map_pmap_enter:");
2719 			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2720 			    map, (unsigned long long)addr, object, (unsigned long long)offset);
2721 		}
2722 		type_of_fault = DBG_CACHE_HIT_FAULT;
2723 		kr = vm_fault_enter(m, map->pmap,
2724 		    addr,
2725 		    PAGE_SIZE, 0,
2726 		    protection, protection,
2727 		    VM_PAGE_WIRED(m),
2728 		    VM_KERN_MEMORY_NONE,                 /* tag - not wiring */
2729 		    &fault_info,
2730 		    NULL,                  /* need_retry */
2731 		    &type_of_fault,
2732 		    &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/
2733 
2734 		vm_object_unlock(object);
2735 
2736 		offset += PAGE_SIZE_64;
2737 		addr += PAGE_SIZE;
2738 	}
2739 }
2740 
2741 #define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
2742 static kern_return_t
vm_map_random_address_for_size(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_kernel_flags_t vmk_flags)2743 vm_map_random_address_for_size(
2744 	vm_map_t                map,
2745 	vm_map_offset_t        *address,
2746 	vm_map_size_t           size,
2747 	vm_map_kernel_flags_t   vmk_flags)
2748 {
2749 	kern_return_t   kr = KERN_SUCCESS;
2750 	int             tries = 0;
2751 	vm_map_offset_t random_addr = 0;
2752 	vm_map_offset_t hole_end;
2753 
2754 	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
2755 	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
2756 	vm_map_size_t   vm_hole_size = 0;
2757 	vm_map_size_t   addr_space_size;
2758 	bool            is_kmem_ptr;
2759 	struct mach_vm_range effective_range;
2760 
2761 	effective_range = vm_map_get_range(map, address, &vmk_flags, size,
2762 	    &is_kmem_ptr);
2763 
2764 	addr_space_size = effective_range.max_address - effective_range.min_address;
2765 	if (size >= addr_space_size) {
2766 		return KERN_NO_SPACE;
2767 	}
2768 	addr_space_size -= size;
2769 
2770 	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));
2771 
2772 	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2773 		if (startup_phase < STARTUP_SUB_ZALLOC) {
2774 			random_addr = (vm_map_offset_t)early_random();
2775 		} else {
2776 			random_addr = (vm_map_offset_t)random();
2777 		}
2778 		random_addr <<= VM_MAP_PAGE_SHIFT(map);
2779 		random_addr = vm_map_trunc_page(
2780 			effective_range.min_address + (random_addr % addr_space_size),
2781 			VM_MAP_PAGE_MASK(map));
2782 
2783 #if CONFIG_PROB_GZALLOC
2784 		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
2785 			continue;
2786 		}
2787 #endif /* CONFIG_PROB_GZALLOC */
2788 
2789 		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
2790 			if (prev_entry == vm_map_to_entry(map)) {
2791 				next_entry = vm_map_first_entry(map);
2792 			} else {
2793 				next_entry = prev_entry->vme_next;
2794 			}
2795 			if (next_entry == vm_map_to_entry(map)) {
2796 				hole_end = vm_map_max(map);
2797 			} else {
2798 				hole_end = next_entry->vme_start;
2799 			}
2800 			vm_hole_size = hole_end - random_addr;
2801 			if (vm_hole_size >= size) {
2802 				*address = random_addr;
2803 				break;
2804 			}
2805 		}
2806 		tries++;
2807 	}
2808 
2809 	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2810 		kr = KERN_NO_SPACE;
2811 	}
2812 	return kr;
2813 }
2814 
2815 static boolean_t
vm_memory_malloc_no_cow(int alias)2816 vm_memory_malloc_no_cow(
2817 	int alias)
2818 {
2819 	uint64_t alias_mask;
2820 
2821 	if (!malloc_no_cow) {
2822 		return FALSE;
2823 	}
2824 	if (alias > 63) {
2825 		return FALSE;
2826 	}
2827 	alias_mask = 1ULL << alias;
2828 	if (alias_mask & vm_memory_malloc_no_cow_mask) {
2829 		return TRUE;
2830 	}
2831 	return FALSE;
2832 }
2833 
2834 uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2835 uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2836 /*
2837  *	Routine:	vm_map_enter
2838  *
2839  *	Description:
2840  *		Allocate a range in the specified virtual address map.
2841  *		The resulting range will refer to memory defined by
2842  *		the given memory object and offset into that object.
2843  *
2844  *		Arguments are as defined in the vm_map call.
2845  */
2846 static unsigned int vm_map_enter_restore_successes = 0;
2847 static unsigned int vm_map_enter_restore_failures = 0;
2848 kern_return_t
vm_map_enter(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)2849 vm_map_enter(
2850 	vm_map_t                map,
2851 	vm_map_offset_t         *address,       /* IN/OUT */
2852 	vm_map_size_t           size,
2853 	vm_map_offset_t         mask,
2854 	vm_map_kernel_flags_t   vmk_flags,
2855 	vm_object_t             object,
2856 	vm_object_offset_t      offset,
2857 	boolean_t               needs_copy,
2858 	vm_prot_t               cur_protection,
2859 	vm_prot_t               max_protection,
2860 	vm_inherit_t            inheritance)
2861 {
2862 	vm_map_entry_t          entry, new_entry;
2863 	vm_map_offset_t         start, tmp_start, tmp_offset;
2864 	vm_map_offset_t         end, tmp_end;
2865 	vm_map_offset_t         tmp2_start, tmp2_end;
2866 	vm_map_offset_t         step;
2867 	kern_return_t           result = KERN_SUCCESS;
2868 	bool                    map_locked = FALSE;
2869 	bool                    pmap_empty = TRUE;
2870 	bool                    new_mapping_established = FALSE;
2871 	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2872 	const bool              anywhere = !vmk_flags.vmf_fixed;
2873 	const bool              purgable = vmk_flags.vmf_purgeable;
2874 	const bool              no_cache = vmk_flags.vmf_no_cache;
2875 	const bool              is_submap = vmk_flags.vmkf_submap;
2876 	const bool              permanent = vmk_flags.vmf_permanent;
2877 	const bool              no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2878 	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
2879 	const bool              iokit_acct = vmk_flags.vmkf_iokit_acct;
2880 	const bool              resilient_codesign = vmk_flags.vmf_resilient_codesign;
2881 	const bool              resilient_media = vmk_flags.vmf_resilient_media;
2882 	const bool              entry_for_tpro = vmk_flags.vmf_tpro;
2883 	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
2884 	const vm_tag_t          alias = vmk_flags.vm_tag;
2885 	vm_tag_t                user_alias;
2886 	kern_return_t           kr;
2887 	bool                    clear_map_aligned = FALSE;
2888 	vm_map_size_t           chunk_size = 0;
2889 	vm_object_t             caller_object;
2890 	VM_MAP_ZAP_DECLARE(zap_old_list);
2891 	VM_MAP_ZAP_DECLARE(zap_new_list);
2892 
2893 	caller_object = object;
2894 
2895 	assertf(vmk_flags.__vmkf_unused2 == 0, "vmk_flags unused2=0x%llx\n", vmk_flags.__vmkf_unused2);
2896 
2897 	if (vmk_flags.vmf_4gb_chunk) {
2898 #if defined(__LP64__)
2899 		chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2900 #else /* __LP64__ */
2901 		chunk_size = ANON_CHUNK_SIZE;
2902 #endif /* __LP64__ */
2903 	} else {
2904 		chunk_size = ANON_CHUNK_SIZE;
2905 	}
2906 
2907 
2908 
2909 	if (superpage_size) {
2910 		if (object != VM_OBJECT_NULL) {
2911 			/* caller can't provide their own VM object */
2912 			return KERN_INVALID_ARGUMENT;
2913 		}
2914 		switch (superpage_size) {
2915 			/*
2916 			 * Note that the current implementation only supports
2917 			 * a single size for superpages, SUPERPAGE_SIZE, per
2918 			 * architecture. As soon as more sizes are supposed
2919 			 * to be supported, SUPERPAGE_SIZE has to be replaced
2920 			 * with a lookup of the size depending on superpage_size.
2921 			 */
2922 #ifdef __x86_64__
2923 		case SUPERPAGE_SIZE_ANY:
2924 			/* handle it like 2 MB and round up to page size */
2925 			size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2926 			OS_FALLTHROUGH;
2927 		case SUPERPAGE_SIZE_2MB:
2928 			break;
2929 #endif
2930 		default:
2931 			return KERN_INVALID_ARGUMENT;
2932 		}
2933 		mask = SUPERPAGE_SIZE - 1;
2934 		if (size & (SUPERPAGE_SIZE - 1)) {
2935 			return KERN_INVALID_ARGUMENT;
2936 		}
2937 		inheritance = VM_INHERIT_NONE;  /* fork() children won't inherit superpages */
2938 	}
2939 
2940 
2941 	if ((cur_protection & VM_PROT_WRITE) &&
2942 	    (cur_protection & VM_PROT_EXECUTE) &&
2943 #if XNU_TARGET_OS_OSX
2944 	    map->pmap != kernel_pmap &&
2945 	    (cs_process_global_enforcement() ||
2946 	    (vmk_flags.vmkf_cs_enforcement_override
2947 	    ? vmk_flags.vmkf_cs_enforcement
2948 	    : (vm_map_cs_enforcement(map)
2949 #if __arm64__
2950 	    || !VM_MAP_IS_EXOTIC(map)
2951 #endif /* __arm64__ */
2952 	    ))) &&
2953 #endif /* XNU_TARGET_OS_OSX */
2954 #if CODE_SIGNING_MONITOR
2955 	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2956 #endif
2957 	    (VM_MAP_POLICY_WX_FAIL(map) ||
2958 	    VM_MAP_POLICY_WX_STRIP_X(map)) &&
2959 	    !entry_for_jit) {
2960 		boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2961 
2962 		DTRACE_VM3(cs_wx,
2963 		    uint64_t, 0,
2964 		    uint64_t, 0,
2965 		    vm_prot_t, cur_protection);
2966 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2967 		    proc_selfpid(),
2968 		    (get_bsdtask_info(current_task())
2969 		    ? proc_name_address(get_bsdtask_info(current_task()))
2970 		    : "?"),
2971 		    __FUNCTION__,
2972 		    (vm_protect_wx_fail ? "failing" : "turning off execute"));
2973 		cur_protection &= ~VM_PROT_EXECUTE;
2974 		if (vm_protect_wx_fail) {
2975 			return KERN_PROTECTION_FAILURE;
2976 		}
2977 	}
2978 
2979 	if (entry_for_jit
2980 	    && cur_protection != VM_PROT_ALL) {
2981 		/*
2982 		 * Native macOS processes and all non-macOS processes are
2983 		 * expected to create JIT regions via mmap(MAP_JIT, RWX) but
2984 		 * the RWX requirement was not enforced, and thus, we must live
2985 		 * with our sins. We are now dealing with a JIT mapping without
2986 		 * RWX.
2987 		 *
2988 		 * We deal with these by letting the MAP_JIT stick in order
2989 		 * to avoid CS violations when these pages are mapped executable
2990 		 * down the line. In order to appease the page table monitor (you
2991 		 * know what I'm talking about), these pages will end up being
2992 		 * marked as XNU_USER_DEBUG, which will be allowed because we
2993 		 * don't enforce the code signing monitor on macOS systems. If
2994 		 * the user-space application ever changes permissions to RWX,
2995 		 * which they are allowed to since the mapping was originally
2996 		 * created with MAP_JIT, then they'll switch over to using the
2997 		 * XNU_USER_JIT type, and won't be allowed to downgrade any
2998 		 * more after that.
2999 		 *
3000 		 * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is
3001 		 * strictly disallowed.
3002 		 */
3003 
3004 #if XNU_TARGET_OS_OSX
3005 		/*
3006 		 * Continue to allow non-RWX JIT
3007 		 */
3008 #else
3009 		/* non-macOS: reject JIT regions without RWX */
3010 		DTRACE_VM3(cs_wx,
3011 		    uint64_t, 0,
3012 		    uint64_t, 0,
3013 		    vm_prot_t, cur_protection);
3014 		printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing. \n",
3015 		    proc_selfpid(),
3016 		    (get_bsdtask_info(current_task())
3017 		    ? proc_name_address(get_bsdtask_info(current_task()))
3018 		    : "?"),
3019 		    __FUNCTION__,
3020 		    cur_protection);
3021 		return KERN_PROTECTION_FAILURE;
3022 #endif
3023 	}
3024 
3025 	/*
3026 	 * If the task has requested executable lockdown,
3027 	 * deny any new executable mapping.
3028 	 */
3029 	if (map->map_disallow_new_exec == TRUE) {
3030 		if (cur_protection & VM_PROT_EXECUTE) {
3031 			return KERN_PROTECTION_FAILURE;
3032 		}
3033 	}
3034 
3035 	if (resilient_codesign) {
3036 		assert(!is_submap);
3037 		int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3038 		if ((cur_protection | max_protection) & reject_prot) {
3039 			return KERN_PROTECTION_FAILURE;
3040 		}
3041 	}
3042 
3043 	if (resilient_media) {
3044 		assert(!is_submap);
3045 //		assert(!needs_copy);
3046 		if (object != VM_OBJECT_NULL &&
3047 		    !object->internal) {
3048 			/*
3049 			 * This mapping is directly backed by an external
3050 			 * memory manager (e.g. a vnode pager for a file):
3051 			 * we would not have any safe place to inject
3052 			 * a zero-filled page if an actual page is not
3053 			 * available, without possibly impacting the actual
3054 			 * contents of the mapped object (e.g. the file),
3055 			 * so we can't provide any media resiliency here.
3056 			 */
3057 			return KERN_INVALID_ARGUMENT;
3058 		}
3059 	}
3060 
3061 	if (entry_for_tpro) {
3062 		/*
3063 		 * TPRO overrides the effective permissions of the region
3064 		 * and explicitly maps as RW. Ensure we have been passed
3065 		 * the expected permissions. We accept `cur_protections`
3066 		 * RO as that will be handled on fault.
3067 		 */
3068 		if (!(max_protection & VM_PROT_READ) ||
3069 		    !(max_protection & VM_PROT_WRITE) ||
3070 		    !(cur_protection & VM_PROT_READ)) {
3071 			return KERN_PROTECTION_FAILURE;
3072 		}
3073 
3074 		/*
3075 		 * We can now downgrade the cur_protection to RO. This is a mild lie
3076 		 * to the VM layer. But TPRO will be responsible for toggling the
3077 		 * protections between RO/RW
3078 		 */
3079 		cur_protection = VM_PROT_READ;
3080 	}
3081 
3082 	if (is_submap) {
3083 		vm_map_t submap;
3084 		if (purgable) {
3085 			/* submaps can not be purgeable */
3086 			return KERN_INVALID_ARGUMENT;
3087 		}
3088 		if (object == VM_OBJECT_NULL) {
3089 			/* submaps can not be created lazily */
3090 			return KERN_INVALID_ARGUMENT;
3091 		}
3092 		submap = (vm_map_t) object;
3093 		if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
3094 			/* page size mismatch */
3095 			return KERN_INVALID_ARGUMENT;
3096 		}
3097 	}
3098 	if (vmk_flags.vmkf_already) {
3099 		/*
3100 		 * VM_FLAGS_ALREADY says that it's OK if the same mapping
3101 		 * is already present.  For it to be meaningul, the requested
3102 		 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
3103 		 * we shouldn't try and remove what was mapped there first
3104 		 * (!VM_FLAGS_OVERWRITE).
3105 		 */
3106 		if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
3107 			return KERN_INVALID_ARGUMENT;
3108 		}
3109 	}
3110 
3111 	if (size == 0 ||
3112 	    (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
3113 		*address = 0;
3114 		return KERN_INVALID_ARGUMENT;
3115 	}
3116 
3117 	if (map->pmap == kernel_pmap) {
3118 		user_alias = VM_KERN_MEMORY_NONE;
3119 	} else {
3120 		user_alias = alias;
3121 	}
3122 
3123 	if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
3124 		chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
3125 	}
3126 
3127 #define RETURN(value)   { result = value; goto BailOut; }
3128 
3129 	assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
3130 	assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
3131 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
3132 		assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
3133 		assertf(page_aligned(size), "0x%llx", (uint64_t)size);
3134 	}
3135 
3136 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3137 	    !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
3138 		/*
3139 		 * In most cases, the caller rounds the size up to the
3140 		 * map's page size.
3141 		 * If we get a size that is explicitly not map-aligned here,
3142 		 * we'll have to respect the caller's wish and mark the
3143 		 * mapping as "not map-aligned" to avoid tripping the
3144 		 * map alignment checks later.
3145 		 */
3146 		clear_map_aligned = TRUE;
3147 	}
3148 	if (!anywhere &&
3149 	    VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3150 	    !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
3151 		/*
3152 		 * We've been asked to map at a fixed address and that
3153 		 * address is not aligned to the map's specific alignment.
3154 		 * The caller should know what it's doing (i.e. most likely
3155 		 * mapping some fragmented copy map, transferring memory from
3156 		 * a VM map with a different alignment), so clear map_aligned
3157 		 * for this new VM map entry and proceed.
3158 		 */
3159 		clear_map_aligned = TRUE;
3160 	}
3161 
3162 	/*
3163 	 * Only zero-fill objects are allowed to be purgable.
3164 	 * LP64todo - limit purgable objects to 32-bits for now
3165 	 */
3166 	if (purgable &&
3167 	    (offset != 0 ||
3168 	    (object != VM_OBJECT_NULL &&
3169 	    (object->vo_size != size ||
3170 	    object->purgable == VM_PURGABLE_DENY))
3171 #if __LP64__
3172 	    || size > ANON_MAX_SIZE
3173 #endif
3174 	    )) {
3175 		return KERN_INVALID_ARGUMENT;
3176 	}
3177 
3178 	if (__improbable(!vm_map_is_map_size_valid(
3179 		    map, size, vmk_flags.vmkf_no_soft_limit))) {
3180 		return KERN_NO_SPACE;
3181 	}
3182 
3183 	vm_map_lock(map);
3184 	map_locked = TRUE;
3185 
3186 
3187 	if (anywhere) {
3188 		result = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
3189 		    address, &entry);
3190 		start = *address;
3191 	} else {
3192 		start = *address;
3193 		result = vm_map_locate_space_fixed(map, start, size, mask,
3194 		    vmk_flags, &entry, &zap_old_list);
3195 	}
3196 
3197 	end = start + size;
3198 
3199 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
3200 
3201 	/*
3202 	 * Check if what's already there is what we want.
3203 	 */
3204 	if (result == KERN_MEMORY_PRESENT) {
3205 		assert(!anywhere);
3206 		if (!(vmk_flags.vmkf_already)) {
3207 			RETURN(KERN_NO_SPACE);
3208 		}
3209 		tmp_start = start;
3210 		tmp_offset = offset;
3211 		if (entry->vme_start < start) {
3212 			tmp_start -= start - entry->vme_start;
3213 			tmp_offset -= start - entry->vme_start;
3214 		}
3215 		for (; entry->vme_start < end;
3216 		    entry = entry->vme_next) {
3217 			/*
3218 			 * Check if the mapping's attributes
3219 			 * match the existing map entry.
3220 			 */
3221 			if (entry == vm_map_to_entry(map) ||
3222 			    entry->vme_start != tmp_start ||
3223 			    entry->is_sub_map != is_submap ||
3224 			    VME_OFFSET(entry) != tmp_offset ||
3225 			    entry->needs_copy != needs_copy ||
3226 			    entry->protection != cur_protection ||
3227 			    entry->max_protection != max_protection ||
3228 			    entry->inheritance != inheritance ||
3229 			    entry->iokit_acct != iokit_acct ||
3230 			    VME_ALIAS(entry) != alias) {
3231 				/* not the same mapping ! */
3232 				RETURN(KERN_NO_SPACE);
3233 			}
3234 			/*
3235 			 * Check if the same object is being mapped.
3236 			 */
3237 			if (is_submap) {
3238 				if (VME_SUBMAP(entry) !=
3239 				    (vm_map_t) object) {
3240 					/* not the same submap */
3241 					RETURN(KERN_NO_SPACE);
3242 				}
3243 			} else {
3244 				if (VME_OBJECT(entry) != object) {
3245 					/* not the same VM object... */
3246 					vm_object_t obj2;
3247 
3248 					obj2 = VME_OBJECT(entry);
3249 					if ((obj2 == VM_OBJECT_NULL || obj2->internal) &&
3250 					    (object == VM_OBJECT_NULL || object->internal)) {
3251 						/*
3252 						 * ... but both are
3253 						 * anonymous memory,
3254 						 * so equivalent.
3255 						 */
3256 					} else {
3257 						RETURN(KERN_NO_SPACE);
3258 					}
3259 				}
3260 			}
3261 
3262 			tmp_offset += entry->vme_end - entry->vme_start;
3263 			tmp_start += entry->vme_end - entry->vme_start;
3264 			if (entry->vme_end >= end) {
3265 				/* reached the end of our mapping */
3266 				break;
3267 			}
3268 		}
3269 		/* it all matches:  let's use what's already there ! */
3270 		RETURN(KERN_MEMORY_PRESENT);
3271 	}
3272 
3273 	if (result != KERN_SUCCESS) {
3274 		goto BailOut;
3275 	}
3276 
3277 
3278 	/*
3279 	 *	At this point,
3280 	 *		"start" and "end" should define the endpoints of the
3281 	 *			available new range, and
3282 	 *		"entry" should refer to the region before the new
3283 	 *			range, and
3284 	 *
3285 	 *		the map should be locked.
3286 	 */
3287 
3288 	/*
3289 	 *	See whether we can avoid creating a new entry (and object) by
3290 	 *	extending one of our neighbors.  [So far, we only attempt to
3291 	 *	extend from below.]  Note that we can never extend/join
3292 	 *	purgable objects because they need to remain distinct
3293 	 *	entities in order to implement their "volatile object"
3294 	 *	semantics.
3295 	 */
3296 
3297 	if (purgable ||
3298 	    entry_for_jit ||
3299 	    entry_for_tpro ||
3300 	    vm_memory_malloc_no_cow(user_alias)) {
3301 		if (superpage_size) {
3302 			/*
3303 			 * For "super page" allocations, we will allocate
3304 			 * special physically-contiguous VM objects later on,
3305 			 * so we should not have flags instructing us to create
3306 			 * a differently special VM object here.
3307 			 */
3308 			RETURN(KERN_INVALID_ARGUMENT);
3309 		}
3310 
3311 		if (object == VM_OBJECT_NULL) {
3312 			assert(!superpage_size);
3313 			object = vm_object_allocate(size, map->serial_id);
3314 			vm_object_lock(object);
3315 			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3316 			VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
3317 			if (malloc_no_cow_except_fork &&
3318 			    !purgable &&
3319 			    !entry_for_jit &&
3320 			    !entry_for_tpro &&
3321 			    vm_memory_malloc_no_cow(user_alias)) {
3322 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
3323 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
3324 			}
3325 			if (entry_for_jit) {
3326 				object->vo_inherit_copy_none = true;
3327 			}
3328 			if (purgable) {
3329 				task_t owner;
3330 				VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE);
3331 				if (map->pmap == kernel_pmap) {
3332 					/*
3333 					 * Purgeable mappings made in a kernel
3334 					 * map are "owned" by the kernel itself
3335 					 * rather than the current user task
3336 					 * because they're likely to be used by
3337 					 * more than this user task (see
3338 					 * execargs_purgeable_allocate(), for
3339 					 * example).
3340 					 */
3341 					owner = kernel_task;
3342 				} else {
3343 					owner = current_task();
3344 				}
3345 				assert(object->vo_owner == NULL);
3346 				assert(object->resident_page_count == 0);
3347 				assert(object->wired_page_count == 0);
3348 				vm_purgeable_nonvolatile_enqueue(object, owner);
3349 			}
3350 			vm_object_unlock(object);
3351 			offset = (vm_object_offset_t)0;
3352 		}
3353 	} else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
3354 		/* no coalescing if address space uses sub-pages */
3355 	} else if ((is_submap == FALSE) &&
3356 	    (object == VM_OBJECT_NULL) &&
3357 	    (entry != vm_map_to_entry(map)) &&
3358 	    (entry->vme_end == start) &&
3359 	    (!entry->is_shared) &&
3360 	    (!entry->is_sub_map) &&
3361 	    (!entry->in_transition) &&
3362 	    (!entry->needs_wakeup) &&
3363 	    (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3364 	    (entry->protection == cur_protection) &&
3365 	    (entry->max_protection == max_protection) &&
3366 	    (entry->inheritance == inheritance) &&
3367 	    ((user_alias == VM_MEMORY_REALLOC) ||
3368 	    (VME_ALIAS(entry) == alias)) &&
3369 	    (entry->no_cache == no_cache) &&
3370 	    (entry->vme_permanent == permanent) &&
3371 	    /* no coalescing for immutable executable mappings */
3372 	    !((entry->protection & VM_PROT_EXECUTE) &&
3373 	    entry->vme_permanent) &&
3374 	    (!entry->superpage_size && !superpage_size) &&
3375 	    /*
3376 	     * No coalescing if not map-aligned, to avoid propagating
3377 	     * that condition any further than needed:
3378 	     */
3379 	    (!entry->map_aligned || !clear_map_aligned) &&
3380 	    (!entry->zero_wired_pages) &&
3381 	    (!entry->used_for_jit && !entry_for_jit) &&
3382 #if __arm64e__
3383 	    (!entry->used_for_tpro && !entry_for_tpro) &&
3384 #endif
3385 	    (!entry->csm_associated) &&
3386 	    (entry->iokit_acct == iokit_acct) &&
3387 	    (!entry->vme_resilient_codesign) &&
3388 	    (!entry->vme_resilient_media) &&
3389 	    (!entry->vme_atomic) &&
3390 	    (entry->vme_no_copy_on_read == no_copy_on_read) &&
3391 
3392 	    ((entry->vme_end - entry->vme_start) + size <=
3393 	    (user_alias == VM_MEMORY_REALLOC ?
3394 	    ANON_CHUNK_SIZE :
3395 	    NO_COALESCE_LIMIT)) &&
3396 
3397 	    (entry->wired_count == 0)) {        /* implies user_wired_count == 0 */
3398 		if (vm_object_coalesce(VME_OBJECT(entry),
3399 		    VM_OBJECT_NULL,
3400 		    VME_OFFSET(entry),
3401 		    (vm_object_offset_t) 0,
3402 		    (vm_map_size_t)(entry->vme_end - entry->vme_start),
3403 		    (vm_map_size_t)(end - entry->vme_end))) {
3404 			/*
3405 			 *	Coalesced the two objects - can extend
3406 			 *	the previous map entry to include the
3407 			 *	new range.
3408 			 */
3409 			map->size += (end - entry->vme_end);
3410 			assert(entry->vme_start < end);
3411 			assert(VM_MAP_PAGE_ALIGNED(end,
3412 			    VM_MAP_PAGE_MASK(map)));
3413 			if (__improbable(vm_debug_events)) {
3414 				DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3415 			}
3416 			entry->vme_end = end;
3417 			if (map->holelistenabled) {
3418 				vm_map_store_update_first_free(map, entry, TRUE);
3419 			} else {
3420 				vm_map_store_update_first_free(map, map->first_free, TRUE);
3421 			}
3422 			new_mapping_established = TRUE;
3423 			RETURN(KERN_SUCCESS);
3424 		}
3425 	}
3426 
3427 	step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3428 	new_entry = NULL;
3429 
3430 	if (vmk_flags.vmkf_submap_adjust) {
3431 		vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3432 		offset = start;
3433 	}
3434 
3435 	for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3436 		tmp2_end = tmp2_start + step;
3437 		/*
3438 		 *	Create a new entry
3439 		 *
3440 		 * XXX FBDP
3441 		 * The reserved "page zero" in each process's address space can
3442 		 * be arbitrarily large.  Splitting it into separate objects and
3443 		 * therefore different VM map entries serves no purpose and just
3444 		 * slows down operations on the VM map, so let's not split the
3445 		 * allocation into chunks if the max protection is NONE.  That
3446 		 * memory should never be accessible, so it will never get to the
3447 		 * default pager.
3448 		 */
3449 		tmp_start = tmp2_start;
3450 		if (!is_submap &&
3451 		    object == VM_OBJECT_NULL &&
3452 		    size > chunk_size &&
3453 		    max_protection != VM_PROT_NONE &&
3454 		    superpage_size == 0) {
3455 			tmp_end = tmp_start + chunk_size;
3456 		} else {
3457 			tmp_end = tmp2_end;
3458 		}
3459 		do {
3460 			if (!is_submap &&
3461 			    object != VM_OBJECT_NULL &&
3462 			    object->internal &&
3463 			    offset + (tmp_end - tmp_start) > object->vo_size) {
3464 //				printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3465 				DTRACE_VM5(vm_map_enter_overmap,
3466 				    vm_map_t, map,
3467 				    vm_map_address_t, tmp_start,
3468 				    vm_map_address_t, tmp_end,
3469 				    vm_object_offset_t, offset,
3470 				    vm_object_size_t, object->vo_size);
3471 			}
3472 			new_entry = vm_map_entry_insert(map,
3473 			    entry, tmp_start, tmp_end,
3474 			    object, offset, vmk_flags,
3475 			    needs_copy,
3476 			    cur_protection, max_protection,
3477 			    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3478 			    VM_INHERIT_NONE : inheritance),
3479 			    clear_map_aligned);
3480 
3481 			assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3482 
3483 			if (resilient_codesign) {
3484 				int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3485 				if (!((cur_protection | max_protection) & reject_prot)) {
3486 					new_entry->vme_resilient_codesign = TRUE;
3487 				}
3488 			}
3489 
3490 			if (resilient_media &&
3491 			    (object == VM_OBJECT_NULL ||
3492 			    object->internal)) {
3493 				new_entry->vme_resilient_media = TRUE;
3494 			}
3495 
3496 			assert(!new_entry->iokit_acct);
3497 			if (!is_submap &&
3498 			    object != VM_OBJECT_NULL &&
3499 			    object->internal &&
3500 			    (object->purgable != VM_PURGABLE_DENY ||
3501 			    object->vo_ledger_tag)) {
3502 				assert(new_entry->use_pmap);
3503 				assert(!new_entry->iokit_acct);
3504 				/*
3505 				 * Turn off pmap accounting since
3506 				 * purgeable (or tagged) objects have their
3507 				 * own ledgers.
3508 				 */
3509 				new_entry->use_pmap = FALSE;
3510 			} else if (!is_submap &&
3511 			    iokit_acct &&
3512 			    object != VM_OBJECT_NULL &&
3513 			    object->internal) {
3514 				/* alternate accounting */
3515 				assert(!new_entry->iokit_acct);
3516 				assert(new_entry->use_pmap);
3517 				new_entry->iokit_acct = TRUE;
3518 				new_entry->use_pmap = FALSE;
3519 				DTRACE_VM4(
3520 					vm_map_iokit_mapped_region,
3521 					vm_map_t, map,
3522 					vm_map_offset_t, new_entry->vme_start,
3523 					vm_map_offset_t, new_entry->vme_end,
3524 					int, VME_ALIAS(new_entry));
3525 				vm_map_iokit_mapped_region(
3526 					map,
3527 					(new_entry->vme_end -
3528 					new_entry->vme_start));
3529 			} else if (!is_submap) {
3530 				assert(!new_entry->iokit_acct);
3531 				assert(new_entry->use_pmap);
3532 			}
3533 
3534 			if (is_submap) {
3535 				vm_map_t        submap;
3536 				boolean_t       submap_is_64bit;
3537 				boolean_t       use_pmap;
3538 
3539 				assert(new_entry->is_sub_map);
3540 				assert(!new_entry->use_pmap);
3541 				assert(!new_entry->iokit_acct);
3542 				submap = (vm_map_t) object;
3543 				submap_is_64bit = vm_map_is_64bit(submap);
3544 				use_pmap = vmk_flags.vmkf_nested_pmap;
3545 #ifndef NO_NESTED_PMAP
3546 				if (use_pmap && submap->pmap == NULL) {
3547 					ledger_t ledger = map->pmap->ledger;
3548 					/* we need a sub pmap to nest... */
3549 					submap->pmap = pmap_create_options(ledger, 0,
3550 					    submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3551 					if (submap->pmap == NULL) {
3552 						/* let's proceed without nesting... */
3553 					}
3554 #if defined(__arm64__)
3555 					else {
3556 						pmap_set_nested(submap->pmap);
3557 					}
3558 #endif
3559 				}
3560 				if (use_pmap && submap->pmap != NULL) {
3561 					if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3562 						DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3563 						kr = KERN_FAILURE;
3564 					} else {
3565 						kr = pmap_nest(map->pmap,
3566 						    submap->pmap,
3567 						    tmp_start,
3568 						    tmp_end - tmp_start);
3569 					}
3570 					if (kr != KERN_SUCCESS) {
3571 						printf("vm_map_enter: "
3572 						    "pmap_nest(0x%llx,0x%llx) "
3573 						    "error 0x%x\n",
3574 						    (long long)tmp_start,
3575 						    (long long)tmp_end,
3576 						    kr);
3577 					} else {
3578 						/* we're now nested ! */
3579 						new_entry->use_pmap = TRUE;
3580 						pmap_empty = FALSE;
3581 					}
3582 				}
3583 #endif /* NO_NESTED_PMAP */
3584 			}
3585 			entry = new_entry;
3586 
3587 			if (superpage_size) {
3588 				vm_page_t pages, m;
3589 				vm_object_t sp_object;
3590 				vm_object_offset_t sp_offset;
3591 
3592 				assert(object == VM_OBJECT_NULL);
3593 				VME_OFFSET_SET(entry, 0);
3594 
3595 				/* allocate one superpage */
3596 				kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3597 				if (kr != KERN_SUCCESS) {
3598 					/* deallocate whole range... */
3599 					new_mapping_established = TRUE;
3600 					/* ... but only up to "tmp_end" */
3601 					size -= end - tmp_end;
3602 					RETURN(kr);
3603 				}
3604 
3605 				/* create one vm_object per superpage */
3606 				sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start), map->serial_id);
3607 				vm_object_lock(sp_object);
3608 				sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3609 				VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE);
3610 				sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3611 				VME_OBJECT_SET(entry, sp_object, false, 0);
3612 				assert(entry->use_pmap);
3613 
3614 				/* enter the base pages into the object */
3615 				for (sp_offset = 0;
3616 				    sp_offset < SUPERPAGE_SIZE;
3617 				    sp_offset += PAGE_SIZE) {
3618 					m = pages;
3619 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3620 					pages = NEXT_PAGE(m);
3621 					*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3622 					vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3623 				}
3624 				vm_object_unlock(sp_object);
3625 			}
3626 		} while (tmp_end != tmp2_end &&
3627 		    (tmp_start = tmp_end) &&
3628 		    (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3629 		    tmp_end + chunk_size : tmp2_end));
3630 	}
3631 
3632 	new_mapping_established = TRUE;
3633 
3634 
3635 BailOut:
3636 	assert(map_locked == TRUE);
3637 
3638 	/*
3639 	 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3640 	 * If we have identified and possibly established the new mapping(s),
3641 	 * make sure we did not go beyond the address space limit.
3642 	 */
3643 	if (result == KERN_SUCCESS) {
3644 		if (map->size_limit != RLIM_INFINITY &&
3645 		    map->size > map->size_limit) {
3646 			/*
3647 			 * Establishing the requested mappings would exceed
3648 			 * the process's RLIMIT_AS limit: fail with
3649 			 * KERN_NO_SPACE.
3650 			 */
3651 			result = KERN_NO_SPACE;
3652 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3653 			    proc_selfpid(),
3654 			    (get_bsdtask_info(current_task())
3655 			    ? proc_name_address(get_bsdtask_info(current_task()))
3656 			    : "?"),
3657 			    __FUNCTION__,
3658 			    (uint64_t) map->size,
3659 			    (uint64_t) map->size_limit);
3660 			DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3661 			    vm_map_size_t, map->size,
3662 			    uint64_t, map->size_limit);
3663 			vm_map_enter_RLIMIT_AS_count++;
3664 		} else if (map->data_limit != RLIM_INFINITY &&
3665 		    map->size > map->data_limit) {
3666 			/*
3667 			 * Establishing the requested mappings would exceed
3668 			 * the process's RLIMIT_DATA limit: fail with
3669 			 * KERN_NO_SPACE.
3670 			 */
3671 			result = KERN_NO_SPACE;
3672 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3673 			    proc_selfpid(),
3674 			    (get_bsdtask_info(current_task())
3675 			    ? proc_name_address(get_bsdtask_info(current_task()))
3676 			    : "?"),
3677 			    __FUNCTION__,
3678 			    (uint64_t) map->size,
3679 			    (uint64_t) map->data_limit);
3680 			DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3681 			    vm_map_size_t, map->size,
3682 			    uint64_t, map->data_limit);
3683 			vm_map_enter_RLIMIT_DATA_count++;
3684 		}
3685 	}
3686 
3687 	if (result == KERN_SUCCESS) {
3688 		vm_prot_t pager_prot;
3689 		memory_object_t pager;
3690 
3691 #if DEBUG
3692 		if (pmap_empty &&
3693 		    !(vmk_flags.vmkf_no_pmap_check)) {
3694 			assert(pmap_is_empty(map->pmap,
3695 			    *address,
3696 			    *address + size));
3697 		}
3698 #endif /* DEBUG */
3699 
3700 		/*
3701 		 * For "named" VM objects, let the pager know that the
3702 		 * memory object is being mapped.  Some pagers need to keep
3703 		 * track of this, to know when they can reclaim the memory
3704 		 * object, for example.
3705 		 * VM calls memory_object_map() for each mapping (specifying
3706 		 * the protection of each mapping) and calls
3707 		 * memory_object_last_unmap() when all the mappings are gone.
3708 		 */
3709 		pager_prot = max_protection;
3710 		if (needs_copy) {
3711 			/*
3712 			 * Copy-On-Write mapping: won't modify
3713 			 * the memory object.
3714 			 */
3715 			pager_prot &= ~VM_PROT_WRITE;
3716 		}
3717 		if (!is_submap &&
3718 		    object != VM_OBJECT_NULL &&
3719 		    object->named &&
3720 		    object->pager != MEMORY_OBJECT_NULL) {
3721 			vm_object_lock(object);
3722 			pager = object->pager;
3723 			if (object->named &&
3724 			    pager != MEMORY_OBJECT_NULL) {
3725 				assert(object->pager_ready);
3726 				vm_object_mapping_wait(object, THREAD_UNINT);
3727 				/* object might have lost its pager while waiting */
3728 				pager = object->pager;
3729 				if (object->named && pager != MEMORY_OBJECT_NULL) {
3730 					vm_object_mapping_begin(object);
3731 					vm_object_unlock(object);
3732 
3733 					kr = memory_object_map(pager, pager_prot);
3734 					assert(kr == KERN_SUCCESS);
3735 
3736 					vm_object_lock(object);
3737 					vm_object_mapping_end(object);
3738 				}
3739 			}
3740 			vm_object_unlock(object);
3741 		}
3742 	}
3743 
3744 	assert(map_locked == TRUE);
3745 
3746 	if (new_mapping_established) {
3747 		/*
3748 		 * If we release the map lock for any reason below,
3749 		 * another thread could deallocate our new mapping,
3750 		 * releasing the caller's reference on "caller_object",
3751 		 * which was transferred to the mapping.
3752 		 * If this was the only reference, the object could be
3753 		 * destroyed.
3754 		 *
3755 		 * We need to take an extra reference on "caller_object"
3756 		 * to keep it alive if we need to return the caller's
3757 		 * reference to the caller in case of failure.
3758 		 */
3759 		if (is_submap) {
3760 			vm_map_reference((vm_map_t)caller_object);
3761 		} else {
3762 			vm_object_reference(caller_object);
3763 		}
3764 	}
3765 
3766 	if (!keep_map_locked) {
3767 		vm_map_unlock(map);
3768 		map_locked = FALSE;
3769 		entry = VM_MAP_ENTRY_NULL;
3770 		new_entry = VM_MAP_ENTRY_NULL;
3771 	}
3772 
3773 	/*
3774 	 * We can't hold the map lock if we enter this block.
3775 	 */
3776 
3777 	if (result == KERN_SUCCESS) {
3778 		/*	Wire down the new entry if the user
3779 		 *	requested all new map entries be wired.
3780 		 */
3781 		if ((map->wiring_required) || (superpage_size)) {
3782 			assert(!keep_map_locked);
3783 			pmap_empty = FALSE; /* pmap won't be empty */
3784 			kr = vm_map_wire_nested(map, start, end,
3785 			    cur_protection, VM_KERN_MEMORY_MLOCK,
3786 			    TRUE, PMAP_NULL, 0, NULL);
3787 			result = kr;
3788 		}
3789 
3790 	}
3791 
3792 	if (result != KERN_SUCCESS) {
3793 		if (new_mapping_established) {
3794 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3795 
3796 			/*
3797 			 * We have to get rid of the new mappings since we
3798 			 * won't make them available to the user.
3799 			 * Try and do that atomically, to minimize the risk
3800 			 * that someone else create new mappings that range.
3801 			 */
3802 			if (!map_locked) {
3803 				vm_map_lock(map);
3804 				map_locked = TRUE;
3805 			}
3806 			remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3807 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3808 			if (permanent) {
3809 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3810 			}
3811 			(void) vm_map_delete(map,
3812 			    *address, *address + size,
3813 			    remove_flags,
3814 			    KMEM_GUARD_NONE, &zap_new_list);
3815 		}
3816 
3817 		if (vm_map_zap_first_entry(&zap_old_list)) {
3818 			vm_map_entry_t entry1, entry2;
3819 
3820 			/*
3821 			 * The new mapping failed.  Attempt to restore
3822 			 * the old mappings, saved in the "zap_old_map".
3823 			 */
3824 			if (!map_locked) {
3825 				vm_map_lock(map);
3826 				map_locked = TRUE;
3827 			}
3828 
3829 			/* first check if the coast is still clear */
3830 			start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3831 			end   = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3832 
3833 			if (vm_map_lookup_entry(map, start, &entry1) ||
3834 			    vm_map_lookup_entry(map, end, &entry2) ||
3835 			    entry1 != entry2) {
3836 				/*
3837 				 * Part of that range has already been
3838 				 * re-mapped:  we can't restore the old
3839 				 * mappings...
3840 				 */
3841 				vm_map_enter_restore_failures++;
3842 			} else {
3843 				/*
3844 				 * Transfer the saved map entries from
3845 				 * "zap_old_map" to the original "map",
3846 				 * inserting them all after "entry1".
3847 				 */
3848 				while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3849 					vm_map_size_t entry_size;
3850 
3851 					entry_size = (entry2->vme_end -
3852 					    entry2->vme_start);
3853 					vm_map_store_entry_link(map, entry1, entry2,
3854 					    VM_MAP_KERNEL_FLAGS_NONE);
3855 					map->size += entry_size;
3856 					entry1 = entry2;
3857 				}
3858 				if (map->wiring_required) {
3859 					/*
3860 					 * XXX TODO: we should rewire the
3861 					 * old pages here...
3862 					 */
3863 				}
3864 				vm_map_enter_restore_successes++;
3865 			}
3866 		}
3867 	}
3868 
3869 	/*
3870 	 * The caller is responsible for releasing the lock if it requested to
3871 	 * keep the map locked.
3872 	 */
3873 	if (map_locked && !keep_map_locked) {
3874 		vm_map_unlock(map);
3875 	}
3876 
3877 	vm_map_zap_dispose(&zap_old_list);
3878 	vm_map_zap_dispose(&zap_new_list);
3879 
3880 	if (new_mapping_established) {
3881 		/*
3882 		 * The caller had a reference on "caller_object" and we
3883 		 * transferred that reference to the mapping.
3884 		 * We also took an extra reference on "caller_object" to keep
3885 		 * it alive while the map was unlocked.
3886 		 */
3887 		if (result == KERN_SUCCESS) {
3888 			/*
3889 			 * On success, the caller's reference on the object gets
3890 			 * tranferred to the mapping.
3891 			 * Release our extra reference.
3892 			 */
3893 			if (is_submap) {
3894 				vm_map_deallocate((vm_map_t)caller_object);
3895 			} else {
3896 				vm_object_deallocate(caller_object);
3897 			}
3898 		} else {
3899 			/*
3900 			 * On error, the caller expects to still have a
3901 			 * reference on the object it gave us.
3902 			 * Let's use our extra reference for that.
3903 			 */
3904 		}
3905 	}
3906 
3907 	return result;
3908 
3909 #undef  RETURN
3910 }
3911 
3912 /*
3913  * Counters for the prefault optimization.
3914  */
3915 int64_t vm_prefault_nb_pages = 0;
3916 int64_t vm_prefault_nb_bailout = 0;
3917 
3918 static kern_return_t
vm_map_enter_adjust_offset(vm_object_offset_t * obj_offs,vm_object_offset_t * obj_end,vm_object_offset_t quantity)3919 vm_map_enter_adjust_offset(
3920 	vm_object_offset_t *obj_offs,
3921 	vm_object_offset_t *obj_end,
3922 	vm_object_offset_t  quantity)
3923 {
3924 	if (os_add_overflow(*obj_offs, quantity, obj_offs) ||
3925 	    os_add_overflow(*obj_end, quantity, obj_end) ||
3926 	    vm_map_round_page_mask(*obj_end, PAGE_MASK) == 0) {
3927 		return KERN_INVALID_ARGUMENT;
3928 	}
3929 
3930 	return KERN_SUCCESS;
3931 }
3932 
3933 static __attribute__((always_inline, warn_unused_result))
3934 kern_return_t
vm_map_enter_mem_object_sanitize(vm_map_t target_map,vm_map_offset_ut address_u,vm_map_size_ut initial_size_u,vm_map_offset_ut mask_u,vm_object_offset_ut offset_u,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_map_address_t * map_addr,vm_map_size_t * map_size,vm_map_offset_t * mask,vm_object_offset_t * obj_offs,vm_object_offset_t * obj_end,vm_object_size_t * obj_size,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t * inheritance)3935 vm_map_enter_mem_object_sanitize(
3936 	vm_map_t                target_map,
3937 	vm_map_offset_ut        address_u,
3938 	vm_map_size_ut          initial_size_u,
3939 	vm_map_offset_ut        mask_u,
3940 	vm_object_offset_ut     offset_u,
3941 	vm_prot_ut              cur_protection_u,
3942 	vm_prot_ut              max_protection_u,
3943 	vm_inherit_ut           inheritance_u,
3944 	vm_map_kernel_flags_t   vmk_flags,
3945 	ipc_port_t              port,
3946 	vm_map_address_t       *map_addr,
3947 	vm_map_size_t          *map_size,
3948 	vm_map_offset_t        *mask,
3949 	vm_object_offset_t     *obj_offs,
3950 	vm_object_offset_t     *obj_end,
3951 	vm_object_size_t       *obj_size,
3952 	vm_prot_t              *cur_protection,
3953 	vm_prot_t              *max_protection,
3954 	vm_inherit_t           *inheritance)
3955 {
3956 	kern_return_t           result;
3957 
3958 	result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
3959 	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3960 	    VM_PROT_IS_MASK, cur_protection,
3961 	    max_protection);
3962 	if (__improbable(result != KERN_SUCCESS)) {
3963 		return result;
3964 	}
3965 
3966 	result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3967 	    inheritance);
3968 	if (__improbable(result != KERN_SUCCESS)) {
3969 		return result;
3970 	}
3971 
3972 	result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ, mask);
3973 	if (__improbable(result != KERN_SUCCESS)) {
3974 		return result;
3975 	}
3976 
3977 	if (vmk_flags.vmf_fixed) {
3978 		vm_map_address_t        map_end;
3979 
3980 		result = vm_sanitize_addr_size(address_u, initial_size_u,
3981 		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3982 		    target_map,
3983 		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_REALIGN_START,
3984 		    map_addr, &map_end, map_size);
3985 		if (__improbable(result != KERN_SUCCESS)) {
3986 			return result;
3987 		}
3988 	} else {
3989 		*map_addr = vm_sanitize_addr(target_map, address_u);
3990 		result = vm_sanitize_size(0, initial_size_u,
3991 		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3992 		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
3993 		if (__improbable(result != KERN_SUCCESS)) {
3994 			return result;
3995 		}
3996 	}
3997 
3998 	*obj_size = vm_object_round_page(*map_size);
3999 	if (__improbable(*obj_size == 0)) {
4000 		return KERN_INVALID_ARGUMENT;
4001 	}
4002 
4003 	if (IP_VALID(port)) {
4004 		result = vm_sanitize_addr_size(offset_u, *obj_size,
4005 		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
4006 		    PAGE_MASK,
4007 		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
4008 		    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
4009 		    obj_offs, obj_end, obj_size);
4010 		if (__improbable(result != KERN_SUCCESS)) {
4011 			return result;
4012 		}
4013 	} else {
4014 		*obj_offs = 0;
4015 		*obj_end  = *obj_size;
4016 	}
4017 
4018 	return KERN_SUCCESS;
4019 }
4020 
4021 kern_return_t
vm_map_enter_mem_object(vm_map_t target_map,vm_map_offset_ut * address_u,vm_map_size_ut initial_size_u,vm_map_offset_ut mask_u,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_ut offset_u,boolean_t copy,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,upl_page_list_ptr_t page_list,unsigned int page_list_count)4022 vm_map_enter_mem_object(
4023 	vm_map_t                target_map,
4024 	vm_map_offset_ut       *address_u,
4025 	vm_map_size_ut          initial_size_u,
4026 	vm_map_offset_ut        mask_u,
4027 	vm_map_kernel_flags_t   vmk_flags,
4028 	ipc_port_t              port,
4029 	vm_object_offset_ut     offset_u,
4030 	boolean_t               copy,
4031 	vm_prot_ut              cur_protection_u,
4032 	vm_prot_ut              max_protection_u,
4033 	vm_inherit_ut           inheritance_u,
4034 	upl_page_list_ptr_t     page_list,
4035 	unsigned int            page_list_count)
4036 {
4037 	vm_map_offset_t         mask;
4038 	vm_prot_t               cur_protection;
4039 	vm_prot_t               max_protection;
4040 	vm_inherit_t            inheritance;
4041 	vm_map_address_t        map_addr, map_mask;
4042 	vm_map_size_t           map_size;
4043 	vm_object_t             object = VM_OBJECT_NULL;
4044 	vm_object_offset_t      obj_offs, obj_end;
4045 	vm_object_size_t        obj_size;
4046 	kern_return_t           result;
4047 	boolean_t               mask_cur_protection, mask_max_protection;
4048 	boolean_t               kernel_prefault, try_prefault = (page_list_count != 0);
4049 	vm_map_offset_t         offset_in_mapping = 0;
4050 
4051 	if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4052 		/* XXX TODO4K prefaulting depends on page size... */
4053 		try_prefault = FALSE;
4054 	}
4055 
4056 	/*
4057 	 * Check arguments for validity
4058 	 */
4059 	if ((target_map == VM_MAP_NULL) ||
4060 	    (try_prefault && (copy || !page_list))) {
4061 		return KERN_INVALID_ARGUMENT;
4062 	}
4063 
4064 	map_mask = vm_map_page_mask(target_map);
4065 
4066 	/*
4067 	 * Sanitize any input parameters that are addr/size/prot/inherit
4068 	 */
4069 	result = vm_map_enter_mem_object_sanitize(
4070 		target_map,
4071 		*address_u,
4072 		initial_size_u,
4073 		mask_u,
4074 		offset_u,
4075 		cur_protection_u,
4076 		max_protection_u,
4077 		inheritance_u,
4078 		vmk_flags,
4079 		port,
4080 		&map_addr,
4081 		&map_size,
4082 		&mask,
4083 		&obj_offs,
4084 		&obj_end,
4085 		&obj_size,
4086 		&cur_protection,
4087 		&max_protection,
4088 		&inheritance);
4089 	if (__improbable(result != KERN_SUCCESS)) {
4090 		return vm_sanitize_get_kr(result);
4091 	}
4092 
4093 	assertf(vmk_flags.__vmkf_unused2 == 0, "vmk_flags unused2=0x%llx\n", vmk_flags.__vmkf_unused2);
4094 	vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, map_size);
4095 
4096 	mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4097 	mask_max_protection = max_protection & VM_PROT_IS_MASK;
4098 	cur_protection &= ~VM_PROT_IS_MASK;
4099 	max_protection &= ~VM_PROT_IS_MASK;
4100 
4101 #if __arm64__
4102 	if (cur_protection & VM_PROT_EXECUTE) {
4103 		cur_protection |= VM_PROT_READ;
4104 	}
4105 #endif /* __arm64__ */
4106 
4107 	/*
4108 	 * Find the vm object (if any) corresponding to this port.
4109 	 */
4110 	if (!IP_VALID(port)) {
4111 		object = VM_OBJECT_NULL;
4112 		copy = FALSE;
4113 	} else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4114 		vm_named_entry_t        named_entry;
4115 		vm_object_size_t        initial_size;
4116 
4117 		named_entry = mach_memory_entry_from_port(port);
4118 
4119 		if (vmk_flags.vmf_return_data_addr ||
4120 		    vmk_flags.vmf_return_4k_data_addr) {
4121 			result = vm_map_enter_adjust_offset(&obj_offs,
4122 			    &obj_end, named_entry->data_offset);
4123 			if (__improbable(result)) {
4124 				return result;
4125 			}
4126 		}
4127 
4128 		/* a few checks to make sure user is obeying rules */
4129 		if (mask_max_protection) {
4130 			max_protection &= named_entry->protection;
4131 		}
4132 		if (mask_cur_protection) {
4133 			cur_protection &= named_entry->protection;
4134 		}
4135 		if ((named_entry->protection & max_protection) !=
4136 		    max_protection) {
4137 			return KERN_INVALID_RIGHT;
4138 		}
4139 		if ((named_entry->protection & cur_protection) !=
4140 		    cur_protection) {
4141 			return KERN_INVALID_RIGHT;
4142 		}
4143 
4144 		/*
4145 		 * unwrap is safe because we know obj_size is larger and doesn't
4146 		 * overflow
4147 		 */
4148 		initial_size = VM_SANITIZE_UNSAFE_UNWRAP(initial_size_u);
4149 		if (named_entry->size < obj_offs + initial_size) {
4150 			return KERN_INVALID_ARGUMENT;
4151 		}
4152 
4153 		/* for a vm_map_copy, we can only map it whole */
4154 		if (named_entry->is_copy &&
4155 		    (obj_size != named_entry->size) &&
4156 		    (vm_map_round_page(obj_size, map_mask) == named_entry->size)) {
4157 			/* XXX FBDP use the rounded size... */
4158 			obj_end += named_entry->size - obj_size;
4159 			obj_size = named_entry->size;
4160 		}
4161 
4162 		if (named_entry->offset) {
4163 			/*
4164 			 * the callers parameter offset is defined to be the
4165 			 * offset from beginning of named entry offset in object
4166 			 *
4167 			 * Because we checked above that
4168 			 *   obj_offs + obj_size < named_entry_size
4169 			 * these overflow checks should be redundant...
4170 			 */
4171 			result = vm_map_enter_adjust_offset(&obj_offs,
4172 			    &obj_end, named_entry->offset);
4173 			if (__improbable(result)) {
4174 				return result;
4175 			}
4176 		}
4177 
4178 		if (!VM_MAP_PAGE_ALIGNED(obj_size, map_mask)) {
4179 			/*
4180 			 * Let's not map more than requested;
4181 			 * vm_map_enter() will handle this "not map-aligned"
4182 			 * case.
4183 			 */
4184 			map_size = obj_size;
4185 		}
4186 
4187 		named_entry_lock(named_entry);
4188 
4189 		// rdar://130307561 (Combine copy, object, and submap fields of vm_named_entry into an enum)
4190 		assert(named_entry->is_copy || named_entry->is_object || named_entry->is_sub_map);
4191 
4192 		if (named_entry->is_sub_map) {
4193 			vm_map_t                submap;
4194 
4195 			assert(!named_entry->is_copy);
4196 			assert(!named_entry->is_object);
4197 
4198 			if (vmk_flags.vmf_return_data_addr ||
4199 			    vmk_flags.vmf_return_4k_data_addr) {
4200 				panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4201 			}
4202 
4203 			submap = named_entry->backing.map;
4204 			vm_map_reference(submap);
4205 			named_entry_unlock(named_entry);
4206 
4207 			vmk_flags.vmkf_submap = TRUE;
4208 			result = vm_map_enter(target_map,
4209 			    &map_addr,
4210 			    map_size,
4211 			    mask,
4212 			    vmk_flags,
4213 			    (vm_object_t)(uintptr_t) submap,
4214 			    obj_offs,
4215 			    copy,
4216 			    cur_protection,
4217 			    max_protection,
4218 			    inheritance);
4219 			if (result != KERN_SUCCESS) {
4220 				vm_map_deallocate(submap);
4221 				return result;
4222 			}
4223 			/*
4224 			 * No need to lock "submap" just to check its
4225 			 * "mapped" flag: that flag is never reset
4226 			 * once it's been set and if we race, we'll
4227 			 * just end up setting it twice, which is OK.
4228 			 */
4229 			if (submap->mapped_in_other_pmaps == FALSE &&
4230 			    vm_map_pmap(submap) != PMAP_NULL &&
4231 			    vm_map_pmap(submap) !=
4232 			    vm_map_pmap(target_map)) {
4233 				/*
4234 				 * This submap is being mapped in a map
4235 				 * that uses a different pmap.
4236 				 * Set its "mapped_in_other_pmaps" flag
4237 				 * to indicate that we now need to
4238 				 * remove mappings from all pmaps rather
4239 				 * than just the submap's pmap.
4240 				 */
4241 				vm_map_lock(submap);
4242 				submap->mapped_in_other_pmaps = TRUE;
4243 				vm_map_unlock(submap);
4244 			}
4245 			goto out;
4246 		}
4247 
4248 		if (named_entry->is_copy) {
4249 			kern_return_t   kr;
4250 			vm_map_copy_t   copy_map;
4251 			vm_map_entry_t  copy_entry;
4252 			vm_map_offset_t copy_addr;
4253 			vm_map_copy_t   target_copy_map;
4254 			vm_map_offset_t overmap_start, overmap_end;
4255 			vm_map_offset_t trimmed_start;
4256 			vm_map_size_t   target_size;
4257 
4258 			assert(!named_entry->is_object);
4259 			assert(!named_entry->is_sub_map);
4260 
4261 			int allowed_flags = VM_FLAGS_FIXED |
4262 			    VM_FLAGS_ANYWHERE |
4263 			    VM_FLAGS_OVERWRITE |
4264 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4265 			    VM_FLAGS_RETURN_DATA_ADDR;
4266 
4267 			if (!vm_map_kernel_flags_check_vmflags(vmk_flags, allowed_flags)) {
4268 				named_entry_unlock(named_entry);
4269 				return KERN_INVALID_ARGUMENT;
4270 			}
4271 
4272 			copy_map = named_entry->backing.copy;
4273 			assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4274 			if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4275 				/* unsupported type; should not happen */
4276 				printf("vm_map_enter_mem_object: "
4277 				    "memory_entry->backing.copy "
4278 				    "unsupported type 0x%x\n",
4279 				    copy_map->type);
4280 				named_entry_unlock(named_entry);
4281 				return KERN_INVALID_ARGUMENT;
4282 			}
4283 
4284 			if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4285 				DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, obj_offs, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4286 			}
4287 
4288 			if (vmk_flags.vmf_return_data_addr ||
4289 			    vmk_flags.vmf_return_4k_data_addr) {
4290 				offset_in_mapping = obj_offs & map_mask;
4291 				if (vmk_flags.vmf_return_4k_data_addr) {
4292 					offset_in_mapping &= ~((signed)(0xFFF));
4293 				}
4294 			}
4295 
4296 			target_copy_map = VM_MAP_COPY_NULL;
4297 			target_size = copy_map->size;
4298 			overmap_start = 0;
4299 			overmap_end = 0;
4300 			trimmed_start = 0;
4301 			if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4302 				DEBUG4K_ADJUST("adjusting...\n");
4303 				kr = vm_map_copy_adjust_to_target(
4304 					copy_map,
4305 					obj_offs,
4306 					initial_size,
4307 					target_map,
4308 					copy,
4309 					&target_copy_map,
4310 					&overmap_start,
4311 					&overmap_end,
4312 					&trimmed_start);
4313 				if (kr != KERN_SUCCESS) {
4314 					named_entry_unlock(named_entry);
4315 					return kr;
4316 				}
4317 				target_size = target_copy_map->size;
4318 			} else {
4319 				/*
4320 				 * Assert that the vm_map_copy is coming from the right
4321 				 * zone and hasn't been forged
4322 				 */
4323 				vm_map_copy_require(copy_map);
4324 				target_copy_map = copy_map;
4325 			}
4326 
4327 			vm_map_kernel_flags_t rsv_flags = vmk_flags;
4328 
4329 			vm_map_kernel_flags_and_vmflags(&rsv_flags,
4330 			    (VM_FLAGS_FIXED |
4331 			    VM_FLAGS_ANYWHERE |
4332 			    VM_FLAGS_OVERWRITE |
4333 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4334 			    VM_FLAGS_RETURN_DATA_ADDR));
4335 
4336 			/* reserve a contiguous range */
4337 			kr = vm_map_enter(target_map,
4338 			    &map_addr,
4339 			    vm_map_round_page(target_size, map_mask),
4340 			    mask,
4341 			    rsv_flags,
4342 			    VM_OBJECT_NULL,
4343 			    0,
4344 			    FALSE,               /* copy */
4345 			    cur_protection,
4346 			    max_protection,
4347 			    inheritance);
4348 			if (kr != KERN_SUCCESS) {
4349 				DEBUG4K_ERROR("kr 0x%x\n", kr);
4350 				if (target_copy_map != copy_map) {
4351 					vm_map_copy_discard(target_copy_map);
4352 					target_copy_map = VM_MAP_COPY_NULL;
4353 				}
4354 				named_entry_unlock(named_entry);
4355 				return kr;
4356 			}
4357 
4358 			copy_addr = map_addr;
4359 
4360 			for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4361 			    copy_entry != vm_map_copy_to_entry(target_copy_map);
4362 			    copy_entry = copy_entry->vme_next) {
4363 				vm_map_t                copy_submap = VM_MAP_NULL;
4364 				vm_object_t             copy_object = VM_OBJECT_NULL;
4365 				vm_map_size_t           copy_size;
4366 				vm_object_offset_t      copy_offset;
4367 				boolean_t               do_copy = false;
4368 
4369 				if (copy_entry->is_sub_map) {
4370 					copy_submap = VME_SUBMAP(copy_entry);
4371 					copy_object = (vm_object_t)copy_submap;
4372 				} else {
4373 					copy_object = VME_OBJECT(copy_entry);
4374 				}
4375 				copy_offset = VME_OFFSET(copy_entry);
4376 				copy_size = (copy_entry->vme_end -
4377 				    copy_entry->vme_start);
4378 
4379 				/* sanity check */
4380 				if ((copy_addr + copy_size) >
4381 				    (map_addr +
4382 				    overmap_start + overmap_end +
4383 				    named_entry->size /* XXX full size */)) {
4384 					/* over-mapping too much !? */
4385 					kr = KERN_INVALID_ARGUMENT;
4386 					DEBUG4K_ERROR("kr 0x%x\n", kr);
4387 					/* abort */
4388 					break;
4389 				}
4390 
4391 				/* take a reference on the object */
4392 				if (copy_entry->is_sub_map) {
4393 					vm_map_reference(copy_submap);
4394 				} else {
4395 					if (!copy &&
4396 					    copy_object != VM_OBJECT_NULL &&
4397 					    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4398 						bool is_writable;
4399 
4400 						/*
4401 						 * We need to resolve our side of this
4402 						 * "symmetric" copy-on-write now; we
4403 						 * need a new object to map and share,
4404 						 * instead of the current one which
4405 						 * might still be shared with the
4406 						 * original mapping.
4407 						 *
4408 						 * Note: A "vm_map_copy_t" does not
4409 						 * have a lock but we're protected by
4410 						 * the named entry's lock here.
4411 						 */
4412 						// assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4413 						VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4414 						assert(copy_object != VME_OBJECT(copy_entry));
4415 						is_writable = false;
4416 						if (copy_entry->protection & VM_PROT_WRITE) {
4417 							is_writable = true;
4418 #if __arm64e__
4419 						} else if (copy_entry->used_for_tpro) {
4420 							is_writable = true;
4421 #endif /* __arm64e__ */
4422 						}
4423 						if (!copy_entry->needs_copy && is_writable) {
4424 							vm_prot_t prot;
4425 
4426 							prot = copy_entry->protection & ~VM_PROT_WRITE;
4427 							vm_object_pmap_protect(copy_object,
4428 							    copy_offset,
4429 							    copy_size,
4430 							    PMAP_NULL,
4431 							    PAGE_SIZE,
4432 							    0,
4433 							    prot);
4434 						}
4435 						copy_entry->needs_copy = FALSE;
4436 						copy_entry->is_shared = TRUE;
4437 						copy_object = VME_OBJECT(copy_entry);
4438 						copy_offset = VME_OFFSET(copy_entry);
4439 						vm_object_lock(copy_object);
4440 						/* we're about to make a shared mapping of this object */
4441 						copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4442 						VM_OBJECT_SET_TRUE_SHARE(copy_object, TRUE);
4443 						vm_object_unlock(copy_object);
4444 					}
4445 
4446 					if (copy_object != VM_OBJECT_NULL &&
4447 					    copy_object->named &&
4448 					    copy_object->pager != MEMORY_OBJECT_NULL &&
4449 					    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4450 						memory_object_t pager;
4451 						vm_prot_t       pager_prot;
4452 
4453 						/*
4454 						 * For "named" VM objects, let the pager know that the
4455 						 * memory object is being mapped.  Some pagers need to keep
4456 						 * track of this, to know when they can reclaim the memory
4457 						 * object, for example.
4458 						 * VM calls memory_object_map() for each mapping (specifying
4459 						 * the protection of each mapping) and calls
4460 						 * memory_object_last_unmap() when all the mappings are gone.
4461 						 */
4462 						pager_prot = max_protection;
4463 						if (copy) {
4464 							/*
4465 							 * Copy-On-Write mapping: won't modify the
4466 							 * memory object.
4467 							 */
4468 							pager_prot &= ~VM_PROT_WRITE;
4469 						}
4470 						vm_object_lock(copy_object);
4471 						pager = copy_object->pager;
4472 						if (copy_object->named &&
4473 						    pager != MEMORY_OBJECT_NULL &&
4474 						    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4475 							assert(copy_object->pager_ready);
4476 							vm_object_mapping_wait(copy_object, THREAD_UNINT);
4477 							/*
4478 							 * Object might have lost its pager
4479 							 * while waiting.
4480 							 */
4481 							pager = copy_object->pager;
4482 							if (copy_object->named &&
4483 							    pager != MEMORY_OBJECT_NULL) {
4484 								vm_object_mapping_begin(copy_object);
4485 								vm_object_unlock(copy_object);
4486 
4487 								kr = memory_object_map(pager, pager_prot);
4488 								assert(kr == KERN_SUCCESS);
4489 
4490 								vm_object_lock(copy_object);
4491 								vm_object_mapping_end(copy_object);
4492 							}
4493 						}
4494 						vm_object_unlock(copy_object);
4495 					}
4496 
4497 					/*
4498 					 *	Perform the copy if requested
4499 					 */
4500 
4501 					if (copy && copy_object != VM_OBJECT_NULL) {
4502 						vm_object_t             new_object;
4503 						vm_object_offset_t      new_offset;
4504 
4505 						result = vm_object_copy_strategically(copy_object, copy_offset,
4506 						    copy_size,
4507 						    false,                                   /* forking */
4508 						    &new_object, &new_offset,
4509 						    &do_copy);
4510 
4511 
4512 						if (result == KERN_MEMORY_RESTART_COPY) {
4513 							boolean_t success;
4514 							boolean_t src_needs_copy;
4515 
4516 							/*
4517 							 * XXX
4518 							 * We currently ignore src_needs_copy.
4519 							 * This really is the issue of how to make
4520 							 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4521 							 * non-kernel users to use. Solution forthcoming.
4522 							 * In the meantime, since we don't allow non-kernel
4523 							 * memory managers to specify symmetric copy,
4524 							 * we won't run into problems here.
4525 							 */
4526 							new_object = copy_object;
4527 							new_offset = copy_offset;
4528 							success = vm_object_copy_quickly(new_object,
4529 							    new_offset,
4530 							    copy_size,
4531 							    &src_needs_copy,
4532 							    &do_copy);
4533 							assert(success);
4534 							result = KERN_SUCCESS;
4535 						}
4536 						if (result != KERN_SUCCESS) {
4537 							kr = result;
4538 							break;
4539 						}
4540 
4541 						copy_object = new_object;
4542 						copy_offset = new_offset;
4543 						/*
4544 						 * No extra object reference for the mapping:
4545 						 * the mapping should be the only thing keeping
4546 						 * this new object alive.
4547 						 */
4548 					} else {
4549 						/*
4550 						 * We already have the right object
4551 						 * to map.
4552 						 */
4553 						copy_object = VME_OBJECT(copy_entry);
4554 						/* take an extra ref for the mapping below */
4555 						vm_object_reference(copy_object);
4556 					}
4557 				}
4558 
4559 				/*
4560 				 * If the caller does not want a specific
4561 				 * tag for this new mapping:  use
4562 				 * the tag of the original mapping.
4563 				 */
4564 				vm_map_kernel_flags_t vmk_remap_flags = {
4565 					.vmkf_submap = copy_entry->is_sub_map,
4566 				};
4567 
4568 				vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4569 				    vm_map_kernel_flags_vmflags(vmk_flags),
4570 				    vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4571 
4572 				/* over-map the object into destination */
4573 				vmk_remap_flags.vmf_fixed = true;
4574 				vmk_remap_flags.vmf_overwrite = true;
4575 
4576 				if (!copy && !copy_entry->is_sub_map) {
4577 					/*
4578 					 * copy-on-write should have been
4579 					 * resolved at this point, or we would
4580 					 * end up sharing instead of copying.
4581 					 */
4582 					assert(!copy_entry->needs_copy);
4583 				}
4584 #if XNU_TARGET_OS_OSX
4585 				if (copy_entry->used_for_jit) {
4586 					vmk_remap_flags.vmkf_map_jit = TRUE;
4587 				}
4588 #endif /* XNU_TARGET_OS_OSX */
4589 
4590 				kr = vm_map_enter(target_map,
4591 				    &copy_addr,
4592 				    copy_size,
4593 				    (vm_map_offset_t) 0,
4594 				    vmk_remap_flags,
4595 				    copy_object,
4596 				    copy_offset,
4597 				    ((copy_object == NULL)
4598 				    ? FALSE
4599 				    : (copy || copy_entry->needs_copy)),
4600 				    cur_protection,
4601 				    max_protection,
4602 				    inheritance);
4603 				if (kr != KERN_SUCCESS) {
4604 					DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4605 					if (copy_entry->is_sub_map) {
4606 						vm_map_deallocate(copy_submap);
4607 					} else {
4608 						vm_object_deallocate(copy_object);
4609 					}
4610 					/* abort */
4611 					break;
4612 				}
4613 
4614 				/* next mapping */
4615 				copy_addr += copy_size;
4616 			}
4617 
4618 			named_entry_unlock(named_entry);
4619 			if (target_copy_map != copy_map) {
4620 				vm_map_copy_discard(target_copy_map);
4621 				target_copy_map = VM_MAP_COPY_NULL;
4622 			}
4623 
4624 			if (kr == KERN_SUCCESS) {
4625 				if (overmap_start) {
4626 					DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t)offset_in_mapping, (uint64_t)overmap_start, (uint64_t)(map_addr + offset_in_mapping + overmap_start));
4627 				}
4628 				offset_in_mapping += overmap_start;
4629 			} else if (!vmk_flags.vmf_overwrite) {
4630 				/* deallocate the contiguous range */
4631 				vm_map_remove(target_map, map_addr,
4632 				    map_addr + map_size);
4633 			}
4634 			result = kr;
4635 			goto out;
4636 		}
4637 
4638 		if (named_entry->is_object) {
4639 			unsigned int    access;
4640 			unsigned int    wimg_mode;
4641 
4642 			assert(!named_entry->is_copy);
4643 			assert(!named_entry->is_sub_map);
4644 
4645 			/* we are mapping a VM object */
4646 
4647 			access = named_entry->access;
4648 
4649 			if (vmk_flags.vmf_return_data_addr ||
4650 			    vmk_flags.vmf_return_4k_data_addr) {
4651 				offset_in_mapping = obj_offs & map_mask;
4652 				if (vmk_flags.vmf_return_4k_data_addr) {
4653 					offset_in_mapping &= ~((signed)(0xFFF));
4654 				}
4655 				obj_offs -= offset_in_mapping;
4656 				map_size  = vm_map_round_page(initial_size +
4657 				    offset_in_mapping, map_mask);
4658 			}
4659 
4660 			object = vm_named_entry_to_vm_object(named_entry);
4661 			assert(object != VM_OBJECT_NULL);
4662 			vm_object_lock(object);
4663 			named_entry_unlock(named_entry);
4664 
4665 			wimg_mode = object->wimg_bits;
4666 			vm_prot_to_wimg(access, &wimg_mode);
4667 			if (object->wimg_bits != wimg_mode) {
4668 				vm_object_change_wimg_mode(object, wimg_mode);
4669 			}
4670 
4671 			vm_object_reference_locked(object);
4672 			vm_object_unlock(object);
4673 		} else {
4674 			panic("invalid VM named entry %p", named_entry);
4675 		}
4676 	} else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4677 		/*
4678 		 * JMM - This is temporary until we unify named entries
4679 		 * and raw memory objects.
4680 		 *
4681 		 * Detected fake ip_kotype for a memory object.  In
4682 		 * this case, the port isn't really a port at all, but
4683 		 * instead is just a raw memory object.
4684 		 */
4685 		if (vmk_flags.vmf_return_data_addr ||
4686 		    vmk_flags.vmf_return_4k_data_addr) {
4687 			panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4688 		}
4689 
4690 		object = memory_object_to_vm_object((memory_object_t)port);
4691 		if (object == VM_OBJECT_NULL) {
4692 			return KERN_INVALID_OBJECT;
4693 		}
4694 		vm_object_reference(object);
4695 
4696 		/* wait for object (if any) to be ready */
4697 		if (object != VM_OBJECT_NULL) {
4698 			if (is_kernel_object(object)) {
4699 				printf("Warning: Attempt to map kernel object"
4700 				    " by a non-private kernel entity\n");
4701 				return KERN_INVALID_OBJECT;
4702 			}
4703 			if (!object->pager_ready) {
4704 				vm_object_lock(object);
4705 
4706 				while (!object->pager_ready) {
4707 					vm_object_sleep(object,
4708 					    VM_OBJECT_EVENT_PAGER_READY,
4709 					    THREAD_UNINT,
4710 					    LCK_SLEEP_EXCLUSIVE);
4711 				}
4712 				vm_object_unlock(object);
4713 			}
4714 		}
4715 	} else {
4716 		return KERN_INVALID_OBJECT;
4717 	}
4718 
4719 	if (object != VM_OBJECT_NULL &&
4720 	    object->named &&
4721 	    object->pager != MEMORY_OBJECT_NULL &&
4722 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4723 		memory_object_t pager;
4724 		vm_prot_t       pager_prot;
4725 		kern_return_t   kr;
4726 
4727 		/*
4728 		 * For "named" VM objects, let the pager know that the
4729 		 * memory object is being mapped.  Some pagers need to keep
4730 		 * track of this, to know when they can reclaim the memory
4731 		 * object, for example.
4732 		 * VM calls memory_object_map() for each mapping (specifying
4733 		 * the protection of each mapping) and calls
4734 		 * memory_object_last_unmap() when all the mappings are gone.
4735 		 */
4736 		pager_prot = max_protection;
4737 		if (copy) {
4738 			/*
4739 			 * Copy-On-Write mapping: won't modify the
4740 			 * memory object.
4741 			 */
4742 			pager_prot &= ~VM_PROT_WRITE;
4743 		}
4744 		vm_object_lock(object);
4745 		pager = object->pager;
4746 		if (object->named &&
4747 		    pager != MEMORY_OBJECT_NULL &&
4748 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4749 			assert(object->pager_ready);
4750 			vm_object_mapping_wait(object, THREAD_UNINT);
4751 			/* object might have lost its pager while waiting */
4752 			pager = object->pager;
4753 			if (object->named && pager != MEMORY_OBJECT_NULL) {
4754 				vm_object_mapping_begin(object);
4755 				vm_object_unlock(object);
4756 
4757 				kr = memory_object_map(pager, pager_prot);
4758 				assert(kr == KERN_SUCCESS);
4759 
4760 				vm_object_lock(object);
4761 				vm_object_mapping_end(object);
4762 			}
4763 		}
4764 		vm_object_unlock(object);
4765 	}
4766 
4767 	/*
4768 	 *	Perform the copy if requested
4769 	 */
4770 
4771 	if (copy) {
4772 		vm_object_t             new_object;
4773 		vm_object_offset_t      new_offset;
4774 
4775 		result = vm_object_copy_strategically(object,
4776 		    obj_offs,
4777 		    map_size,
4778 		    false,                                   /* forking */
4779 		    &new_object, &new_offset,
4780 		    &copy);
4781 
4782 
4783 		if (result == KERN_MEMORY_RESTART_COPY) {
4784 			boolean_t success;
4785 			boolean_t src_needs_copy;
4786 
4787 			/*
4788 			 * XXX
4789 			 * We currently ignore src_needs_copy.
4790 			 * This really is the issue of how to make
4791 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4792 			 * non-kernel users to use. Solution forthcoming.
4793 			 * In the meantime, since we don't allow non-kernel
4794 			 * memory managers to specify symmetric copy,
4795 			 * we won't run into problems here.
4796 			 */
4797 			new_object = object;
4798 			new_offset = obj_offs;
4799 			success = vm_object_copy_quickly(new_object,
4800 			    new_offset,
4801 			    map_size,
4802 			    &src_needs_copy,
4803 			    &copy);
4804 			assert(success);
4805 			result = KERN_SUCCESS;
4806 		}
4807 		/*
4808 		 *	Throw away the reference to the
4809 		 *	original object, as it won't be mapped.
4810 		 */
4811 
4812 		vm_object_deallocate(object);
4813 
4814 		if (result != KERN_SUCCESS) {
4815 			return result;
4816 		}
4817 
4818 		object   = new_object;
4819 		obj_offs = new_offset;
4820 	}
4821 
4822 	/*
4823 	 * If non-kernel users want to try to prefault pages, the mapping and prefault
4824 	 * needs to be atomic.
4825 	 */
4826 	kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4827 	vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4828 
4829 	result = vm_map_enter(target_map,
4830 	    &map_addr, map_size,
4831 	    (vm_map_offset_t)mask,
4832 	    vmk_flags,
4833 	    object, obj_offs,
4834 	    copy,
4835 	    cur_protection, max_protection,
4836 	    inheritance);
4837 	if (result != KERN_SUCCESS) {
4838 		vm_object_deallocate(object);
4839 	}
4840 
4841 	/*
4842 	 * Try to prefault, and do not forget to release the vm map lock.
4843 	 */
4844 	if (result == KERN_SUCCESS && try_prefault) {
4845 		mach_vm_address_t va = map_addr;
4846 		kern_return_t kr = KERN_SUCCESS;
4847 		unsigned int i = 0;
4848 		int pmap_options;
4849 
4850 		pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4851 
4852 		for (i = 0; i < page_list_count; ++i) {
4853 			if (!UPL_VALID_PAGE(page_list, i)) {
4854 				if (kernel_prefault) {
4855 					assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4856 					result = KERN_MEMORY_ERROR;
4857 					break;
4858 				}
4859 			} else {
4860 				/*
4861 				 * If this function call failed, we should stop
4862 				 * trying to optimize, other calls are likely
4863 				 * going to fail too.
4864 				 *
4865 				 * We are not gonna report an error for such
4866 				 * failure though. That's an optimization, not
4867 				 * something critical.
4868 				 */
4869 				kr = pmap_enter_object_options_check(target_map->pmap,
4870 				    va, 0, object, UPL_PHYS_PAGE(page_list, i),
4871 				    cur_protection, VM_PROT_NONE,
4872 				    TRUE, pmap_options);
4873 				if (kr != KERN_SUCCESS) {
4874 					OSIncrementAtomic64(&vm_prefault_nb_bailout);
4875 					if (kernel_prefault) {
4876 						result = kr;
4877 					}
4878 					break;
4879 				}
4880 				OSIncrementAtomic64(&vm_prefault_nb_pages);
4881 			}
4882 
4883 			/* Next virtual address */
4884 			va += PAGE_SIZE;
4885 		}
4886 		if (vmk_flags.vmkf_keep_map_locked) {
4887 			vm_map_unlock(target_map);
4888 		}
4889 	}
4890 
4891 out:
4892 	if (result == KERN_SUCCESS) {
4893 #if KASAN
4894 		if (target_map->pmap == kernel_pmap) {
4895 			kasan_notify_address(map_addr, map_size);
4896 		}
4897 #endif
4898 		*address_u = vm_sanitize_wrap_addr(map_addr + offset_in_mapping);
4899 	}
4900 	return result;
4901 }
4902 
4903 kern_return_t
vm_map_enter_mem_object_prefault(vm_map_t target_map,vm_map_offset_ut * address,vm_map_size_ut initial_size,vm_map_offset_ut mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_ut offset,vm_prot_ut cur_protection,vm_prot_ut max_protection,upl_page_list_ptr_t page_list,unsigned int page_list_count)4904 vm_map_enter_mem_object_prefault(
4905 	vm_map_t                target_map,
4906 	vm_map_offset_ut       *address,
4907 	vm_map_size_ut          initial_size,
4908 	vm_map_offset_ut        mask,
4909 	vm_map_kernel_flags_t   vmk_flags,
4910 	ipc_port_t              port,
4911 	vm_object_offset_ut     offset,
4912 	vm_prot_ut              cur_protection,
4913 	vm_prot_ut              max_protection,
4914 	upl_page_list_ptr_t     page_list,
4915 	unsigned int            page_list_count)
4916 {
4917 	/* range_id is set by vm_map_enter_mem_object */
4918 	return vm_map_enter_mem_object(target_map,
4919 	           address,
4920 	           initial_size,
4921 	           mask,
4922 	           vmk_flags,
4923 	           port,
4924 	           offset,
4925 	           FALSE,
4926 	           cur_protection,
4927 	           max_protection,
4928 	           VM_INHERIT_DEFAULT,
4929 	           page_list,
4930 	           page_list_count);
4931 }
4932 
4933 static __attribute__((always_inline, warn_unused_result))
4934 kern_return_t
vm_map_enter_mem_object_control_sanitize(vm_map_t target_map,vm_map_offset_ut address_u,vm_map_size_ut initial_size_u,vm_map_offset_ut mask_u,vm_object_offset_ut offset_u,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,vm_map_kernel_flags_t vmk_flags,vm_map_address_t * map_addr,vm_map_size_t * map_size,vm_map_offset_t * mask,vm_object_offset_t * obj_offs,vm_object_offset_t * obj_end,vm_object_size_t * obj_size,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t * inheritance)4935 vm_map_enter_mem_object_control_sanitize(
4936 	vm_map_t                target_map,
4937 	vm_map_offset_ut        address_u,
4938 	vm_map_size_ut          initial_size_u,
4939 	vm_map_offset_ut        mask_u,
4940 	vm_object_offset_ut     offset_u,
4941 	vm_prot_ut              cur_protection_u,
4942 	vm_prot_ut              max_protection_u,
4943 	vm_inherit_ut           inheritance_u,
4944 	vm_map_kernel_flags_t   vmk_flags,
4945 	vm_map_address_t       *map_addr,
4946 	vm_map_size_t          *map_size,
4947 	vm_map_offset_t        *mask,
4948 	vm_object_offset_t     *obj_offs,
4949 	vm_object_offset_t     *obj_end,
4950 	vm_object_size_t       *obj_size,
4951 	vm_prot_t              *cur_protection,
4952 	vm_prot_t              *max_protection,
4953 	vm_inherit_t           *inheritance)
4954 {
4955 	kern_return_t           kr;
4956 
4957 	kr = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
4958 	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
4959 	    cur_protection, max_protection);
4960 	if (__improbable(kr != KERN_SUCCESS)) {
4961 		return kr;
4962 	}
4963 
4964 	kr = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL,
4965 	    inheritance);
4966 	if (__improbable(kr != KERN_SUCCESS)) {
4967 		return kr;
4968 	}
4969 
4970 	kr = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, mask);
4971 	if (__improbable(kr != KERN_SUCCESS)) {
4972 		return kr;
4973 	}
4974 	/*
4975 	 * Ensure arithmetic doesn't overflow in vm_object space (kernel
4976 	 * pages).
4977 	 * We keep unaligned values for now. The call we eventually make to
4978 	 * vm_map_enter does guarantee that offset_u is page aligned for EITHER
4979 	 * target_map pages or kernel pages. But this isn't enough to guarantee
4980 	 * kernel space alignment.
4981 	 */
4982 	kr = vm_sanitize_addr_size(offset_u, initial_size_u,
4983 	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, PAGE_MASK,
4984 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
4985 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
4986 	    obj_offs, obj_end, obj_size);
4987 	if (__improbable(kr != KERN_SUCCESS)) {
4988 		return kr;
4989 	}
4990 
4991 	/*
4992 	 * There is no vm_sanitize_addr_size variant that also adjusts for
4993 	 * a separate offset. Rather than create one for this one-off issue,
4994 	 * we sanitize map_addr and map_size individually, relying on
4995 	 * vm_sanitize_size to incorporate the offset. Then, we perform the
4996 	 * overflow check manually below.
4997 	 */
4998 	*map_addr = vm_sanitize_addr(target_map, address_u);
4999 	kr = vm_sanitize_size(offset_u, initial_size_u,
5000 	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
5001 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
5002 	if (__improbable(kr != KERN_SUCCESS)) {
5003 		return kr;
5004 	}
5005 
5006 	/*
5007 	 * Ensure arithmetic doesn't overflow in target_map space.
5008 	 * The computation of map_size above accounts for the possibility that
5009 	 * offset_u might be unaligned in target_map space.
5010 	 */
5011 	if (vmk_flags.vmf_fixed) {
5012 		vm_map_address_t map_end;
5013 
5014 		if (__improbable(os_add_overflow(*map_addr, *map_size, &map_end))) {
5015 			return KERN_INVALID_ARGUMENT;
5016 		}
5017 	}
5018 
5019 	return KERN_SUCCESS;
5020 }
5021 
5022 kern_return_t
vm_map_enter_mem_object_control(vm_map_t target_map,vm_map_offset_ut * address_u,vm_map_size_ut initial_size_u,vm_map_offset_ut mask_u,vm_map_kernel_flags_t vmk_flags,memory_object_control_t control,vm_object_offset_ut offset_u,boolean_t needs_copy,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u)5023 vm_map_enter_mem_object_control(
5024 	vm_map_t                target_map,
5025 	vm_map_offset_ut       *address_u,
5026 	vm_map_size_ut          initial_size_u,
5027 	vm_map_offset_ut        mask_u,
5028 	vm_map_kernel_flags_t   vmk_flags,
5029 	memory_object_control_t control,
5030 	vm_object_offset_ut     offset_u,
5031 	boolean_t               needs_copy,
5032 	vm_prot_ut              cur_protection_u,
5033 	vm_prot_ut              max_protection_u,
5034 	vm_inherit_ut           inheritance_u)
5035 {
5036 	vm_map_offset_t         mask;
5037 	vm_prot_t               cur_protection;
5038 	vm_prot_t               max_protection;
5039 	vm_inherit_t            inheritance;
5040 	vm_map_address_t        map_addr;
5041 	vm_map_size_t           map_size;
5042 	vm_object_t             object;
5043 	vm_object_offset_t      obj_offs, obj_end;
5044 	vm_object_size_t        obj_size;
5045 	kern_return_t           result;
5046 	memory_object_t         pager;
5047 	vm_prot_t               pager_prot;
5048 	kern_return_t           kr;
5049 
5050 	/*
5051 	 * Check arguments for validity
5052 	 */
5053 	if (target_map == VM_MAP_NULL) {
5054 		return KERN_INVALID_ARGUMENT;
5055 	}
5056 
5057 	/*
5058 	 * We only support vmf_return_data_addr-like behavior.
5059 	 */
5060 	vmk_flags.vmf_return_data_addr = true;
5061 
5062 	/*
5063 	 * Sanitize any input parameters that are addr/size/prot/inherit
5064 	 */
5065 	kr = vm_map_enter_mem_object_control_sanitize(target_map,
5066 	    *address_u,
5067 	    initial_size_u,
5068 	    mask_u,
5069 	    offset_u,
5070 	    cur_protection_u,
5071 	    max_protection_u,
5072 	    inheritance_u,
5073 	    vmk_flags,
5074 	    &map_addr,
5075 	    &map_size,
5076 	    &mask,
5077 	    &obj_offs,
5078 	    &obj_end,
5079 	    &obj_size,
5080 	    &cur_protection,
5081 	    &max_protection,
5082 	    &inheritance);
5083 	if (__improbable(kr != KERN_SUCCESS)) {
5084 		return vm_sanitize_get_kr(kr);
5085 	}
5086 
5087 	object = memory_object_control_to_vm_object(control);
5088 
5089 	if (object == VM_OBJECT_NULL) {
5090 		return KERN_INVALID_OBJECT;
5091 	}
5092 
5093 	if (is_kernel_object(object)) {
5094 		printf("Warning: Attempt to map kernel object"
5095 		    " by a non-private kernel entity\n");
5096 		return KERN_INVALID_OBJECT;
5097 	}
5098 
5099 	vm_object_lock(object);
5100 	os_ref_retain_locked_raw(&object->ref_count, &vm_object_refgrp);
5101 
5102 
5103 	/*
5104 	 * For "named" VM objects, let the pager know that the
5105 	 * memory object is being mapped.  Some pagers need to keep
5106 	 * track of this, to know when they can reclaim the memory
5107 	 * object, for example.
5108 	 * VM calls memory_object_map() for each mapping (specifying
5109 	 * the protection of each mapping) and calls
5110 	 * memory_object_last_unmap() when all the mappings are gone.
5111 	 */
5112 	pager_prot = max_protection;
5113 	if (needs_copy) {
5114 		pager_prot &= ~VM_PROT_WRITE;
5115 	}
5116 	pager = object->pager;
5117 	if (object->named &&
5118 	    pager != MEMORY_OBJECT_NULL &&
5119 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5120 		assert(object->pager_ready);
5121 		vm_object_mapping_wait(object, THREAD_UNINT);
5122 		/* object might have lost its pager while waiting */
5123 		pager = object->pager;
5124 		if (object->named && pager != MEMORY_OBJECT_NULL) {
5125 			vm_object_mapping_begin(object);
5126 			vm_object_unlock(object);
5127 
5128 			kr = memory_object_map(pager, pager_prot);
5129 			assert(kr == KERN_SUCCESS);
5130 
5131 			vm_object_lock(object);
5132 			vm_object_mapping_end(object);
5133 		}
5134 	}
5135 	vm_object_unlock(object);
5136 
5137 	/*
5138 	 *	Perform the copy if requested
5139 	 */
5140 
5141 	if (needs_copy) {
5142 		vm_object_t             new_object;
5143 		vm_object_offset_t      new_offset;
5144 
5145 		result = vm_object_copy_strategically(object, obj_offs, obj_size,
5146 		    false,                                   /* forking */
5147 		    &new_object, &new_offset,
5148 		    &needs_copy);
5149 
5150 
5151 		if (result == KERN_MEMORY_RESTART_COPY) {
5152 			boolean_t success;
5153 			boolean_t src_needs_copy;
5154 
5155 			/*
5156 			 * XXX
5157 			 * We currently ignore src_needs_copy.
5158 			 * This really is the issue of how to make
5159 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5160 			 * non-kernel users to use. Solution forthcoming.
5161 			 * In the meantime, since we don't allow non-kernel
5162 			 * memory managers to specify symmetric copy,
5163 			 * we won't run into problems here.
5164 			 */
5165 			new_object = object;
5166 			new_offset = obj_offs;
5167 			success = vm_object_copy_quickly(new_object,
5168 			    new_offset, obj_size,
5169 			    &src_needs_copy,
5170 			    &needs_copy);
5171 			assert(success);
5172 			result = KERN_SUCCESS;
5173 		}
5174 		/*
5175 		 *	Throw away the reference to the
5176 		 *	original object, as it won't be mapped.
5177 		 */
5178 
5179 		vm_object_deallocate(object);
5180 
5181 		if (result != KERN_SUCCESS) {
5182 			return result;
5183 		}
5184 
5185 		object   = new_object;
5186 		obj_offs = new_offset;
5187 	}
5188 
5189 	result = vm_map_enter(target_map,
5190 	    &map_addr, map_size,
5191 	    (vm_map_offset_t)mask,
5192 	    vmk_flags,
5193 	    object,
5194 	    obj_offs,
5195 	    needs_copy,
5196 	    cur_protection, max_protection,
5197 	    inheritance);
5198 
5199 	if (result == KERN_SUCCESS) {
5200 		*address_u = vm_sanitize_wrap_addr(
5201 			map_addr + (obj_offs & vm_map_page_mask(target_map)));
5202 	} else {
5203 		vm_object_deallocate(object);
5204 	}
5205 
5206 	return result;
5207 }
5208 
5209 
5210 /* Not used without nested pmaps */
5211 #ifndef NO_NESTED_PMAP
5212 /*
5213  * Clip and unnest a portion of a nested submap mapping.
5214  */
5215 
5216 
5217 static void
vm_map_clip_unnest(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t start_unnest,vm_map_offset_t end_unnest)5218 vm_map_clip_unnest(
5219 	vm_map_t        map,
5220 	vm_map_entry_t  entry,
5221 	vm_map_offset_t start_unnest,
5222 	vm_map_offset_t end_unnest)
5223 {
5224 	vm_map_offset_t old_start_unnest = start_unnest;
5225 	vm_map_offset_t old_end_unnest = end_unnest;
5226 
5227 	assert(entry->is_sub_map);
5228 	assert(VME_SUBMAP(entry) != NULL);
5229 	assert(entry->use_pmap);
5230 
5231 	/*
5232 	 * Query the platform for the optimal unnest range.
5233 	 * DRK: There's some duplication of effort here, since
5234 	 * callers may have adjusted the range to some extent. This
5235 	 * routine was introduced to support 1GiB subtree nesting
5236 	 * for x86 platforms, which can also nest on 2MiB boundaries
5237 	 * depending on size/alignment.
5238 	 */
5239 	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
5240 		assert(VME_SUBMAP(entry)->is_nested_map);
5241 		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
5242 		log_unnest_badness(map,
5243 		    old_start_unnest,
5244 		    old_end_unnest,
5245 		    VME_SUBMAP(entry)->is_nested_map,
5246 		    (entry->vme_start +
5247 		    VME_SUBMAP(entry)->lowest_unnestable_start -
5248 		    VME_OFFSET(entry)));
5249 	}
5250 
5251 	if (entry->vme_start > start_unnest ||
5252 	    entry->vme_end < end_unnest) {
5253 		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
5254 		    "bad nested entry: start=0x%llx end=0x%llx\n",
5255 		    (long long)start_unnest, (long long)end_unnest,
5256 		    (long long)entry->vme_start, (long long)entry->vme_end);
5257 	}
5258 
5259 	if (start_unnest > entry->vme_start) {
5260 		_vm_map_clip_start(&map->hdr,
5261 		    entry,
5262 		    start_unnest);
5263 		if (map->holelistenabled) {
5264 			vm_map_store_update_first_free(map, NULL, FALSE);
5265 		} else {
5266 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5267 		}
5268 	}
5269 	if (entry->vme_end > end_unnest) {
5270 		_vm_map_clip_end(&map->hdr,
5271 		    entry,
5272 		    end_unnest);
5273 		if (map->holelistenabled) {
5274 			vm_map_store_update_first_free(map, NULL, FALSE);
5275 		} else {
5276 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5277 		}
5278 	}
5279 
5280 	pmap_unnest(map->pmap,
5281 	    entry->vme_start,
5282 	    entry->vme_end - entry->vme_start);
5283 	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
5284 		/* clean up parent map/maps */
5285 		vm_map_submap_pmap_clean(
5286 			map, entry->vme_start,
5287 			entry->vme_end,
5288 			VME_SUBMAP(entry),
5289 			VME_OFFSET(entry));
5290 	}
5291 	entry->use_pmap = FALSE;
5292 	if ((map->pmap != kernel_pmap) &&
5293 	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
5294 		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
5295 	}
5296 }
5297 #endif  /* NO_NESTED_PMAP */
5298 
5299 __abortlike
5300 static void
__vm_map_clip_atomic_entry_panic(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t where)5301 __vm_map_clip_atomic_entry_panic(
5302 	vm_map_t        map,
5303 	vm_map_entry_t  entry,
5304 	vm_map_offset_t where)
5305 {
5306 	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5307 	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5308 	    (uint64_t)entry->vme_start,
5309 	    (uint64_t)entry->vme_end,
5310 	    (uint64_t)where);
5311 }
5312 
5313 /*
5314  *	vm_map_clip_start:	[ internal use only ]
5315  *
5316  *	Asserts that the given entry begins at or after
5317  *	the specified address; if necessary,
5318  *	it splits the entry into two.
5319  */
5320 void
vm_map_clip_start(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t startaddr)5321 vm_map_clip_start(
5322 	vm_map_t        map,
5323 	vm_map_entry_t  entry,
5324 	vm_map_offset_t startaddr)
5325 {
5326 #ifndef NO_NESTED_PMAP
5327 	if (entry->is_sub_map &&
5328 	    entry->use_pmap &&
5329 	    startaddr >= entry->vme_start) {
5330 		vm_map_offset_t start_unnest, end_unnest;
5331 
5332 		/*
5333 		 * Make sure "startaddr" is no longer in a nested range
5334 		 * before we clip.  Unnest only the minimum range the platform
5335 		 * can handle.
5336 		 * vm_map_clip_unnest may perform additional adjustments to
5337 		 * the unnest range.
5338 		 */
5339 		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5340 		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5341 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5342 	}
5343 #endif /* NO_NESTED_PMAP */
5344 	if (startaddr > entry->vme_start) {
5345 		if (!entry->is_sub_map &&
5346 		    VME_OBJECT(entry) &&
5347 		    VME_OBJECT(entry)->phys_contiguous) {
5348 			pmap_remove(map->pmap,
5349 			    (addr64_t)(entry->vme_start),
5350 			    (addr64_t)(entry->vme_end));
5351 		}
5352 		if (entry->vme_atomic) {
5353 			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5354 		}
5355 
5356 		DTRACE_VM5(
5357 			vm_map_clip_start,
5358 			vm_map_t, map,
5359 			vm_map_offset_t, entry->vme_start,
5360 			vm_map_offset_t, entry->vme_end,
5361 			vm_map_offset_t, startaddr,
5362 			int, VME_ALIAS(entry));
5363 
5364 		_vm_map_clip_start(&map->hdr, entry, startaddr);
5365 		if (map->holelistenabled) {
5366 			vm_map_store_update_first_free(map, NULL, FALSE);
5367 		} else {
5368 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5369 		}
5370 	}
5371 }
5372 
5373 
5374 #define vm_map_copy_clip_start(copy, entry, startaddr) \
5375 	MACRO_BEGIN \
5376 	if ((startaddr) > (entry)->vme_start) \
5377 	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5378 	MACRO_END
5379 
5380 /*
5381  *	This routine is called only when it is known that
5382  *	the entry must be split.
5383  */
5384 static void
_vm_map_clip_start(struct vm_map_header * map_header,vm_map_entry_t entry,vm_map_offset_t start)5385 _vm_map_clip_start(
5386 	struct vm_map_header    *map_header,
5387 	vm_map_entry_t          entry,
5388 	vm_map_offset_t         start)
5389 {
5390 	vm_map_entry_t  new_entry;
5391 
5392 	/*
5393 	 *	Split off the front portion --
5394 	 *	note that we must insert the new
5395 	 *	entry BEFORE this one, so that
5396 	 *	this entry has the specified starting
5397 	 *	address.
5398 	 */
5399 
5400 	if (entry->map_aligned) {
5401 		assert(VM_MAP_PAGE_ALIGNED(start,
5402 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5403 	}
5404 
5405 	new_entry = _vm_map_entry_create(map_header);
5406 	vm_map_entry_copy_full(new_entry, entry);
5407 
5408 	new_entry->vme_end = start;
5409 	assert(new_entry->vme_start < new_entry->vme_end);
5410 	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5411 	if (__improbable(start >= entry->vme_end)) {
5412 		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
5413 	}
5414 	assert(start < entry->vme_end);
5415 	entry->vme_start = start;
5416 
5417 #if VM_BTLOG_TAGS
5418 	if (new_entry->vme_kernel_object) {
5419 		btref_retain(new_entry->vme_tag_btref);
5420 	}
5421 #endif /* VM_BTLOG_TAGS */
5422 
5423 	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5424 
5425 	if (entry->is_sub_map) {
5426 		vm_map_reference(VME_SUBMAP(new_entry));
5427 	} else {
5428 		vm_object_reference(VME_OBJECT(new_entry));
5429 	}
5430 }
5431 
5432 
5433 /*
5434  *	vm_map_clip_end:	[ internal use only ]
5435  *
5436  *	Asserts that the given entry ends at or before
5437  *	the specified address; if necessary,
5438  *	it splits the entry into two.
5439  */
5440 void
vm_map_clip_end(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t endaddr)5441 vm_map_clip_end(
5442 	vm_map_t        map,
5443 	vm_map_entry_t  entry,
5444 	vm_map_offset_t endaddr)
5445 {
5446 	if (endaddr > entry->vme_end) {
5447 		/*
5448 		 * Within the scope of this clipping, limit "endaddr" to
5449 		 * the end of this map entry...
5450 		 */
5451 		endaddr = entry->vme_end;
5452 	}
5453 #ifndef NO_NESTED_PMAP
5454 	if (entry->is_sub_map && entry->use_pmap) {
5455 		vm_map_offset_t start_unnest, end_unnest;
5456 
5457 		/*
5458 		 * Make sure the range between the start of this entry and
5459 		 * the new "endaddr" is no longer nested before we clip.
5460 		 * Unnest only the minimum range the platform can handle.
5461 		 * vm_map_clip_unnest may perform additional adjustments to
5462 		 * the unnest range.
5463 		 */
5464 		start_unnest = entry->vme_start;
5465 		end_unnest =
5466 		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5467 		    ~(pmap_shared_region_size_min(map->pmap) - 1);
5468 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5469 	}
5470 #endif /* NO_NESTED_PMAP */
5471 	if (endaddr < entry->vme_end) {
5472 		if (!entry->is_sub_map &&
5473 		    VME_OBJECT(entry) &&
5474 		    VME_OBJECT(entry)->phys_contiguous) {
5475 			pmap_remove(map->pmap,
5476 			    (addr64_t)(entry->vme_start),
5477 			    (addr64_t)(entry->vme_end));
5478 		}
5479 		if (entry->vme_atomic) {
5480 			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5481 		}
5482 		DTRACE_VM5(
5483 			vm_map_clip_end,
5484 			vm_map_t, map,
5485 			vm_map_offset_t, entry->vme_start,
5486 			vm_map_offset_t, entry->vme_end,
5487 			vm_map_offset_t, endaddr,
5488 			int, VME_ALIAS(entry));
5489 
5490 		_vm_map_clip_end(&map->hdr, entry, endaddr);
5491 		if (map->holelistenabled) {
5492 			vm_map_store_update_first_free(map, NULL, FALSE);
5493 		} else {
5494 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5495 		}
5496 	}
5497 }
5498 
5499 
5500 #define vm_map_copy_clip_end(copy, entry, endaddr) \
5501 	MACRO_BEGIN \
5502 	if ((endaddr) < (entry)->vme_end) \
5503 	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5504 	MACRO_END
5505 
5506 /*
5507  *	This routine is called only when it is known that
5508  *	the entry must be split.
5509  */
5510 static void
_vm_map_clip_end(struct vm_map_header * map_header,vm_map_entry_t entry,vm_map_offset_t end)5511 _vm_map_clip_end(
5512 	struct vm_map_header    *map_header,
5513 	vm_map_entry_t          entry,
5514 	vm_map_offset_t         end)
5515 {
5516 	vm_map_entry_t  new_entry;
5517 
5518 	/*
5519 	 *	Create a new entry and insert it
5520 	 *	AFTER the specified entry
5521 	 */
5522 
5523 	if (entry->map_aligned) {
5524 		assert(VM_MAP_PAGE_ALIGNED(end,
5525 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5526 	}
5527 
5528 	new_entry = _vm_map_entry_create(map_header);
5529 	vm_map_entry_copy_full(new_entry, entry);
5530 
5531 	if (__improbable(end <= entry->vme_start)) {
5532 		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
5533 	}
5534 	assert(entry->vme_start < end);
5535 	new_entry->vme_start = entry->vme_end = end;
5536 	VME_OFFSET_SET(new_entry,
5537 	    VME_OFFSET(new_entry) + (end - entry->vme_start));
5538 	assert(new_entry->vme_start < new_entry->vme_end);
5539 
5540 #if VM_BTLOG_TAGS
5541 	if (new_entry->vme_kernel_object) {
5542 		btref_retain(new_entry->vme_tag_btref);
5543 	}
5544 #endif /* VM_BTLOG_TAGS */
5545 
5546 	_vm_map_store_entry_link(map_header, entry, new_entry);
5547 
5548 	if (entry->is_sub_map) {
5549 		vm_map_reference(VME_SUBMAP(new_entry));
5550 	} else {
5551 		vm_object_reference(VME_OBJECT(new_entry));
5552 	}
5553 }
5554 
5555 
5556 /*
5557  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
5558  *
5559  *	Asserts that the starting and ending region
5560  *	addresses fall within the valid range of the map.
5561  */
5562 #define VM_MAP_RANGE_CHECK(map, start, end)     \
5563 	MACRO_BEGIN                             \
5564 	if (start < vm_map_min(map))            \
5565 	        start = vm_map_min(map);        \
5566 	if (end > vm_map_max(map))              \
5567 	        end = vm_map_max(map);          \
5568 	if (start > end)                        \
5569 	        start = end;                    \
5570 	MACRO_END
5571 
5572 /*
5573  *	vm_map_range_check:	[ internal use only ]
5574  *
5575  *	Check that the region defined by the specified start and
5576  *	end addresses are wholly contained within a single map
5577  *	entry or set of adjacent map entries of the spacified map,
5578  *	i.e. the specified region contains no unmapped space.
5579  *	If any or all of the region is unmapped, FALSE is returned.
5580  *	Otherwise, TRUE is returned and if the output argument 'entry'
5581  *	is not NULL it points to the map entry containing the start
5582  *	of the region.
5583  *
5584  *	The map is locked for reading on entry and is left locked.
5585  */
5586 static boolean_t
vm_map_range_check(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t * entry)5587 vm_map_range_check(
5588 	vm_map_t                map,
5589 	vm_map_offset_t         start,
5590 	vm_map_offset_t         end,
5591 	vm_map_entry_t          *entry)
5592 {
5593 	vm_map_entry_t          cur;
5594 	vm_map_offset_t         prev;
5595 
5596 	/*
5597 	 *      Basic sanity checks first
5598 	 */
5599 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5600 		return FALSE;
5601 	}
5602 
5603 	/*
5604 	 *      Check first if the region starts within a valid
5605 	 *	mapping for the map.
5606 	 */
5607 	if (!vm_map_lookup_entry(map, start, &cur)) {
5608 		return FALSE;
5609 	}
5610 
5611 	/*
5612 	 *	Optimize for the case that the region is contained
5613 	 *	in a single map entry.
5614 	 */
5615 	if (entry != (vm_map_entry_t *) NULL) {
5616 		*entry = cur;
5617 	}
5618 	if (end <= cur->vme_end) {
5619 		return TRUE;
5620 	}
5621 
5622 	/*
5623 	 *      If the region is not wholly contained within a
5624 	 *      single entry, walk the entries looking for holes.
5625 	 */
5626 	prev = cur->vme_end;
5627 	cur = cur->vme_next;
5628 	while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5629 		if (end <= cur->vme_end) {
5630 			return TRUE;
5631 		}
5632 		prev = cur->vme_end;
5633 		cur = cur->vme_next;
5634 	}
5635 	return FALSE;
5636 }
5637 
5638 static __attribute__((always_inline, warn_unused_result))
5639 kern_return_t
vm_map_protect_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut new_prot_u,vm_map_offset_t * start,vm_map_offset_t * end,vm_prot_t * new_prot)5640 vm_map_protect_sanitize(
5641 	vm_map_t                map,
5642 	vm_map_offset_ut        start_u,
5643 	vm_map_offset_ut        end_u,
5644 	vm_prot_ut              new_prot_u,
5645 	vm_map_offset_t        *start,
5646 	vm_map_offset_t        *end,
5647 	vm_prot_t              *new_prot)
5648 {
5649 	kern_return_t           kr;
5650 	vm_map_size_t           size;
5651 	vm_sanitize_flags_t     flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS;
5652 
5653 
5654 	kr = vm_sanitize_prot(new_prot_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5655 	    map, VM_PROT_COPY, new_prot);
5656 	if (__improbable(kr != KERN_SUCCESS)) {
5657 		return kr;
5658 	}
5659 
5660 	kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5661 	    map, flags, start, end, &size);
5662 	if (__improbable(kr != KERN_SUCCESS)) {
5663 		return kr;
5664 	}
5665 
5666 	return KERN_SUCCESS;
5667 }
5668 
5669 /*
5670  *	vm_map_protect:
5671  *
5672  *	Sets the protection of the specified address
5673  *	region in the target map.  If "set_max" is
5674  *	specified, the maximum protection is to be set;
5675  *	otherwise, only the current protection is affected.
5676  */
5677 kern_return_t
vm_map_protect(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,boolean_t set_max,vm_prot_ut new_prot_u)5678 vm_map_protect(
5679 	vm_map_t                map,
5680 	vm_map_offset_ut        start_u,
5681 	vm_map_offset_ut        end_u,
5682 	boolean_t               set_max,
5683 	vm_prot_ut              new_prot_u)
5684 {
5685 	vm_map_entry_t                  current;
5686 	vm_map_offset_t                 prev;
5687 	vm_map_entry_t                  entry;
5688 	vm_prot_t                       new_prot;
5689 	vm_prot_t                       new_max;
5690 	int                             pmap_options = 0;
5691 	kern_return_t                   kr;
5692 	vm_map_offset_t                 start, original_start;
5693 	vm_map_offset_t                 end;
5694 
5695 	kr = vm_map_protect_sanitize(map,
5696 	    start_u,
5697 	    end_u,
5698 	    new_prot_u,
5699 	    &start,
5700 	    &end,
5701 	    &new_prot);
5702 	if (__improbable(kr != KERN_SUCCESS)) {
5703 		return vm_sanitize_get_kr(kr);
5704 	}
5705 	original_start = start;
5706 
5707 	if (new_prot & VM_PROT_COPY) {
5708 		vm_map_offset_t         new_start;
5709 		vm_prot_t               cur_prot, max_prot;
5710 		vm_map_kernel_flags_t   kflags;
5711 
5712 		/* LP64todo - see below */
5713 		if (start >= map->max_offset) {
5714 			return KERN_INVALID_ADDRESS;
5715 		}
5716 
5717 		if ((new_prot & VM_PROT_ALLEXEC) &&
5718 		    map->pmap != kernel_pmap &&
5719 		    (vm_map_cs_enforcement(map)
5720 #if XNU_TARGET_OS_OSX && __arm64__
5721 		    || !VM_MAP_IS_EXOTIC(map)
5722 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
5723 		    ) &&
5724 		    VM_MAP_POLICY_WX_FAIL(map)) {
5725 			DTRACE_VM3(cs_wx,
5726 			    uint64_t, (uint64_t) start,
5727 			    uint64_t, (uint64_t) end,
5728 			    vm_prot_t, new_prot);
5729 			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5730 			    proc_selfpid(),
5731 			    (get_bsdtask_info(current_task())
5732 			    ? proc_name_address(get_bsdtask_info(current_task()))
5733 			    : "?"),
5734 			    __FUNCTION__, __LINE__,
5735 #if DEVELOPMENT || DEBUG
5736 			    (uint64_t)start,
5737 			    (uint64_t)end,
5738 #else /* DEVELOPMENT || DEBUG */
5739 			    (uint64_t)0,
5740 			    (uint64_t)0,
5741 #endif /* DEVELOPMENT || DEBUG */
5742 			    new_prot);
5743 			return KERN_PROTECTION_FAILURE;
5744 		}
5745 
5746 		/*
5747 		 * Let vm_map_remap_extract() know that it will need to:
5748 		 * + make a copy of the mapping
5749 		 * + add VM_PROT_WRITE to the max protections
5750 		 * + remove any protections that are no longer allowed from the
5751 		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
5752 		 *   example).
5753 		 * Note that "max_prot" is an IN/OUT parameter only for this
5754 		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
5755 		 * only.
5756 		 */
5757 		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
5758 		cur_prot = VM_PROT_NONE;
5759 		kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
5760 		kflags.vmkf_remap_prot_copy = true;
5761 		kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
5762 		new_start = start;
5763 		kr = vm_map_remap(map,
5764 		    vm_sanitize_wrap_addr_ref(&new_start),
5765 		    end - start,
5766 		    0, /* mask */
5767 		    kflags,
5768 		    map,
5769 		    start,
5770 		    TRUE, /* copy-on-write remapping! */
5771 		    vm_sanitize_wrap_prot_ref(&cur_prot), /* IN/OUT */
5772 		    vm_sanitize_wrap_prot_ref(&max_prot), /* IN/OUT */
5773 		    VM_INHERIT_DEFAULT);
5774 		if (kr != KERN_SUCCESS) {
5775 			return kr;
5776 		}
5777 		new_prot &= ~VM_PROT_COPY;
5778 	}
5779 
5780 	vm_map_lock(map);
5781 restart_after_unlock:
5782 
5783 	/* LP64todo - remove this check when vm_map_commpage64()
5784 	 * no longer has to stuff in a map_entry for the commpage
5785 	 * above the map's max_offset.
5786 	 */
5787 	if (start >= map->max_offset) {
5788 		vm_map_unlock(map);
5789 		return KERN_INVALID_ADDRESS;
5790 	}
5791 
5792 	while (1) {
5793 		/*
5794 		 *      Lookup the entry.  If it doesn't start in a valid
5795 		 *	entry, return an error.
5796 		 */
5797 		if (!vm_map_lookup_entry(map, start, &entry)) {
5798 			vm_map_unlock(map);
5799 			return KERN_INVALID_ADDRESS;
5800 		}
5801 
5802 		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
5803 			start = SUPERPAGE_ROUND_DOWN(start);
5804 			continue;
5805 		}
5806 		break;
5807 	}
5808 	if (entry->superpage_size) {
5809 		end = SUPERPAGE_ROUND_UP(end);
5810 	}
5811 
5812 	/*
5813 	 *	Make a first pass to check for protection and address
5814 	 *	violations.
5815 	 */
5816 
5817 	current = entry;
5818 	prev = current->vme_start;
5819 	while ((current != vm_map_to_entry(map)) &&
5820 	    (current->vme_start < end)) {
5821 		/*
5822 		 * If there is a hole, return an error.
5823 		 */
5824 		if (current->vme_start != prev) {
5825 			vm_map_unlock(map);
5826 			return KERN_INVALID_ADDRESS;
5827 		}
5828 
5829 		new_max = current->max_protection;
5830 
5831 #if defined(__x86_64__)
5832 		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
5833 		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
5834 			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
5835 		}
5836 #elif CODE_SIGNING_MONITOR
5837 		if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
5838 			new_max |= VM_PROT_EXECUTE;
5839 		}
5840 #endif
5841 		if ((new_prot & new_max) != new_prot) {
5842 			vm_map_unlock(map);
5843 			return KERN_PROTECTION_FAILURE;
5844 		}
5845 
5846 		if (current->used_for_jit &&
5847 		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
5848 			vm_map_unlock(map);
5849 			return KERN_PROTECTION_FAILURE;
5850 		}
5851 
5852 #if __arm64e__
5853 		/* Disallow protecting hw assisted TPRO mappings */
5854 		if (current->used_for_tpro) {
5855 			vm_map_unlock(map);
5856 			return KERN_PROTECTION_FAILURE;
5857 		}
5858 #endif /* __arm64e__ */
5859 
5860 
5861 		if ((new_prot & VM_PROT_WRITE) &&
5862 		    (new_prot & VM_PROT_ALLEXEC) &&
5863 #if XNU_TARGET_OS_OSX
5864 		    map->pmap != kernel_pmap &&
5865 		    (vm_map_cs_enforcement(map)
5866 #if __arm64__
5867 		    || !VM_MAP_IS_EXOTIC(map)
5868 #endif /* __arm64__ */
5869 		    ) &&
5870 #endif /* XNU_TARGET_OS_OSX */
5871 #if CODE_SIGNING_MONITOR
5872 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
5873 #endif
5874 		    !(current->used_for_jit)) {
5875 			DTRACE_VM3(cs_wx,
5876 			    uint64_t, (uint64_t) current->vme_start,
5877 			    uint64_t, (uint64_t) current->vme_end,
5878 			    vm_prot_t, new_prot);
5879 			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5880 			    proc_selfpid(),
5881 			    (get_bsdtask_info(current_task())
5882 			    ? proc_name_address(get_bsdtask_info(current_task()))
5883 			    : "?"),
5884 			    __FUNCTION__, __LINE__,
5885 #if DEVELOPMENT || DEBUG
5886 			    (uint64_t)current->vme_start,
5887 			    (uint64_t)current->vme_end,
5888 #else /* DEVELOPMENT || DEBUG */
5889 			    (uint64_t)0,
5890 			    (uint64_t)0,
5891 #endif /* DEVELOPMENT || DEBUG */
5892 			    new_prot);
5893 			new_prot &= ~VM_PROT_ALLEXEC;
5894 			if (VM_MAP_POLICY_WX_FAIL(map)) {
5895 				vm_map_unlock(map);
5896 				return KERN_PROTECTION_FAILURE;
5897 			}
5898 		}
5899 
5900 		/*
5901 		 * If the task has requested executable lockdown,
5902 		 * deny both:
5903 		 * - adding executable protections OR
5904 		 * - adding write protections to an existing executable mapping.
5905 		 */
5906 		if (map->map_disallow_new_exec == TRUE) {
5907 			if ((new_prot & VM_PROT_ALLEXEC) ||
5908 			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
5909 				vm_map_unlock(map);
5910 				return KERN_PROTECTION_FAILURE;
5911 			}
5912 		}
5913 
5914 		prev = current->vme_end;
5915 		current = current->vme_next;
5916 	}
5917 
5918 #if __arm64__
5919 	if (end > prev &&
5920 	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
5921 		vm_map_entry_t prev_entry;
5922 
5923 		prev_entry = current->vme_prev;
5924 		if (prev_entry != vm_map_to_entry(map) &&
5925 		    !prev_entry->map_aligned &&
5926 		    (vm_map_round_page(prev_entry->vme_end,
5927 		    VM_MAP_PAGE_MASK(map))
5928 		    == end)) {
5929 			/*
5930 			 * The last entry in our range is not "map-aligned"
5931 			 * but it would have reached all the way to "end"
5932 			 * if it had been map-aligned, so this is not really
5933 			 * a hole in the range and we can proceed.
5934 			 */
5935 			prev = end;
5936 		}
5937 	}
5938 #endif /* __arm64__ */
5939 
5940 	if (end > prev) {
5941 		vm_map_unlock(map);
5942 		return KERN_INVALID_ADDRESS;
5943 	}
5944 
5945 	/*
5946 	 *	Go back and fix up protections.
5947 	 *	Clip to start here if the range starts within
5948 	 *	the entry.
5949 	 */
5950 
5951 	current = entry;
5952 	if (current != vm_map_to_entry(map)) {
5953 		/* clip and unnest if necessary */
5954 		vm_map_clip_start(map, current, start);
5955 	}
5956 
5957 	while ((current != vm_map_to_entry(map)) &&
5958 	    (current->vme_start < end)) {
5959 		vm_prot_t       old_prot;
5960 
5961 		if (current->in_transition) {
5962 			wait_result_t wait_result;
5963 			vm_map_offset_t current_start;
5964 
5965 			/*
5966 			 * Another thread is wiring/unwiring this entry.
5967 			 * Let the other thread know we are waiting.
5968 			 */
5969 			current_start = current->vme_start;
5970 			current->needs_wakeup = true;
5971 			/* wait for the other thread to be done */
5972 			wait_result = vm_map_entry_wait(map, TH_UNINT);
5973 			/*
5974 			 * We unlocked the map, so anything could have changed in the
5975 			 * range and we need to re-check from "current_start" to "end".
5976 			 * Our entries might no longer be valid.
5977 			 */
5978 			current = NULL;
5979 			entry = NULL;
5980 			/*
5981 			 * Re-lookup and re-clip "current_start".
5982 			 * If it's no longer mapped,
5983 			 */
5984 			vm_map_lookup_entry_or_next(map, current_start, &current);
5985 			if (current != vm_map_to_entry(map)) {
5986 				vm_map_clip_start(map, current, current_start);
5987 			}
5988 			/* restart from this point */
5989 			start = current_start;
5990 			goto restart_after_unlock;
5991 		}
5992 
5993 		vm_map_clip_end(map, current, end);
5994 
5995 #if DEVELOPMENT || DEBUG
5996 		if (current->csm_associated && vm_log_xnu_user_debug) {
5997 			printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
5998 			    proc_selfpid(),
5999 			    (get_bsdtask_info(current_task())
6000 			    ? proc_name_address(get_bsdtask_info(current_task()))
6001 			    : "?"),
6002 			    __FUNCTION__,
6003 			    (uint64_t)start,
6004 			    (uint64_t)end,
6005 			    new_prot,
6006 			    map, current,
6007 			    current->vme_start,
6008 			    current->vme_end,
6009 			    current->protection,
6010 			    current->max_protection);
6011 		}
6012 #endif /* DEVELOPMENT || DEBUG */
6013 
6014 		if (current->is_sub_map) {
6015 			/* clipping did unnest if needed */
6016 			assert(!current->use_pmap);
6017 		}
6018 
6019 		old_prot = current->protection;
6020 
6021 		if (set_max) {
6022 			current->max_protection = new_prot;
6023 			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
6024 			current->protection = (new_prot & old_prot);
6025 		} else {
6026 			current->protection = new_prot;
6027 		}
6028 
6029 #if CODE_SIGNING_MONITOR
6030 		if (/* a !csm_associated mapping becoming executable */
6031 			((!current->csm_associated &&
6032 			!(old_prot & VM_PROT_EXECUTE) &&
6033 			(current->protection & VM_PROT_EXECUTE))
6034 			||
6035 			/* a csm_associated mapping becoming writable */
6036 			(current->csm_associated &&
6037 			!(old_prot & VM_PROT_WRITE) &&
6038 			(current->protection & VM_PROT_WRITE)))) {
6039 			/*
6040 			 * This mapping has not already been marked as
6041 			 * "user_debug" and it is either:
6042 			 * 1. not code-signing-monitored and becoming executable
6043 			 * 2. code-signing-monitored and becoming writable,
6044 			 * so inform the CodeSigningMonitor and mark the
6045 			 * mapping as "user_debug" if appropriate.
6046 			 */
6047 			vm_map_kernel_flags_t vmk_flags;
6048 			vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
6049 			/* pretend it's a vm_protect(VM_PROT_COPY)... */
6050 			vmk_flags.vmkf_remap_prot_copy = true;
6051 			kr = vm_map_entry_cs_associate(map, current, vmk_flags);
6052 #if DEVELOPMENT || DEBUG
6053 			if (vm_log_xnu_user_debug) {
6054 				printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
6055 				    proc_selfpid(),
6056 				    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
6057 				    __FUNCTION__, __LINE__,
6058 				    map, current,
6059 				    current->vme_start, current->vme_end,
6060 				    old_prot, current->protection,
6061 				    kr, current->vme_xnu_user_debug);
6062 			}
6063 #endif /* DEVELOPMENT || DEBUG */
6064 		}
6065 #endif /* CODE_SIGNING_MONITOR */
6066 
6067 		/*
6068 		 *	Update physical map if necessary.
6069 		 *	If the request is to turn off write protection,
6070 		 *	we won't do it for real (in pmap). This is because
6071 		 *	it would cause copy-on-write to fail.  We've already
6072 		 *	set, the new protection in the map, so if a
6073 		 *	write-protect fault occurred, it will be fixed up
6074 		 *	properly, COW or not.
6075 		 */
6076 		if (current->protection != old_prot) {
6077 			/* Look one level in we support nested pmaps */
6078 			/* from mapped submaps which are direct entries */
6079 			/* in our map */
6080 
6081 			vm_prot_t prot;
6082 
6083 			prot = current->protection;
6084 			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6085 				prot &= ~VM_PROT_WRITE;
6086 			} else {
6087 				assert(!VME_OBJECT(current)->code_signed);
6088 				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6089 				if (prot & VM_PROT_WRITE) {
6090 					/*
6091 					 * For write requests on the
6092 					 * compressor, we wil ask the
6093 					 * pmap layer to prevent us from
6094 					 * taking a write fault when we
6095 					 * attempt to access the mapping
6096 					 * next.
6097 					 */
6098 					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6099 				}
6100 			}
6101 
6102 			if (override_nx(map, VME_ALIAS(current)) && prot) {
6103 				prot |= VM_PROT_EXECUTE;
6104 			}
6105 
6106 #if DEVELOPMENT || DEBUG
6107 			if (!(old_prot & VM_PROT_EXECUTE) &&
6108 			    (prot & VM_PROT_EXECUTE) &&
6109 			    panic_on_unsigned_execute &&
6110 			    (proc_selfcsflags() & CS_KILL)) {
6111 				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6112 			}
6113 #endif /* DEVELOPMENT || DEBUG */
6114 
6115 			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6116 				if (current->wired_count) {
6117 					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6118 					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6119 				}
6120 
6121 				/* If the pmap layer cares about this
6122 				 * protection type, force a fault for
6123 				 * each page so that vm_fault will
6124 				 * repopulate the page with the full
6125 				 * set of protections.
6126 				 */
6127 				/*
6128 				 * TODO: We don't seem to need this,
6129 				 * but this is due to an internal
6130 				 * implementation detail of
6131 				 * pmap_protect.  Do we want to rely
6132 				 * on this?
6133 				 */
6134 				prot = VM_PROT_NONE;
6135 			}
6136 
6137 			if (current->is_sub_map && current->use_pmap) {
6138 				pmap_protect(VME_SUBMAP(current)->pmap,
6139 				    current->vme_start,
6140 				    current->vme_end,
6141 				    prot);
6142 			} else {
6143 				pmap_protect_options(map->pmap,
6144 				    current->vme_start,
6145 				    current->vme_end,
6146 				    prot,
6147 				    pmap_options,
6148 				    NULL);
6149 			}
6150 		}
6151 		current = current->vme_next;
6152 	}
6153 
6154 	if (entry == VM_MAP_ENTRY_NULL) {
6155 		/*
6156 		 * Re-lookup the original start of our range.
6157 		 * If it's no longer mapped, start with the next mapping.
6158 		 */
6159 		vm_map_lookup_entry_or_next(map, original_start, &entry);
6160 	}
6161 	current = entry;
6162 	while ((current != vm_map_to_entry(map)) &&
6163 	    (current->vme_start <= end)) {
6164 		vm_map_simplify_entry(map, current);
6165 		current = current->vme_next;
6166 	}
6167 
6168 	vm_map_unlock(map);
6169 	return KERN_SUCCESS;
6170 }
6171 
6172 static __attribute__((always_inline, warn_unused_result))
6173 kern_return_t
vm_map_inherit_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_inherit_ut new_inheritance_u,vm_map_offset_t * start,vm_map_offset_t * end,vm_inherit_t * new_inheritance)6174 vm_map_inherit_sanitize(
6175 	vm_map_t                        map,
6176 	vm_map_offset_ut                start_u,
6177 	vm_map_offset_ut                end_u,
6178 	vm_inherit_ut                   new_inheritance_u,
6179 	vm_map_offset_t                *start,
6180 	vm_map_offset_t                *end,
6181 	vm_inherit_t                   *new_inheritance)
6182 {
6183 	kern_return_t   kr;
6184 	vm_map_size_t   size;
6185 
6186 	kr = vm_sanitize_inherit(new_inheritance_u,
6187 	    VM_SANITIZE_CALLER_VM_MAP_INHERIT, new_inheritance);
6188 	if (__improbable(kr != KERN_SUCCESS)) {
6189 		return kr;
6190 	}
6191 
6192 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS;
6193 
6194 
6195 	kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_INHERIT,
6196 	    map, flags, start, end, &size);
6197 	if (__improbable(kr != KERN_SUCCESS)) {
6198 		return kr;
6199 	}
6200 
6201 	return KERN_SUCCESS;
6202 }
6203 
6204 /*
6205  *	vm_map_inherit:
6206  *
6207  *	Sets the inheritance of the specified address
6208  *	range in the target map.  Inheritance
6209  *	affects how the map will be shared with
6210  *	child maps at the time of vm_map_fork.
6211  */
6212 kern_return_t
vm_map_inherit(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_inherit_ut new_inheritance_u)6213 vm_map_inherit(
6214 	vm_map_t                        map,
6215 	vm_map_offset_ut                start_u,
6216 	vm_map_offset_ut                end_u,
6217 	vm_inherit_ut                   new_inheritance_u)
6218 {
6219 	vm_map_entry_t  entry;
6220 	vm_map_entry_t  temp_entry;
6221 	kern_return_t   kr;
6222 	vm_map_offset_t start;
6223 	vm_map_offset_t end;
6224 	vm_inherit_t    new_inheritance;
6225 
6226 	kr = vm_map_inherit_sanitize(map,
6227 	    start_u,
6228 	    end_u,
6229 	    new_inheritance_u,
6230 	    &start,
6231 	    &end,
6232 	    &new_inheritance);
6233 	if (__improbable(kr != KERN_SUCCESS)) {
6234 		return vm_sanitize_get_kr(kr);
6235 	}
6236 
6237 	vm_map_lock(map);
6238 
6239 	VM_MAP_RANGE_CHECK(map, start, end);
6240 
6241 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
6242 		entry = temp_entry;
6243 	} else {
6244 		temp_entry = temp_entry->vme_next;
6245 		entry = temp_entry;
6246 	}
6247 
6248 	/* first check entire range for entries which can't support the */
6249 	/* given inheritance. */
6250 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6251 		if (entry->is_sub_map) {
6252 			if (new_inheritance == VM_INHERIT_COPY) {
6253 				vm_map_unlock(map);
6254 				return KERN_INVALID_ARGUMENT;
6255 			}
6256 		}
6257 
6258 		entry = entry->vme_next;
6259 	}
6260 
6261 	entry = temp_entry;
6262 	if (entry != vm_map_to_entry(map)) {
6263 		/* clip and unnest if necessary */
6264 		vm_map_clip_start(map, entry, start);
6265 	}
6266 
6267 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6268 		vm_map_clip_end(map, entry, end);
6269 		if (entry->is_sub_map) {
6270 			/* clip did unnest if needed */
6271 			assert(!entry->use_pmap);
6272 		}
6273 
6274 		entry->inheritance = new_inheritance;
6275 
6276 		entry = entry->vme_next;
6277 	}
6278 
6279 	vm_map_unlock(map);
6280 	return KERN_SUCCESS;
6281 }
6282 
6283 /*
6284  * Update the accounting for the amount of wired memory in this map.  If the user has
6285  * exceeded the defined limits, then we fail.  Wiring on behalf of the kernel never fails.
6286  */
6287 
6288 static kern_return_t
add_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6289 add_wire_counts(
6290 	vm_map_t        map,
6291 	vm_map_entry_t  entry,
6292 	boolean_t       user_wire)
6293 {
6294 	vm_map_size_t   size;
6295 
6296 	bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;
6297 
6298 	if (user_wire) {
6299 		unsigned int total_wire_count =  vm_page_wire_count + vm_lopage_free_count;
6300 
6301 		/*
6302 		 * We're wiring memory at the request of the user.  Check if this is the first time the user is wiring
6303 		 * this map entry.
6304 		 */
6305 
6306 		if (entry->user_wired_count == 0) {
6307 			size = entry->vme_end - entry->vme_start;
6308 
6309 			/*
6310 			 * Since this is the first time the user is wiring this map entry, check to see if we're
6311 			 * exceeding the user wire limits.  There is a per map limit which is the smaller of either
6312 			 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value.  There is also
6313 			 * a system-wide limit on the amount of memory all users can wire.  If the user is over either
6314 			 * limit, then we fail.
6315 			 */
6316 
6317 			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6318 			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6319 				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6320 #if DEVELOPMENT || DEBUG
6321 					if (panic_on_mlock_failure) {
6322 						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6323 					}
6324 #endif /* DEVELOPMENT || DEBUG */
6325 					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6326 				} else {
6327 					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6328 #if DEVELOPMENT || DEBUG
6329 					if (panic_on_mlock_failure) {
6330 						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6331 					}
6332 #endif /* DEVELOPMENT || DEBUG */
6333 				}
6334 				return KERN_RESOURCE_SHORTAGE;
6335 			}
6336 
6337 			/*
6338 			 * The first time the user wires an entry, we also increment the wired_count and add this to
6339 			 * the total that has been wired in the map.
6340 			 */
6341 
6342 			if (entry->wired_count >= MAX_WIRE_COUNT) {
6343 				return KERN_FAILURE;
6344 			}
6345 
6346 			entry->wired_count++;
6347 			map->user_wire_size += size;
6348 		}
6349 
6350 		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6351 			return KERN_FAILURE;
6352 		}
6353 
6354 		entry->user_wired_count++;
6355 	} else {
6356 		/*
6357 		 * The kernel's wiring the memory.  Just bump the count and continue.
6358 		 */
6359 
6360 		if (entry->wired_count >= MAX_WIRE_COUNT) {
6361 			panic("vm_map_wire: too many wirings");
6362 		}
6363 
6364 		entry->wired_count++;
6365 	}
6366 
6367 	if (first_wire) {
6368 		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
6369 	}
6370 
6371 	return KERN_SUCCESS;
6372 }
6373 
6374 /*
6375  * Update the memory wiring accounting now that the given map entry is being unwired.
6376  */
6377 
6378 static void
subtract_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6379 subtract_wire_counts(
6380 	vm_map_t        map,
6381 	vm_map_entry_t  entry,
6382 	boolean_t       user_wire)
6383 {
6384 	if (user_wire) {
6385 		/*
6386 		 * We're unwiring memory at the request of the user.  See if we're removing the last user wire reference.
6387 		 */
6388 
6389 		if (entry->user_wired_count == 1) {
6390 			/*
6391 			 * We're removing the last user wire reference.  Decrement the wired_count and the total
6392 			 * user wired memory for this map.
6393 			 */
6394 
6395 			assert(entry->wired_count >= 1);
6396 			entry->wired_count--;
6397 			map->user_wire_size -= entry->vme_end - entry->vme_start;
6398 		}
6399 
6400 		assert(entry->user_wired_count >= 1);
6401 		entry->user_wired_count--;
6402 	} else {
6403 		/*
6404 		 * The kernel is unwiring the memory.   Just update the count.
6405 		 */
6406 
6407 		assert(entry->wired_count >= 1);
6408 		entry->wired_count--;
6409 	}
6410 
6411 	vme_btref_consider_and_put(entry);
6412 }
6413 
6414 int cs_executable_wire = 0;
6415 
6416 static kern_return_t
vm_map_wire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr,ppnum_t * physpage_p)6417 vm_map_wire_nested(
6418 	vm_map_t                map,
6419 	vm_map_offset_t         start,
6420 	vm_map_offset_t         end,
6421 	vm_prot_t               caller_prot,
6422 	vm_tag_t                tag,
6423 	boolean_t               user_wire,
6424 	pmap_t                  map_pmap,
6425 	vm_map_offset_t         pmap_addr,
6426 	ppnum_t                *physpage_p)
6427 {
6428 	vm_map_entry_t          entry;
6429 	vm_prot_t               access_type;
6430 	struct vm_map_entry     *first_entry, tmp_entry;
6431 	vm_map_t                real_map;
6432 	vm_map_offset_t         s, e;
6433 	kern_return_t           rc;
6434 	boolean_t               need_wakeup;
6435 	boolean_t               main_map = FALSE;
6436 	wait_interrupt_t        interruptible_state;
6437 	thread_t                cur_thread;
6438 	unsigned int            last_timestamp;
6439 	vm_map_size_t           size;
6440 	boolean_t               wire_and_extract;
6441 	vm_prot_t               extra_prots;
6442 
6443 	extra_prots = VM_PROT_COPY;
6444 	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6445 #if XNU_TARGET_OS_OSX
6446 	if (map->pmap == kernel_pmap ||
6447 	    !vm_map_cs_enforcement(map)) {
6448 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6449 	}
6450 #endif /* XNU_TARGET_OS_OSX */
6451 #if CODE_SIGNING_MONITOR
6452 	if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6453 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6454 	}
6455 #endif /* CODE_SIGNING_MONITOR */
6456 
6457 	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6458 
6459 	wire_and_extract = FALSE;
6460 	if (physpage_p != NULL) {
6461 		/*
6462 		 * The caller wants the physical page number of the
6463 		 * wired page.  We return only one physical page number
6464 		 * so this works for only one page at a time.
6465 		 *
6466 		 * The only caller (vm_map_wire_and_extract)
6467 		 * guarantees it.
6468 		 */
6469 		assert(end - start == VM_MAP_PAGE_SIZE(map));
6470 		wire_and_extract = TRUE;
6471 		*physpage_p = 0;
6472 	}
6473 
6474 	VM_MAP_RANGE_CHECK(map, start, end);
6475 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6476 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6477 	if (start == end) {
6478 		/* We wired what the caller asked for, zero pages */
6479 		return KERN_SUCCESS;
6480 	}
6481 
6482 	vm_map_lock(map);
6483 	if (map_pmap == NULL) {
6484 		main_map = TRUE;
6485 	}
6486 	last_timestamp = map->timestamp;
6487 
6488 	need_wakeup = FALSE;
6489 	cur_thread = current_thread();
6490 
6491 	s = start;
6492 	rc = KERN_SUCCESS;
6493 
6494 	if (vm_map_lookup_entry(map, s, &first_entry)) {
6495 		entry = first_entry;
6496 		/*
6497 		 * vm_map_clip_start will be done later.
6498 		 * We don't want to unnest any nested submaps here !
6499 		 */
6500 	} else {
6501 		/* Start address is not in map */
6502 		rc = KERN_INVALID_ADDRESS;
6503 		goto done;
6504 	}
6505 
6506 	while ((entry != vm_map_to_entry(map)) && (s < end)) {
6507 		/*
6508 		 * At this point, we have wired from "start" to "s".
6509 		 * We still need to wire from "s" to "end".
6510 		 *
6511 		 * "entry" hasn't been clipped, so it could start before "s"
6512 		 * and/or end after "end".
6513 		 */
6514 
6515 		/* "e" is how far we want to wire in this entry */
6516 		e = entry->vme_end;
6517 		if (e > end) {
6518 			e = end;
6519 		}
6520 
6521 		/*
6522 		 * If another thread is wiring/unwiring this entry then
6523 		 * block after informing other thread to wake us up.
6524 		 */
6525 		if (entry->in_transition) {
6526 			wait_result_t wait_result;
6527 
6528 			/*
6529 			 * We have not clipped the entry.  Make sure that
6530 			 * the start address is in range so that the lookup
6531 			 * below will succeed.
6532 			 * "s" is the current starting point: we've already
6533 			 * wired from "start" to "s" and we still have
6534 			 * to wire from "s" to "end".
6535 			 */
6536 
6537 			entry->needs_wakeup = TRUE;
6538 
6539 			/*
6540 			 * wake up anybody waiting on entries that we have
6541 			 * already wired.
6542 			 */
6543 			if (need_wakeup) {
6544 				vm_map_entry_wakeup(map);
6545 				need_wakeup = FALSE;
6546 			}
6547 			/*
6548 			 * User wiring is interruptible
6549 			 */
6550 			wait_result = vm_map_entry_wait(map,
6551 			    (user_wire) ? THREAD_ABORTSAFE :
6552 			    THREAD_UNINT);
6553 			if (user_wire && wait_result == THREAD_INTERRUPTED) {
6554 				/*
6555 				 * undo the wirings we have done so far
6556 				 * We do not clear the needs_wakeup flag,
6557 				 * because we cannot tell if we were the
6558 				 * only one waiting.
6559 				 */
6560 				rc = KERN_FAILURE;
6561 				goto done;
6562 			}
6563 
6564 			/*
6565 			 * Cannot avoid a lookup here. reset timestamp.
6566 			 */
6567 			last_timestamp = map->timestamp;
6568 
6569 			/*
6570 			 * The entry could have been clipped, look it up again.
6571 			 * Worse that can happen is, it may not exist anymore.
6572 			 */
6573 			if (!vm_map_lookup_entry(map, s, &first_entry)) {
6574 				/*
6575 				 * User: undo everything upto the previous
6576 				 * entry.  let vm_map_unwire worry about
6577 				 * checking the validity of the range.
6578 				 */
6579 				rc = KERN_FAILURE;
6580 				goto done;
6581 			}
6582 			entry = first_entry;
6583 			continue;
6584 		}
6585 
6586 		if (entry->is_sub_map) {
6587 			vm_map_offset_t sub_start;
6588 			vm_map_offset_t sub_end;
6589 			vm_map_offset_t local_start;
6590 			vm_map_offset_t local_end;
6591 			pmap_t          pmap;
6592 			vm_map_t        sub_map = VM_MAP_NULL;
6593 
6594 			if (wire_and_extract) {
6595 				/*
6596 				 * Wiring would result in copy-on-write
6597 				 * which would not be compatible with
6598 				 * the sharing we have with the original
6599 				 * provider of this memory.
6600 				 */
6601 				rc = KERN_INVALID_ARGUMENT;
6602 				goto done;
6603 			}
6604 
6605 			vm_map_clip_start(map, entry, s);
6606 			vm_map_clip_end(map, entry, end);
6607 
6608 			sub_start = VME_OFFSET(entry);
6609 			sub_end = entry->vme_end;
6610 			sub_end += VME_OFFSET(entry) - entry->vme_start;
6611 
6612 			local_end = entry->vme_end;
6613 			if (map_pmap == NULL) {
6614 				vm_object_t             object;
6615 				vm_object_offset_t      offset;
6616 				vm_prot_t               prot;
6617 				boolean_t               wired;
6618 				vm_map_entry_t          local_entry;
6619 				vm_map_version_t         version;
6620 				vm_map_t                lookup_map;
6621 
6622 				if (entry->use_pmap) {
6623 					pmap = VME_SUBMAP(entry)->pmap;
6624 					/* ppc implementation requires that */
6625 					/* submaps pmap address ranges line */
6626 					/* up with parent map */
6627 #ifdef notdef
6628 					pmap_addr = sub_start;
6629 #endif
6630 					pmap_addr = s;
6631 				} else {
6632 					pmap = map->pmap;
6633 					pmap_addr = s;
6634 				}
6635 
6636 				if (entry->wired_count) {
6637 					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6638 						goto done;
6639 					}
6640 
6641 					/*
6642 					 * The map was not unlocked:
6643 					 * no need to goto re-lookup.
6644 					 * Just go directly to next entry.
6645 					 */
6646 					entry = entry->vme_next;
6647 					s = entry->vme_start;
6648 					continue;
6649 				}
6650 
6651 				/* call vm_map_lookup_and_lock_object to */
6652 				/* cause any needs copy to be   */
6653 				/* evaluated */
6654 				local_start = entry->vme_start;
6655 				lookup_map = map;
6656 				vm_map_lock_write_to_read(map);
6657 				rc = vm_map_lookup_and_lock_object(
6658 					&lookup_map, local_start,
6659 					(access_type | extra_prots),
6660 					OBJECT_LOCK_EXCLUSIVE,
6661 					&version, &object,
6662 					&offset, &prot, &wired,
6663 					NULL,
6664 					&real_map, NULL);
6665 				if (rc != KERN_SUCCESS) {
6666 					vm_map_unlock_read(lookup_map);
6667 					assert(map_pmap == NULL);
6668 					vm_map_unwire_nested(map, start,
6669 					    s, user_wire, PMAP_NULL, 0);
6670 					return rc;
6671 				}
6672 				vm_object_unlock(object);
6673 				if (real_map != lookup_map) {
6674 					vm_map_unlock(real_map);
6675 				}
6676 				vm_map_unlock_read(lookup_map);
6677 				vm_map_lock(map);
6678 
6679 				/* we unlocked, so must re-lookup */
6680 				if (!vm_map_lookup_entry(map,
6681 				    local_start,
6682 				    &local_entry)) {
6683 					rc = KERN_FAILURE;
6684 					goto done;
6685 				}
6686 
6687 				/*
6688 				 * entry could have been "simplified",
6689 				 * so re-clip
6690 				 */
6691 				entry = local_entry;
6692 				assert(s == local_start);
6693 				vm_map_clip_start(map, entry, s);
6694 				vm_map_clip_end(map, entry, end);
6695 				/* re-compute "e" */
6696 				e = entry->vme_end;
6697 				if (e > end) {
6698 					e = end;
6699 				}
6700 
6701 				/* did we have a change of type? */
6702 				if (!entry->is_sub_map) {
6703 					last_timestamp = map->timestamp;
6704 					continue;
6705 				}
6706 			} else {
6707 				local_start = entry->vme_start;
6708 				pmap = map_pmap;
6709 			}
6710 
6711 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6712 				goto done;
6713 			}
6714 
6715 			entry->in_transition = TRUE;
6716 
6717 			sub_map = VME_SUBMAP(entry);
6718 			vm_map_reference(sub_map);
6719 			vm_map_unlock(map);
6720 			rc = vm_map_wire_nested(sub_map,
6721 			    sub_start, sub_end,
6722 			    caller_prot, tag,
6723 			    user_wire, pmap, pmap_addr,
6724 			    NULL);
6725 			vm_map_deallocate(sub_map);
6726 			sub_map = VM_MAP_NULL;
6727 			vm_map_lock(map);
6728 
6729 			/*
6730 			 * Find the entry again.  It could have been clipped
6731 			 * after we unlocked the map.
6732 			 */
6733 			if (!vm_map_lookup_entry(map, local_start,
6734 			    &first_entry)) {
6735 				panic("vm_map_wire: re-lookup failed");
6736 			}
6737 			entry = first_entry;
6738 
6739 			assert(local_start == s);
6740 			/* re-compute "e" */
6741 			e = entry->vme_end;
6742 			if (e > end) {
6743 				e = end;
6744 			}
6745 
6746 			last_timestamp = map->timestamp;
6747 			while ((entry != vm_map_to_entry(map)) &&
6748 			    (entry->vme_start < e)) {
6749 				assert(entry->in_transition);
6750 				entry->in_transition = FALSE;
6751 				if (entry->needs_wakeup) {
6752 					entry->needs_wakeup = FALSE;
6753 					need_wakeup = TRUE;
6754 				}
6755 				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6756 					subtract_wire_counts(map, entry, user_wire);
6757 				}
6758 				entry = entry->vme_next;
6759 			}
6760 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6761 				goto done;
6762 			}
6763 
6764 			/* no need to relookup again */
6765 			s = entry->vme_start;
6766 			continue;
6767 		}
6768 
6769 		/*
6770 		 * If this entry is already wired then increment
6771 		 * the appropriate wire reference count.
6772 		 */
6773 		if (entry->wired_count) {
6774 			if ((entry->protection & access_type) != access_type) {
6775 				/* found a protection problem */
6776 
6777 				/*
6778 				 * XXX FBDP
6779 				 * We should always return an error
6780 				 * in this case but since we didn't
6781 				 * enforce it before, let's do
6782 				 * it only for the new "wire_and_extract"
6783 				 * code path for now...
6784 				 */
6785 				if (wire_and_extract) {
6786 					rc = KERN_PROTECTION_FAILURE;
6787 					goto done;
6788 				}
6789 			}
6790 
6791 			/*
6792 			 * entry is already wired down, get our reference
6793 			 * after clipping to our range.
6794 			 */
6795 			vm_map_clip_start(map, entry, s);
6796 			vm_map_clip_end(map, entry, end);
6797 
6798 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6799 				goto done;
6800 			}
6801 
6802 			if (wire_and_extract) {
6803 				vm_object_t             object;
6804 				vm_object_offset_t      offset;
6805 				vm_page_t               m;
6806 
6807 				/*
6808 				 * We don't have to "wire" the page again
6809 				 * bit we still have to "extract" its
6810 				 * physical page number, after some sanity
6811 				 * checks.
6812 				 */
6813 				assert((entry->vme_end - entry->vme_start)
6814 				    == PAGE_SIZE);
6815 				assert(!entry->needs_copy);
6816 				assert(!entry->is_sub_map);
6817 				assert(VME_OBJECT(entry));
6818 				if (((entry->vme_end - entry->vme_start)
6819 				    != PAGE_SIZE) ||
6820 				    entry->needs_copy ||
6821 				    entry->is_sub_map ||
6822 				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
6823 					rc = KERN_INVALID_ARGUMENT;
6824 					goto done;
6825 				}
6826 
6827 				object = VME_OBJECT(entry);
6828 				offset = VME_OFFSET(entry);
6829 				/* need exclusive lock to update m->dirty */
6830 				if (entry->protection & VM_PROT_WRITE) {
6831 					vm_object_lock(object);
6832 				} else {
6833 					vm_object_lock_shared(object);
6834 				}
6835 				m = vm_page_lookup(object, offset);
6836 				assert(m != VM_PAGE_NULL);
6837 				assert(VM_PAGE_WIRED(m));
6838 				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6839 					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6840 					if (entry->protection & VM_PROT_WRITE) {
6841 						vm_object_lock_assert_exclusive(
6842 							object);
6843 						m->vmp_dirty = TRUE;
6844 					}
6845 				} else {
6846 					/* not already wired !? */
6847 					*physpage_p = 0;
6848 				}
6849 				vm_object_unlock(object);
6850 			}
6851 
6852 			/* map was not unlocked: no need to relookup */
6853 			entry = entry->vme_next;
6854 			s = entry->vme_start;
6855 			continue;
6856 		}
6857 
6858 		/*
6859 		 * Unwired entry or wire request transmitted via submap
6860 		 */
6861 
6862 		/*
6863 		 * Wiring would copy the pages to the shadow object.
6864 		 * The shadow object would not be code-signed so
6865 		 * attempting to execute code from these copied pages
6866 		 * would trigger a code-signing violation.
6867 		 */
6868 
6869 		if ((entry->protection & VM_PROT_EXECUTE)
6870 #if XNU_TARGET_OS_OSX
6871 		    &&
6872 		    map->pmap != kernel_pmap &&
6873 		    (vm_map_cs_enforcement(map)
6874 #if __arm64__
6875 		    || !VM_MAP_IS_EXOTIC(map)
6876 #endif /* __arm64__ */
6877 		    )
6878 #endif /* XNU_TARGET_OS_OSX */
6879 #if CODE_SIGNING_MONITOR
6880 		    &&
6881 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
6882 #endif
6883 		    ) {
6884 #if MACH_ASSERT
6885 			printf("pid %d[%s] wiring executable range from "
6886 			    "0x%llx to 0x%llx: rejected to preserve "
6887 			    "code-signing\n",
6888 			    proc_selfpid(),
6889 			    (get_bsdtask_info(current_task())
6890 			    ? proc_name_address(get_bsdtask_info(current_task()))
6891 			    : "?"),
6892 			    (uint64_t) entry->vme_start,
6893 			    (uint64_t) entry->vme_end);
6894 #endif /* MACH_ASSERT */
6895 			DTRACE_VM2(cs_executable_wire,
6896 			    uint64_t, (uint64_t)entry->vme_start,
6897 			    uint64_t, (uint64_t)entry->vme_end);
6898 			cs_executable_wire++;
6899 			rc = KERN_PROTECTION_FAILURE;
6900 			goto done;
6901 		}
6902 
6903 		/*
6904 		 * Perform actions of vm_map_lookup that need the write
6905 		 * lock on the map: create a shadow object for a
6906 		 * copy-on-write region, or an object for a zero-fill
6907 		 * region.
6908 		 */
6909 		size = entry->vme_end - entry->vme_start;
6910 		/*
6911 		 * If wiring a copy-on-write page, we need to copy it now
6912 		 * even if we're only (currently) requesting read access.
6913 		 * This is aggressive, but once it's wired we can't move it.
6914 		 */
6915 		if (entry->needs_copy) {
6916 			if (wire_and_extract) {
6917 				/*
6918 				 * We're supposed to share with the original
6919 				 * provider so should not be "needs_copy"
6920 				 */
6921 				rc = KERN_INVALID_ARGUMENT;
6922 				goto done;
6923 			}
6924 
6925 			VME_OBJECT_SHADOW(entry, size,
6926 			    vm_map_always_shadow(map));
6927 			entry->needs_copy = FALSE;
6928 		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6929 			if (wire_and_extract) {
6930 				/*
6931 				 * We're supposed to share with the original
6932 				 * provider so should already have an object.
6933 				 */
6934 				rc = KERN_INVALID_ARGUMENT;
6935 				goto done;
6936 			}
6937 			VME_OBJECT_SET(entry, vm_object_allocate(size, map->serial_id), false, 0);
6938 			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6939 			assert(entry->use_pmap);
6940 		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6941 			if (wire_and_extract) {
6942 				/*
6943 				 * We're supposed to share with the original
6944 				 * provider so should not be COPY_SYMMETRIC.
6945 				 */
6946 				rc = KERN_INVALID_ARGUMENT;
6947 				goto done;
6948 			}
6949 			/*
6950 			 * Force an unrequested "copy-on-write" but only for
6951 			 * the range we're wiring.
6952 			 */
6953 //			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
6954 			vm_map_clip_start(map, entry, s);
6955 			vm_map_clip_end(map, entry, end);
6956 			/* recompute "size" */
6957 			size = entry->vme_end - entry->vme_start;
6958 			/* make a shadow object */
6959 			vm_object_t orig_object;
6960 			vm_object_offset_t orig_offset;
6961 			orig_object = VME_OBJECT(entry);
6962 			orig_offset = VME_OFFSET(entry);
6963 			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
6964 			if (VME_OBJECT(entry) != orig_object) {
6965 				/*
6966 				 * This mapping has not been shared (or it would be
6967 				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
6968 				 * not been copied-on-write (or it would be marked
6969 				 * as "needs_copy" and would have been handled above
6970 				 * and also already write-protected).
6971 				 * We still need to write-protect here to prevent
6972 				 * other threads from modifying these pages while
6973 				 * we're in the process of copying and wiring
6974 				 * the copied pages.
6975 				 * Since the mapping is neither shared nor COWed,
6976 				 * we only need to write-protect the PTEs for this
6977 				 * mapping.
6978 				 */
6979 				vm_object_pmap_protect(orig_object,
6980 				    orig_offset,
6981 				    size,
6982 				    map->pmap,
6983 				    VM_MAP_PAGE_SIZE(map),
6984 				    entry->vme_start,
6985 				    entry->protection & ~VM_PROT_WRITE);
6986 			}
6987 		}
6988 		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6989 			/*
6990 			 * Make the object COPY_DELAY to get a stable object
6991 			 * to wire.
6992 			 * That should avoid creating long shadow chains while
6993 			 * wiring/unwiring the same range repeatedly.
6994 			 * That also prevents part of the object from being
6995 			 * wired while another part is "needs_copy", which
6996 			 * could result in conflicting rules wrt copy-on-write.
6997 			 */
6998 			vm_object_t object;
6999 
7000 			object = VME_OBJECT(entry);
7001 			vm_object_lock(object);
7002 			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7003 				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
7004 				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
7005 				    object, (uint64_t)object->vo_size,
7006 				    entry,
7007 				    (uint64_t)entry->vme_start,
7008 				    (uint64_t)entry->vme_end,
7009 				    (uint64_t)VME_OFFSET(entry),
7010 				    (uint64_t)size);
7011 				assertf(os_ref_get_count_raw(&object->ref_count) == 1,
7012 				    "object %p ref_count %d\n",
7013 				    object, os_ref_get_count_raw(&object->ref_count));
7014 				assertf(!entry->needs_copy,
7015 				    "entry %p\n", entry);
7016 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7017 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
7018 			}
7019 			vm_object_unlock(object);
7020 		}
7021 
7022 		vm_map_clip_start(map, entry, s);
7023 		vm_map_clip_end(map, entry, end);
7024 
7025 		/* re-compute "e" */
7026 		e = entry->vme_end;
7027 		if (e > end) {
7028 			e = end;
7029 		}
7030 
7031 		/*
7032 		 * Check for holes and protection mismatch.
7033 		 * Holes: Next entry should be contiguous unless this
7034 		 *	  is the end of the region.
7035 		 * Protection: Access requested must be allowed, unless
7036 		 *	wiring is by protection class
7037 		 */
7038 		if ((entry->vme_end < end) &&
7039 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7040 		    (entry->vme_next->vme_start > entry->vme_end))) {
7041 			/* found a hole */
7042 			rc = KERN_INVALID_ADDRESS;
7043 			goto done;
7044 		}
7045 		if ((entry->protection & access_type) != access_type) {
7046 			/* found a protection problem */
7047 			rc = KERN_PROTECTION_FAILURE;
7048 			goto done;
7049 		}
7050 
7051 		assert(entry->wired_count == 0 && entry->user_wired_count == 0);
7052 
7053 		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7054 			goto done;
7055 		}
7056 
7057 		entry->in_transition = TRUE;
7058 
7059 		/*
7060 		 * This entry might get split once we unlock the map.
7061 		 * In vm_fault_wire(), we need the current range as
7062 		 * defined by this entry.  In order for this to work
7063 		 * along with a simultaneous clip operation, we make a
7064 		 * temporary copy of this entry and use that for the
7065 		 * wiring.  Note that the underlying objects do not
7066 		 * change during a clip.
7067 		 */
7068 		tmp_entry = *entry;
7069 
7070 		/*
7071 		 * The in_transition state guarentees that the entry
7072 		 * (or entries for this range, if split occured) will be
7073 		 * there when the map lock is acquired for the second time.
7074 		 */
7075 		vm_map_unlock(map);
7076 
7077 		if (!user_wire && cur_thread != THREAD_NULL) {
7078 			interruptible_state = thread_interrupt_level(THREAD_UNINT);
7079 		} else {
7080 			interruptible_state = THREAD_UNINT;
7081 		}
7082 
7083 		if (map_pmap) {
7084 			rc = vm_fault_wire(map,
7085 			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7086 			    physpage_p);
7087 		} else {
7088 			rc = vm_fault_wire(map,
7089 			    &tmp_entry, caller_prot, tag, map->pmap,
7090 			    tmp_entry.vme_start,
7091 			    physpage_p);
7092 		}
7093 
7094 		if (!user_wire && cur_thread != THREAD_NULL) {
7095 			thread_interrupt_level(interruptible_state);
7096 		}
7097 
7098 		vm_map_lock(map);
7099 
7100 		if (last_timestamp + 1 != map->timestamp) {
7101 			/*
7102 			 * Find the entry again.  It could have been clipped
7103 			 * after we unlocked the map.
7104 			 */
7105 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7106 			    &first_entry)) {
7107 				panic("vm_map_wire: re-lookup failed");
7108 			}
7109 
7110 			entry = first_entry;
7111 		}
7112 
7113 		last_timestamp = map->timestamp;
7114 
7115 		while ((entry != vm_map_to_entry(map)) &&
7116 		    (entry->vme_start < tmp_entry.vme_end)) {
7117 			assert(entry->in_transition);
7118 			entry->in_transition = FALSE;
7119 			if (entry->needs_wakeup) {
7120 				entry->needs_wakeup = FALSE;
7121 				need_wakeup = TRUE;
7122 			}
7123 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
7124 				subtract_wire_counts(map, entry, user_wire);
7125 			}
7126 			entry = entry->vme_next;
7127 		}
7128 
7129 		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
7130 			goto done;
7131 		}
7132 
7133 		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7134 		    (tmp_entry.vme_end != end) &&    /* AND, we are not at the end of the requested range */
7135 		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7136 			/* found a "new" hole */
7137 			s = tmp_entry.vme_end;
7138 			rc = KERN_INVALID_ADDRESS;
7139 			goto done;
7140 		}
7141 
7142 		s = entry->vme_start;
7143 	} /* end while loop through map entries */
7144 
7145 done:
7146 	if (rc == KERN_SUCCESS) {
7147 		/* repair any damage we may have made to the VM map */
7148 		vm_map_simplify_range(map, start, end);
7149 	}
7150 
7151 	vm_map_unlock(map);
7152 
7153 	/*
7154 	 * wake up anybody waiting on entries we wired.
7155 	 */
7156 	if (need_wakeup) {
7157 		vm_map_entry_wakeup(map);
7158 	}
7159 
7160 	if (rc != KERN_SUCCESS) {
7161 		/* undo what has been wired so far */
7162 		vm_map_unwire_nested(map, start, s, user_wire,
7163 		    map_pmap, pmap_addr);
7164 		if (physpage_p) {
7165 			*physpage_p = 0;
7166 		}
7167 	}
7168 
7169 	return rc;
7170 }
7171 
7172 static __attribute__((always_inline, warn_unused_result))
7173 kern_return_t
vm_map_wire_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * start,vm_map_offset_t * end,vm_map_size_t * size,vm_prot_t * prot)7174 vm_map_wire_sanitize(
7175 	vm_map_t                map,
7176 	vm_map_offset_ut        start_u,
7177 	vm_map_offset_ut        end_u,
7178 	vm_prot_ut              prot_u,
7179 	vm_sanitize_caller_t    vm_sanitize_caller,
7180 	vm_map_offset_t        *start,
7181 	vm_map_offset_t        *end,
7182 	vm_map_size_t          *size,
7183 	vm_prot_t              *prot)
7184 {
7185 	kern_return_t   kr;
7186 
7187 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS;
7188 
7189 
7190 	kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7191 	    flags, start, end, size);
7192 	if (__improbable(kr != KERN_SUCCESS)) {
7193 		return kr;
7194 	}
7195 
7196 	kr = vm_sanitize_prot(prot_u, vm_sanitize_caller, map, prot);
7197 	if (__improbable(kr != KERN_SUCCESS)) {
7198 		return kr;
7199 	}
7200 
7201 	return KERN_SUCCESS;
7202 }
7203 
7204 /*
7205  * Validation function for vm_map_wire_nested().
7206  */
7207 kern_return_t
vm_map_wire_impl(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_tag_t tag,boolean_t user_wire,ppnum_t * physpage_p,vm_sanitize_caller_t vm_sanitize_caller)7208 vm_map_wire_impl(
7209 	vm_map_t                map,
7210 	vm_map_offset_ut        start_u,
7211 	vm_map_offset_ut        end_u,
7212 	vm_prot_ut              prot_u,
7213 	vm_tag_t                tag,
7214 	boolean_t               user_wire,
7215 	ppnum_t                *physpage_p,
7216 	vm_sanitize_caller_t    vm_sanitize_caller)
7217 {
7218 	vm_map_offset_t start, end;
7219 	vm_map_size_t   size;
7220 	vm_prot_t       prot;
7221 	kern_return_t   kr;
7222 
7223 	/*
7224 	 * Sanitize any input parameters that are addr/size/prot/inherit
7225 	 */
7226 	kr = vm_map_wire_sanitize(map,
7227 	    start_u,
7228 	    end_u,
7229 	    prot_u,
7230 	    vm_sanitize_caller,
7231 	    &start,
7232 	    &end,
7233 	    &size,
7234 	    &prot);
7235 	if (__improbable(kr != KERN_SUCCESS)) {
7236 		if (physpage_p) {
7237 			*physpage_p = 0;
7238 		}
7239 		return vm_sanitize_get_kr(kr);
7240 	}
7241 
7242 	return vm_map_wire_nested(map, start, end, prot, tag, user_wire,
7243 	           PMAP_NULL, 0, physpage_p);
7244 }
7245 
7246 kern_return_t
vm_map_wire_external(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,boolean_t user_wire)7247 vm_map_wire_external(
7248 	vm_map_t                map,
7249 	vm_map_offset_ut        start_u,
7250 	vm_map_offset_ut        end_u,
7251 	vm_prot_ut              prot_u,
7252 	boolean_t               user_wire)
7253 {
7254 	vm_tag_t tag = vm_tag_bt();
7255 
7256 	return vm_map_wire_kernel(map, start_u, end_u, prot_u, tag, user_wire);
7257 }
7258 
7259 kern_return_t
vm_map_wire_kernel(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_tag_t tag,boolean_t user_wire)7260 vm_map_wire_kernel(
7261 	vm_map_t                map,
7262 	vm_map_offset_ut        start_u,
7263 	vm_map_offset_ut        end_u,
7264 	vm_prot_ut              prot_u,
7265 	vm_tag_t                tag,
7266 	boolean_t               user_wire)
7267 {
7268 	return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7269 	           user_wire, NULL, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7270 }
7271 
7272 #if XNU_PLATFORM_MacOSX
7273 
7274 kern_return_t
vm_map_wire_and_extract(vm_map_t map,vm_map_offset_ut start_u,vm_prot_ut prot_u,boolean_t user_wire,ppnum_t * physpage_p)7275 vm_map_wire_and_extract(
7276 	vm_map_t                map,
7277 	vm_map_offset_ut        start_u,
7278 	vm_prot_ut              prot_u,
7279 	boolean_t               user_wire,
7280 	ppnum_t                *physpage_p)
7281 {
7282 	vm_tag_t         tag    = vm_tag_bt();
7283 	vm_map_size_ut   size_u = vm_sanitize_wrap_size(VM_MAP_PAGE_SIZE(map));
7284 	vm_map_offset_ut end_u  = vm_sanitize_compute_ut_end(start_u, size_u);
7285 
7286 	return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7287 	           user_wire, physpage_p, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7288 }
7289 
7290 #endif /* XNU_PLATFORM_MacOSX */
7291 
7292 static kern_return_t
vm_map_unwire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr)7293 vm_map_unwire_nested(
7294 	vm_map_t                map,
7295 	vm_map_offset_t         start,
7296 	vm_map_offset_t         end,
7297 	boolean_t               user_wire,
7298 	pmap_t                  map_pmap,
7299 	vm_map_offset_t         pmap_addr)
7300 {
7301 	vm_map_entry_t          entry;
7302 	struct vm_map_entry     *first_entry, tmp_entry;
7303 	boolean_t               need_wakeup;
7304 	boolean_t               main_map = FALSE;
7305 	unsigned int            last_timestamp;
7306 
7307 	VM_MAP_RANGE_CHECK(map, start, end);
7308 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7309 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7310 
7311 	if (start == end) {
7312 		/* We unwired what the caller asked for: zero pages */
7313 		return KERN_SUCCESS;
7314 	}
7315 
7316 	vm_map_lock(map);
7317 	if (map_pmap == NULL) {
7318 		main_map = TRUE;
7319 	}
7320 	last_timestamp = map->timestamp;
7321 
7322 	if (vm_map_lookup_entry(map, start, &first_entry)) {
7323 		entry = first_entry;
7324 		/*
7325 		 * vm_map_clip_start will be done later.
7326 		 * We don't want to unnest any nested sub maps here !
7327 		 */
7328 	} else {
7329 		if (!user_wire) {
7330 			panic("vm_map_unwire: start not found");
7331 		}
7332 		/*	Start address is not in map. */
7333 		vm_map_unlock(map);
7334 		return KERN_INVALID_ADDRESS;
7335 	}
7336 
7337 	if (entry->superpage_size) {
7338 		/* superpages are always wired */
7339 		vm_map_unlock(map);
7340 		return KERN_INVALID_ADDRESS;
7341 	}
7342 
7343 	need_wakeup = FALSE;
7344 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7345 		if (entry->in_transition) {
7346 			/*
7347 			 * 1)
7348 			 * Another thread is wiring down this entry. Note
7349 			 * that if it is not for the other thread we would
7350 			 * be unwiring an unwired entry.  This is not
7351 			 * permitted.  If we wait, we will be unwiring memory
7352 			 * we did not wire.
7353 			 *
7354 			 * 2)
7355 			 * Another thread is unwiring this entry.  We did not
7356 			 * have a reference to it, because if we did, this
7357 			 * entry will not be getting unwired now.
7358 			 */
7359 			if (!user_wire) {
7360 				/*
7361 				 * XXX FBDP
7362 				 * This could happen:  there could be some
7363 				 * overlapping vslock/vsunlock operations
7364 				 * going on.
7365 				 * We should probably just wait and retry,
7366 				 * but then we have to be careful that this
7367 				 * entry could get "simplified" after
7368 				 * "in_transition" gets unset and before
7369 				 * we re-lookup the entry, so we would
7370 				 * have to re-clip the entry to avoid
7371 				 * re-unwiring what we have already unwired...
7372 				 * See vm_map_wire_nested().
7373 				 *
7374 				 * Or we could just ignore "in_transition"
7375 				 * here and proceed to decement the wired
7376 				 * count(s) on this entry.  That should be fine
7377 				 * as long as "wired_count" doesn't drop all
7378 				 * the way to 0 (and we should panic if THAT
7379 				 * happens).
7380 				 */
7381 				panic("vm_map_unwire: in_transition entry");
7382 			}
7383 
7384 			entry = entry->vme_next;
7385 			continue;
7386 		}
7387 
7388 		if (entry->is_sub_map) {
7389 			vm_map_offset_t sub_start;
7390 			vm_map_offset_t sub_end;
7391 			vm_map_offset_t local_end;
7392 			pmap_t          pmap;
7393 			vm_map_t        sub_map = VM_MAP_NULL;
7394 
7395 			vm_map_clip_start(map, entry, start);
7396 			vm_map_clip_end(map, entry, end);
7397 
7398 			sub_start = VME_OFFSET(entry);
7399 			sub_end = entry->vme_end - entry->vme_start;
7400 			sub_end += VME_OFFSET(entry);
7401 			local_end = entry->vme_end;
7402 			if (map_pmap == NULL) {
7403 				if (entry->use_pmap) {
7404 					pmap = VME_SUBMAP(entry)->pmap;
7405 					pmap_addr = sub_start;
7406 				} else {
7407 					pmap = map->pmap;
7408 					pmap_addr = start;
7409 				}
7410 				if (entry->wired_count == 0 ||
7411 				    (user_wire && entry->user_wired_count == 0)) {
7412 					if (!user_wire) {
7413 						panic("vm_map_unwire: entry is unwired");
7414 					}
7415 					entry = entry->vme_next;
7416 					continue;
7417 				}
7418 
7419 				/*
7420 				 * Check for holes
7421 				 * Holes: Next entry should be contiguous unless
7422 				 * this is the end of the region.
7423 				 */
7424 				if (((entry->vme_end < end) &&
7425 				    ((entry->vme_next == vm_map_to_entry(map)) ||
7426 				    (entry->vme_next->vme_start
7427 				    > entry->vme_end)))) {
7428 					if (!user_wire) {
7429 						panic("vm_map_unwire: non-contiguous region");
7430 					}
7431 /*
7432  *                                       entry = entry->vme_next;
7433  *                                       continue;
7434  */
7435 				}
7436 
7437 				subtract_wire_counts(map, entry, user_wire);
7438 
7439 				if (entry->wired_count != 0) {
7440 					entry = entry->vme_next;
7441 					continue;
7442 				}
7443 
7444 				entry->in_transition = TRUE;
7445 				tmp_entry = *entry;/* see comment in vm_map_wire() */
7446 
7447 				/*
7448 				 * We can unlock the map now. The in_transition state
7449 				 * guarantees existance of the entry.
7450 				 */
7451 				sub_map = VME_SUBMAP(entry);
7452 				vm_map_reference(sub_map);
7453 				vm_map_unlock(map);
7454 				vm_map_unwire_nested(sub_map,
7455 				    sub_start, sub_end, user_wire, pmap, pmap_addr);
7456 				vm_map_deallocate(sub_map);
7457 				sub_map = VM_MAP_NULL;
7458 				vm_map_lock(map);
7459 
7460 				if (last_timestamp + 1 != map->timestamp) {
7461 					/*
7462 					 * Find the entry again.  It could have been
7463 					 * clipped or deleted after we unlocked the map.
7464 					 */
7465 					if (!vm_map_lookup_entry(map,
7466 					    tmp_entry.vme_start,
7467 					    &first_entry)) {
7468 						if (!user_wire) {
7469 							panic("vm_map_unwire: re-lookup failed");
7470 						}
7471 						entry = first_entry->vme_next;
7472 					} else {
7473 						entry = first_entry;
7474 					}
7475 				}
7476 				last_timestamp = map->timestamp;
7477 
7478 				/*
7479 				 * clear transition bit for all constituent entries
7480 				 * that were in the original entry (saved in
7481 				 * tmp_entry).  Also check for waiters.
7482 				 */
7483 				while ((entry != vm_map_to_entry(map)) &&
7484 				    (entry->vme_start < tmp_entry.vme_end)) {
7485 					assert(entry->in_transition);
7486 					entry->in_transition = FALSE;
7487 					if (entry->needs_wakeup) {
7488 						entry->needs_wakeup = FALSE;
7489 						need_wakeup = TRUE;
7490 					}
7491 					entry = entry->vme_next;
7492 				}
7493 				continue;
7494 			} else {
7495 				tmp_entry = *entry;
7496 				sub_map = VME_SUBMAP(entry);
7497 				vm_map_reference(sub_map);
7498 				vm_map_unlock(map);
7499 				vm_map_unwire_nested(sub_map,
7500 				    sub_start, sub_end, user_wire, map_pmap,
7501 				    pmap_addr);
7502 				vm_map_deallocate(sub_map);
7503 				sub_map = VM_MAP_NULL;
7504 				vm_map_lock(map);
7505 
7506 				if (last_timestamp + 1 != map->timestamp) {
7507 					/*
7508 					 * Find the entry again.  It could have been
7509 					 * clipped or deleted after we unlocked the map.
7510 					 */
7511 					if (!vm_map_lookup_entry(map,
7512 					    tmp_entry.vme_start,
7513 					    &first_entry)) {
7514 						if (!user_wire) {
7515 							panic("vm_map_unwire: re-lookup failed");
7516 						}
7517 						entry = first_entry->vme_next;
7518 					} else {
7519 						entry = first_entry;
7520 					}
7521 				}
7522 				last_timestamp = map->timestamp;
7523 			}
7524 		}
7525 
7526 
7527 		if ((entry->wired_count == 0) ||
7528 		    (user_wire && entry->user_wired_count == 0)) {
7529 			if (!user_wire) {
7530 				panic("vm_map_unwire: entry is unwired");
7531 			}
7532 
7533 			entry = entry->vme_next;
7534 			continue;
7535 		}
7536 
7537 		assert(entry->wired_count > 0 &&
7538 		    (!user_wire || entry->user_wired_count > 0));
7539 
7540 		vm_map_clip_start(map, entry, start);
7541 		vm_map_clip_end(map, entry, end);
7542 
7543 		/*
7544 		 * Check for holes
7545 		 * Holes: Next entry should be contiguous unless
7546 		 *	  this is the end of the region.
7547 		 */
7548 		if (((entry->vme_end < end) &&
7549 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7550 		    (entry->vme_next->vme_start > entry->vme_end)))) {
7551 			if (!user_wire) {
7552 				panic("vm_map_unwire: non-contiguous region");
7553 			}
7554 			/*
7555 			 * entry = entry->vme_next;
7556 			 * continue;
7557 			 */
7558 		}
7559 
7560 		subtract_wire_counts(map, entry, user_wire);
7561 
7562 		if (entry->wired_count != 0) {
7563 			entry = entry->vme_next;
7564 			continue;
7565 		}
7566 
7567 		if (entry->zero_wired_pages) {
7568 			entry->zero_wired_pages = FALSE;
7569 		}
7570 
7571 		entry->in_transition = TRUE;
7572 		tmp_entry = *entry;     /* see comment in vm_map_wire() */
7573 
7574 		/*
7575 		 * We can unlock the map now. The in_transition state
7576 		 * guarantees existance of the entry.
7577 		 */
7578 		vm_map_unlock(map);
7579 		if (map_pmap) {
7580 			vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
7581 			    pmap_addr, tmp_entry.vme_end);
7582 		} else {
7583 			vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
7584 			    tmp_entry.vme_start, tmp_entry.vme_end);
7585 		}
7586 		vm_map_lock(map);
7587 
7588 		if (last_timestamp + 1 != map->timestamp) {
7589 			/*
7590 			 * Find the entry again.  It could have been clipped
7591 			 * or deleted after we unlocked the map.
7592 			 */
7593 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7594 			    &first_entry)) {
7595 				if (!user_wire) {
7596 					panic("vm_map_unwire: re-lookup failed");
7597 				}
7598 				entry = first_entry->vme_next;
7599 			} else {
7600 				entry = first_entry;
7601 			}
7602 		}
7603 		last_timestamp = map->timestamp;
7604 
7605 		/*
7606 		 * clear transition bit for all constituent entries that
7607 		 * were in the original entry (saved in tmp_entry).  Also
7608 		 * check for waiters.
7609 		 */
7610 		while ((entry != vm_map_to_entry(map)) &&
7611 		    (entry->vme_start < tmp_entry.vme_end)) {
7612 			assert(entry->in_transition);
7613 			entry->in_transition = FALSE;
7614 			if (entry->needs_wakeup) {
7615 				entry->needs_wakeup = FALSE;
7616 				need_wakeup = TRUE;
7617 			}
7618 			entry = entry->vme_next;
7619 		}
7620 	}
7621 
7622 	/*
7623 	 * We might have fragmented the address space when we wired this
7624 	 * range of addresses.  Attempt to re-coalesce these VM map entries
7625 	 * with their neighbors now that they're no longer wired.
7626 	 * Under some circumstances, address space fragmentation can
7627 	 * prevent VM object shadow chain collapsing, which can cause
7628 	 * swap space leaks.
7629 	 */
7630 	vm_map_simplify_range(map, start, end);
7631 
7632 	vm_map_unlock(map);
7633 	/*
7634 	 * wake up anybody waiting on entries that we have unwired.
7635 	 */
7636 	if (need_wakeup) {
7637 		vm_map_entry_wakeup(map);
7638 	}
7639 	return KERN_SUCCESS;
7640 }
7641 
7642 kern_return_t
vm_map_unwire(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,boolean_t user_wire)7643 vm_map_unwire(
7644 	vm_map_t                map,
7645 	vm_map_offset_ut        start_u,
7646 	vm_map_offset_ut        end_u,
7647 	boolean_t               user_wire)
7648 {
7649 	return vm_map_unwire_impl(map, start_u, end_u, user_wire,
7650 	           VM_SANITIZE_CALLER_VM_MAP_UNWIRE);
7651 }
7652 
7653 static __attribute__((always_inline, warn_unused_result))
7654 kern_return_t
vm_map_unwire_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * start,vm_map_offset_t * end,vm_map_size_t * size)7655 vm_map_unwire_sanitize(
7656 	vm_map_t                map,
7657 	vm_map_offset_ut        start_u,
7658 	vm_map_offset_ut        end_u,
7659 	vm_sanitize_caller_t    vm_sanitize_caller,
7660 	vm_map_offset_t        *start,
7661 	vm_map_offset_t        *end,
7662 	vm_map_size_t          *size)
7663 {
7664 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS;
7665 
7666 
7667 	return vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7668 	           flags, start, end, size);
7669 }
7670 
7671 kern_return_t
vm_map_unwire_impl(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,boolean_t user_wire,vm_sanitize_caller_t vm_sanitize_caller)7672 vm_map_unwire_impl(
7673 	vm_map_t                map,
7674 	vm_map_offset_ut        start_u,
7675 	vm_map_offset_ut        end_u,
7676 	boolean_t               user_wire,
7677 	vm_sanitize_caller_t    vm_sanitize_caller)
7678 {
7679 	vm_map_offset_t start, end;
7680 	vm_map_size_t   size;
7681 	kern_return_t   kr;
7682 
7683 	/*
7684 	 * Sanitize any input parameters that are addr/size/prot/inherit
7685 	 */
7686 	kr = vm_map_unwire_sanitize(
7687 		map,
7688 		start_u,
7689 		end_u,
7690 		vm_sanitize_caller,
7691 		&start,
7692 		&end,
7693 		&size);
7694 	if (__improbable(kr != KERN_SUCCESS)) {
7695 		return vm_sanitize_get_kr(kr);
7696 	}
7697 
7698 	return vm_map_unwire_nested(map, start, end,
7699 	           user_wire, (pmap_t)NULL, 0);
7700 }
7701 
7702 
7703 /*
7704  *	vm_map_entry_zap:	[ internal use only ]
7705  *
7706  *	Remove the entry from the target map
7707  *	and put it on a zap list.
7708  */
7709 static void
vm_map_entry_zap(vm_map_t map,vm_map_entry_t entry,vm_map_zap_t zap)7710 vm_map_entry_zap(
7711 	vm_map_t                map,
7712 	vm_map_entry_t          entry,
7713 	vm_map_zap_t            zap)
7714 {
7715 	vm_map_offset_t s, e;
7716 
7717 	s = entry->vme_start;
7718 	e = entry->vme_end;
7719 	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7720 	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7721 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7722 		assert(page_aligned(s));
7723 		assert(page_aligned(e));
7724 	}
7725 	if (entry->map_aligned == TRUE) {
7726 		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7727 		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7728 	}
7729 	assert(entry->wired_count == 0);
7730 	assert(entry->user_wired_count == 0);
7731 	assert(!entry->vme_permanent);
7732 
7733 	vm_map_store_entry_unlink(map, entry, false);
7734 	map->size -= e - s;
7735 
7736 	vm_map_zap_append(zap, entry);
7737 }
7738 
7739 static void
vm_map_submap_pmap_clean(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_t sub_map,vm_map_offset_t offset)7740 vm_map_submap_pmap_clean(
7741 	vm_map_t        map,
7742 	vm_map_offset_t start,
7743 	vm_map_offset_t end,
7744 	vm_map_t        sub_map,
7745 	vm_map_offset_t offset)
7746 {
7747 	vm_map_offset_t submap_start;
7748 	vm_map_offset_t submap_end;
7749 	vm_map_size_t   remove_size;
7750 	vm_map_entry_t  entry;
7751 
7752 	submap_end = offset + (end - start);
7753 	submap_start = offset;
7754 
7755 	vm_map_lock_read(sub_map);
7756 	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
7757 		remove_size = (entry->vme_end - entry->vme_start);
7758 		if (offset > entry->vme_start) {
7759 			remove_size -= offset - entry->vme_start;
7760 		}
7761 
7762 
7763 		if (submap_end < entry->vme_end) {
7764 			remove_size -=
7765 			    entry->vme_end - submap_end;
7766 		}
7767 		if (entry->is_sub_map) {
7768 			vm_map_submap_pmap_clean(
7769 				sub_map,
7770 				start,
7771 				start + remove_size,
7772 				VME_SUBMAP(entry),
7773 				VME_OFFSET(entry));
7774 		} else {
7775 			if (map->mapped_in_other_pmaps &&
7776 			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7777 			    VME_OBJECT(entry) != NULL) {
7778 				vm_object_pmap_protect_options(
7779 					VME_OBJECT(entry),
7780 					(VME_OFFSET(entry) +
7781 					offset -
7782 					entry->vme_start),
7783 					remove_size,
7784 					PMAP_NULL,
7785 					PAGE_SIZE,
7786 					entry->vme_start,
7787 					VM_PROT_NONE,
7788 					PMAP_OPTIONS_REMOVE);
7789 			} else {
7790 				pmap_remove(map->pmap,
7791 				    (addr64_t)start,
7792 				    (addr64_t)(start + remove_size));
7793 			}
7794 		}
7795 	}
7796 
7797 	entry = entry->vme_next;
7798 
7799 	while ((entry != vm_map_to_entry(sub_map))
7800 	    && (entry->vme_start < submap_end)) {
7801 		remove_size = (entry->vme_end - entry->vme_start);
7802 		if (submap_end < entry->vme_end) {
7803 			remove_size -= entry->vme_end - submap_end;
7804 		}
7805 		if (entry->is_sub_map) {
7806 			vm_map_submap_pmap_clean(
7807 				sub_map,
7808 				(start + entry->vme_start) - offset,
7809 				((start + entry->vme_start) - offset) + remove_size,
7810 				VME_SUBMAP(entry),
7811 				VME_OFFSET(entry));
7812 		} else {
7813 			if (map->mapped_in_other_pmaps &&
7814 			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7815 			    VME_OBJECT(entry) != NULL) {
7816 				vm_object_pmap_protect_options(
7817 					VME_OBJECT(entry),
7818 					VME_OFFSET(entry),
7819 					remove_size,
7820 					PMAP_NULL,
7821 					PAGE_SIZE,
7822 					entry->vme_start,
7823 					VM_PROT_NONE,
7824 					PMAP_OPTIONS_REMOVE);
7825 			} else {
7826 				pmap_remove(map->pmap,
7827 				    (addr64_t)((start + entry->vme_start)
7828 				    - offset),
7829 				    (addr64_t)(((start + entry->vme_start)
7830 				    - offset) + remove_size));
7831 			}
7832 		}
7833 		entry = entry->vme_next;
7834 	}
7835 	vm_map_unlock_read(sub_map);
7836 	return;
7837 }
7838 
7839 /*
7840  *     virt_memory_guard_ast:
7841  *
7842  *     Handle the AST callout for a virtual memory guard.
7843  *	   raise an EXC_GUARD exception and terminate the task
7844  *     if configured to do so.
7845  */
7846 void
virt_memory_guard_ast(thread_t thread,mach_exception_data_type_t code,mach_exception_data_type_t subcode)7847 virt_memory_guard_ast(
7848 	thread_t thread,
7849 	mach_exception_data_type_t code,
7850 	mach_exception_data_type_t subcode)
7851 {
7852 	task_t task = get_threadtask(thread);
7853 	assert(task != kernel_task);
7854 	assert(task == current_task());
7855 	kern_return_t sync_exception_result;
7856 	uint32_t behavior;
7857 
7858 	behavior = task->task_exc_guard;
7859 
7860 
7861 	/* Is delivery enabled */
7862 	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7863 		return;
7864 	}
7865 
7866 	/* If only once, make sure we're that once */
7867 	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7868 		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7869 
7870 		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7871 			break;
7872 		}
7873 		behavior = task->task_exc_guard;
7874 		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7875 			return;
7876 		}
7877 	}
7878 
7879 	const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7880 	/* Raise exception synchronously and see if handler claimed it */
7881 	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);
7882 
7883 	if (fatal) {
7884 		/*
7885 		 * If Synchronous EXC_GUARD delivery was successful then
7886 		 * kill the process and return, else kill the process
7887 		 * and deliver the exception via EXC_CORPSE_NOTIFY.
7888 		 */
7889 
7890 
7891 		int flags = PX_DEBUG_NO_HONOR;
7892 		exception_info_t info = {
7893 			.os_reason = OS_REASON_GUARD,
7894 			.exception_type = EXC_GUARD,
7895 			.mx_code = code,
7896 			.mx_subcode = subcode
7897 		};
7898 
7899 		if (sync_exception_result == KERN_SUCCESS) {
7900 			flags |= PX_PSIGNAL;
7901 		}
7902 		exit_with_mach_exception(current_proc(), info, flags);
7903 	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
7904 		/*
7905 		 * If the synchronous EXC_GUARD delivery was not successful,
7906 		 * raise a simulated crash.
7907 		 */
7908 		if (sync_exception_result != KERN_SUCCESS) {
7909 			task_violated_guard(code, subcode, NULL, FALSE);
7910 		}
7911 	}
7912 }
7913 
7914 /*
7915  * Validate policy for VM guard exceptions and encode the correct Mach exception
7916  * code and subcode if the policy allows delivering a guard exception here.
7917  */
7918 static bool
vm_map_guard_exception_internal(vm_map_offset_t address,unsigned reason,mach_exception_code_t * code,mach_exception_data_type_t * subcode)7919 vm_map_guard_exception_internal(
7920 	vm_map_offset_t            address,
7921 	unsigned                   reason,
7922 	mach_exception_code_t      *code,
7923 	mach_exception_data_type_t *subcode)
7924 {
7925 	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7926 	unsigned int target = 0; /* should we pass in pid associated with map? */
7927 
7928 	task_t task = current_task_early();
7929 
7930 	/* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7931 	if (task == NULL || task == kernel_task) {
7932 		return false;
7933 	}
7934 
7935 
7936 	*code = 0;
7937 	EXC_GUARD_ENCODE_TYPE(*code, guard_type);
7938 	EXC_GUARD_ENCODE_FLAVOR(*code, reason);
7939 	EXC_GUARD_ENCODE_TARGET(*code, target);
7940 	*subcode = (uint64_t)address;
7941 
7942 	return true;
7943 }
7944 
7945 /*
7946  *     vm_map_guard_exception:
7947  *
7948  *     Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7949  *
7950  *         `reason` is kGUARD_EXC_DEALLOC_GAP when we find nothing mapped,
7951  *     or if there is a gap in the mapping when a user address space
7952  *     was requested. We report the address of the first gap found.
7953  */
7954 
7955 void
vm_map_guard_exception(vm_map_offset_t address,unsigned reason)7956 vm_map_guard_exception(
7957 	vm_map_offset_t            address,
7958 	unsigned                   reason)
7959 {
7960 	mach_exception_code_t code;
7961 	mach_exception_data_type_t subcode;
7962 	if (vm_map_guard_exception_internal(address, reason, &code, &subcode)) {
7963 		task_t task = current_task();
7964 		bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7965 
7966 		thread_guard_violation(current_thread(), code, subcode, fatal);
7967 	}
7968 }
7969 
7970 
7971 static kern_return_t
vm_map_delete_submap_recurse(vm_map_t submap,vm_map_offset_t submap_start,vm_map_offset_t submap_end)7972 vm_map_delete_submap_recurse(
7973 	vm_map_t submap,
7974 	vm_map_offset_t submap_start,
7975 	vm_map_offset_t submap_end)
7976 {
7977 	vm_map_entry_t submap_entry;
7978 
7979 	/*
7980 	 * Verify that the submap does not contain any "permanent" entries
7981 	 * within the specified range. We permit TPRO ranges to be overwritten
7982 	 * as we only reach this path if TPRO const protection is disabled for a
7983 	 * given map.
7984 	 *
7985 	 * We do not care about gaps.
7986 	 */
7987 
7988 	vm_map_lock(submap);
7989 
7990 	if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7991 		submap_entry = submap_entry->vme_next;
7992 	}
7993 
7994 	for (;
7995 	    submap_entry != vm_map_to_entry(submap) &&
7996 	    submap_entry->vme_start < submap_end;
7997 	    submap_entry = submap_entry->vme_next) {
7998 		if (submap_entry->vme_permanent
7999 #ifdef __arm64e__
8000 		    /* allow TPRO submap entries to be overwritten */
8001 		    && !submap_entry->used_for_tpro
8002 #endif
8003 		    ) {
8004 			/* "permanent" entry -> fail */
8005 			vm_map_unlock(submap);
8006 			return KERN_PROTECTION_FAILURE;
8007 		}
8008 	}
8009 	/* no "permanent" entries in the range -> success */
8010 	vm_map_unlock(submap);
8011 	return KERN_SUCCESS;
8012 }
8013 
8014 __abortlike
8015 static void
__vm_map_delete_misaligned_panic(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)8016 __vm_map_delete_misaligned_panic(
8017 	vm_map_t                map,
8018 	vm_map_offset_t         start,
8019 	vm_map_offset_t         end)
8020 {
8021 	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
8022 	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
8023 }
8024 
8025 __abortlike
8026 static void
__vm_map_delete_failed_panic(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,kern_return_t kr)8027 __vm_map_delete_failed_panic(
8028 	vm_map_t                map,
8029 	vm_map_offset_t         start,
8030 	vm_map_offset_t         end,
8031 	kern_return_t           kr)
8032 {
8033 	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
8034 	    map, (uint64_t)start, (uint64_t)end, kr);
8035 }
8036 
8037 __abortlike
8038 static void
__vm_map_delete_gap_panic(vm_map_t map,vm_map_offset_t where,vm_map_offset_t start,vm_map_offset_t end)8039 __vm_map_delete_gap_panic(
8040 	vm_map_t                map,
8041 	vm_map_offset_t         where,
8042 	vm_map_offset_t         start,
8043 	vm_map_offset_t         end)
8044 {
8045 	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
8046 	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
8047 }
8048 
8049 __abortlike
8050 static void
__vm_map_delete_permanent_panic(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t entry)8051 __vm_map_delete_permanent_panic(
8052 	vm_map_t                map,
8053 	vm_map_offset_t         start,
8054 	vm_map_offset_t         end,
8055 	vm_map_entry_t          entry)
8056 {
8057 	panic("vm_map_delete(%p,0x%llx,0x%llx): "
8058 	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
8059 	    map, (uint64_t)start, (uint64_t)end, entry,
8060 	    (uint64_t)entry->vme_start,
8061 	    (uint64_t)entry->vme_end);
8062 }
8063 
8064 __options_decl(vm_map_delete_state_t, uint32_t, {
8065 	VMDS_NONE               = 0x0000,
8066 
8067 	VMDS_FOUND_GAP          = 0x0001,
8068 	VMDS_GAPS_OK            = 0x0002,
8069 
8070 	VMDS_KERNEL_PMAP        = 0x0004,
8071 	VMDS_NEEDS_LOOKUP       = 0x0008,
8072 	VMDS_NEEDS_WAKEUP       = 0x0010,
8073 	VMDS_KERNEL_KMEMPTR     = 0x0020
8074 });
8075 
8076 /*
8077  * vm_map_clamp_to_pmap(map, start, end)
8078  *
8079  * Modify *start and *end so they fall within the bounds of map->pmap.
8080  */
8081 #if MACH_ASSERT
8082 static void
vm_map_clamp_to_pmap(vm_map_t map,vm_map_address_t * start,vm_map_address_t * end)8083 vm_map_clamp_to_pmap(vm_map_t map, vm_map_address_t *start, vm_map_address_t *end)
8084 {
8085 	vm_map_address_t min;
8086 	vm_map_address_t max;
8087 
8088 #if __x86_64__
8089 	/* x86_64 struct pmap does not have min and max fields */
8090 	if (map->pmap == kernel_pmap) {
8091 		min = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
8092 		max = VM_MAX_KERNEL_ADDRESS;
8093 	} else {
8094 		min = VM_MAP_MIN_ADDRESS;
8095 		max = VM_MAP_MAX_ADDRESS;
8096 	}
8097 #else
8098 	min = map->pmap->min;
8099 	max = map->pmap->max;
8100 #endif
8101 
8102 	if (*start < min) {
8103 		*start = min;
8104 	} else if (*start > max) {
8105 		*start = max;
8106 	}
8107 	if (*end < min) {
8108 		*end = min;
8109 	} else if (*end > max) {
8110 		*end = max;
8111 	}
8112 }
8113 #endif
8114 
8115 int vm_log_map_delete_permanent_prot_none = 0;
8116 /*
8117  *	vm_map_delete:	[ internal use only ]
8118  *
8119  *	Deallocates the given address range from the target map.
8120  *	Removes all user wirings. Unwires one kernel wiring if
8121  *	VM_MAP_REMOVE_KUNWIRE is set.  Waits for kernel wirings to go
8122  *	away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set.  Sleeps
8123  *	interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
8124  *
8125  *
8126  *	When the map is a kernel map, then any error in removing mappings
8127  *	will lead to a panic so that clients do not have to repeat the panic
8128  *	code at each call site.  If VM_MAP_REMOVE_INTERRUPTIBLE
8129  *	is also passed, then KERN_ABORTED will not lead to a panic.
8130  *
8131  *	This routine is called with map locked and leaves map locked.
8132  */
8133 static kmem_return_t
vm_map_delete(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard,vm_map_zap_t zap_list)8134 vm_map_delete(
8135 	vm_map_t                map,
8136 	vm_map_offset_t         start,
8137 	vm_map_offset_t         end,
8138 	vmr_flags_t             flags,
8139 	kmem_guard_t            guard,
8140 	vm_map_zap_t            zap_list)
8141 {
8142 	vm_map_entry_t          entry, next;
8143 	int                     interruptible;
8144 	vm_map_offset_t         gap_start = 0;
8145 	vm_map_offset_t         clear_in_transition_end = 0;
8146 	__unused vm_map_offset_t save_start = start;
8147 	__unused vm_map_offset_t save_end = end;
8148 	vm_map_delete_state_t   state = VMDS_NONE;
8149 	kmem_return_t           ret = { };
8150 	vm_map_range_id_t       range_id = 0;
8151 	struct kmem_page_meta  *meta = NULL;
8152 	uint32_t                size_idx, slot_idx;
8153 	struct mach_vm_range    slot;
8154 
8155 	if (vm_map_pmap(map) == kernel_pmap) {
8156 		state |= VMDS_KERNEL_PMAP;
8157 		range_id = kmem_addr_get_range(start, end - start);
8158 		if (kmem_is_ptr_range(range_id)) {
8159 			state |= VMDS_KERNEL_KMEMPTR;
8160 			slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8161 			    &size_idx, &slot);
8162 		}
8163 	}
8164 
8165 	if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8166 		state |= VMDS_GAPS_OK;
8167 	}
8168 
8169 	if (map->corpse_source &&
8170 	    !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8171 	    !map->terminated) {
8172 		/*
8173 		 * The map is being used for corpses related diagnostics.
8174 		 * So skip any entry removal to avoid perturbing the map state.
8175 		 * The cleanup will happen in task_terminate_internal after the
8176 		 * call to task_port_no_senders.
8177 		 */
8178 		goto out;
8179 	}
8180 
8181 	interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8182 	    THREAD_ABORTSAFE : THREAD_UNINT;
8183 
8184 	if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8185 	    (start & VM_MAP_PAGE_MASK(map))) {
8186 		__vm_map_delete_misaligned_panic(map, start, end);
8187 	}
8188 
8189 	if ((state & VMDS_GAPS_OK) == 0) {
8190 		/*
8191 		 * If the map isn't terminated then all deletions must have
8192 		 * no gaps, and be within the [min, max) of the map.
8193 		 *
8194 		 * We got here without VM_MAP_RANGE_CHECK() being called,
8195 		 * and hence must validate bounds manually.
8196 		 *
8197 		 * It is worth noting that because vm_deallocate() will
8198 		 * round_page() the deallocation size, it's possible for "end"
8199 		 * to be 0 here due to overflow. We hence must treat it as being
8200 		 * beyond vm_map_max(map).
8201 		 *
8202 		 * Similarly, end < start means some wrap around happend,
8203 		 * which should cause an error or panic.
8204 		 */
8205 		if (end == 0 || end > vm_map_max(map)) {
8206 			state |= VMDS_FOUND_GAP;
8207 			gap_start = vm_map_max(map);
8208 			if (state & VMDS_KERNEL_PMAP) {
8209 				__vm_map_delete_gap_panic(map,
8210 				    gap_start, start, end);
8211 			}
8212 			goto out;
8213 		}
8214 
8215 		if (end < start) {
8216 			if (state & VMDS_KERNEL_PMAP) {
8217 				__vm_map_delete_gap_panic(map,
8218 				    vm_map_max(map), start, end);
8219 			}
8220 			ret.kmr_return = KERN_INVALID_ARGUMENT;
8221 			goto out;
8222 		}
8223 
8224 		if (start < vm_map_min(map)) {
8225 			state |= VMDS_FOUND_GAP;
8226 			gap_start = start;
8227 			if (state & VMDS_KERNEL_PMAP) {
8228 				__vm_map_delete_gap_panic(map,
8229 				    gap_start, start, end);
8230 			}
8231 			goto out;
8232 		}
8233 	} else {
8234 		/*
8235 		 * If the map is terminated, we must accept start/end
8236 		 * being beyond the boundaries of the map as this is
8237 		 * how some of the mappings like commpage mappings
8238 		 * can be destroyed (they're outside of those bounds).
8239 		 *
8240 		 * end < start is still something we can't cope with,
8241 		 * so just bail.
8242 		 */
8243 		if (end < start) {
8244 			goto out;
8245 		}
8246 	}
8247 
8248 
8249 	/*
8250 	 *	Find the start of the region.
8251 	 *
8252 	 *	If in a superpage, extend the range
8253 	 *	to include the start of the mapping.
8254 	 */
8255 	while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8256 		if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8257 			start = SUPERPAGE_ROUND_DOWN(start);
8258 		} else {
8259 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8260 			break;
8261 		}
8262 	}
8263 
8264 	if (entry->superpage_size) {
8265 		end = SUPERPAGE_ROUND_UP(end);
8266 	}
8267 
8268 	/*
8269 	 *	Step through all entries in this region
8270 	 */
8271 	for (vm_map_offset_t s = start; s < end;) {
8272 		/*
8273 		 * At this point, we have deleted all the memory entries
8274 		 * in [start, s) and are proceeding with the [s, end) range.
8275 		 *
8276 		 * This loop might drop the map lock, and it is possible that
8277 		 * some memory was already reallocated within [start, s)
8278 		 * and we don't want to mess with those entries.
8279 		 *
8280 		 * Some of those entries could even have been re-assembled
8281 		 * with an entry after "s" (in vm_map_simplify_entry()), so
8282 		 * we may have to vm_map_clip_start() again.
8283 		 *
8284 		 * When clear_in_transition_end is set, the we had marked
8285 		 * [start, clear_in_transition_end) as "in_transition"
8286 		 * during a previous iteration and we need to clear it.
8287 		 */
8288 
8289 		/*
8290 		 * Step 1: If needed (because we dropped locks),
8291 		 *         lookup the entry again.
8292 		 *
8293 		 *         If we're coming back from unwiring (Step 5),
8294 		 *         we also need to mark the entries as no longer
8295 		 *         in transition after that.
8296 		 */
8297 
8298 		if (state & VMDS_NEEDS_LOOKUP) {
8299 			state &= ~VMDS_NEEDS_LOOKUP;
8300 
8301 			if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8302 				SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8303 			}
8304 
8305 			if (state & VMDS_KERNEL_KMEMPTR) {
8306 				kmem_validate_slot(s, meta, size_idx, slot_idx);
8307 			}
8308 		}
8309 
8310 		if (clear_in_transition_end) {
8311 			for (vm_map_entry_t it = entry;
8312 			    it != vm_map_to_entry(map) &&
8313 			    it->vme_start < clear_in_transition_end;
8314 			    it = it->vme_next) {
8315 				assert(it->in_transition);
8316 				it->in_transition = FALSE;
8317 				if (it->needs_wakeup) {
8318 					it->needs_wakeup = FALSE;
8319 					state |= VMDS_NEEDS_WAKEUP;
8320 				}
8321 			}
8322 
8323 			clear_in_transition_end = 0;
8324 		}
8325 
8326 
8327 		/*
8328 		 * Step 2: Perform various policy checks
8329 		 *         before we do _anything_ to this entry.
8330 		 */
8331 
8332 		if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8333 			if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8334 				/*
8335 				 * Either we found a gap already,
8336 				 * or we are tearing down a map,
8337 				 * keep going.
8338 				 */
8339 			} else if (state & VMDS_KERNEL_PMAP) {
8340 				__vm_map_delete_gap_panic(map, s, start, end);
8341 			} else if (s < end) {
8342 				state |= VMDS_FOUND_GAP;
8343 				gap_start = s;
8344 			}
8345 
8346 			if (entry == vm_map_to_entry(map) ||
8347 			    end <= entry->vme_start) {
8348 				break;
8349 			}
8350 
8351 			s = entry->vme_start;
8352 		}
8353 
8354 		if (state & VMDS_KERNEL_PMAP) {
8355 			/*
8356 			 * In the kernel map and its submaps,
8357 			 * permanent entries never die, even
8358 			 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8359 			 */
8360 			if (entry->vme_permanent) {
8361 				__vm_map_delete_permanent_panic(map, start, end, entry);
8362 			}
8363 
8364 			if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8365 				end = entry->vme_end;
8366 				flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8367 			}
8368 
8369 			/*
8370 			 * In the kernel map and its submaps,
8371 			 * the removal of an atomic/guarded entry is strict.
8372 			 *
8373 			 * An atomic entry is processed only if it was
8374 			 * specifically targeted.
8375 			 *
8376 			 * We might have deleted non-atomic entries before
8377 			 * we reach this this point however...
8378 			 */
8379 			kmem_entry_validate_guard(map, entry,
8380 			    start, end - start, guard);
8381 		}
8382 
8383 		/*
8384 		 * Step 2.1: handle "permanent" and "submap" entries
8385 		 * *before* clipping to avoid triggering some unnecessary
8386 		 * un-nesting of the shared region.
8387 		 */
8388 		if (entry->vme_permanent && entry->is_sub_map) {
8389 //			printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8390 			/*
8391 			 * Un-mapping a "permanent" mapping of a user-space
8392 			 * submap is not allowed unless...
8393 			 */
8394 			if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8395 				/*
8396 				 * a. explicitly requested by the kernel caller.
8397 				 */
8398 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8399 			} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8400 			    developer_mode_state()) {
8401 				/*
8402 				 * b. we're in "developer" mode (for
8403 				 *    breakpoints, dtrace probes, ...).
8404 				 */
8405 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8406 			} else if (map->terminated) {
8407 				/*
8408 				 * c. this is the final address space cleanup.
8409 				 */
8410 //				printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8411 			} else {
8412 				vm_map_offset_t submap_start, submap_end;
8413 				kern_return_t submap_kr;
8414 
8415 				/*
8416 				 * Check if there are any "permanent" mappings
8417 				 * in this range in the submap.
8418 				 */
8419 				if (entry->in_transition) {
8420 					/* can that even happen ? */
8421 					goto in_transition;
8422 				}
8423 				/* compute the clipped range in the submap */
8424 				submap_start = s - entry->vme_start;
8425 				submap_start += VME_OFFSET(entry);
8426 				submap_end = end - entry->vme_start;
8427 				submap_end += VME_OFFSET(entry);
8428 				submap_kr = vm_map_delete_submap_recurse(
8429 					VME_SUBMAP(entry),
8430 					submap_start,
8431 					submap_end);
8432 				if (submap_kr != KERN_SUCCESS) {
8433 					/*
8434 					 * There are some "permanent" mappings
8435 					 * in the submap: we are not allowed
8436 					 * to remove this range.
8437 					 */
8438 					printf("%d[%s] removing permanent submap entry "
8439 					    "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8440 					    proc_selfpid(),
8441 					    (get_bsdtask_info(current_task())
8442 					    ? proc_name_address(get_bsdtask_info(current_task()))
8443 					    : "?"), entry,
8444 					    (uint64_t)entry->vme_start,
8445 					    (uint64_t)entry->vme_end,
8446 					    entry->protection,
8447 					    entry->max_protection);
8448 					DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8449 					    vm_map_entry_t, entry,
8450 					    vm_map_offset_t, entry->vme_start,
8451 					    vm_map_offset_t, entry->vme_end,
8452 					    vm_prot_t, entry->protection,
8453 					    vm_prot_t, entry->max_protection,
8454 					    int, VME_ALIAS(entry));
8455 					ret.kmr_return = KERN_PROTECTION_FAILURE;
8456 					goto out;
8457 				}
8458 				/* no permanent mappings: proceed */
8459 			}
8460 		}
8461 
8462 		/*
8463 		 * Step 3: Perform any clipping needed.
8464 		 *
8465 		 *         After this, "entry" starts at "s", ends before "end"
8466 		 */
8467 
8468 		if (entry->vme_start < s) {
8469 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8470 			    entry->map_aligned &&
8471 			    !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8472 				/*
8473 				 * The entry will no longer be map-aligned
8474 				 * after clipping and the caller said it's OK.
8475 				 */
8476 				entry->map_aligned = FALSE;
8477 			}
8478 			vm_map_clip_start(map, entry, s);
8479 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8480 		}
8481 
8482 		if (end < entry->vme_end) {
8483 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8484 			    entry->map_aligned &&
8485 			    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8486 				/*
8487 				 * The entry will no longer be map-aligned
8488 				 * after clipping and the caller said it's OK.
8489 				 */
8490 				entry->map_aligned = FALSE;
8491 			}
8492 			vm_map_clip_end(map, entry, end);
8493 		}
8494 
8495 		if (entry->vme_permanent && entry->is_sub_map) {
8496 			/*
8497 			 * We already went through step 2.1 which did not deny
8498 			 * the removal of this "permanent" and "is_sub_map"
8499 			 * entry.
8500 			 * Now that we've clipped what we actually want to
8501 			 * delete, undo the "permanent" part to allow the
8502 			 * removal to proceed.
8503 			 */
8504 			DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8505 			    vm_map_entry_t, entry,
8506 			    vm_map_offset_t, entry->vme_start,
8507 			    vm_map_offset_t, entry->vme_end,
8508 			    vm_prot_t, entry->protection,
8509 			    vm_prot_t, entry->max_protection,
8510 			    int, VME_ALIAS(entry));
8511 			entry->vme_permanent = false;
8512 		}
8513 
8514 		assert(s == entry->vme_start);
8515 		assert(entry->vme_end <= end);
8516 
8517 
8518 		/*
8519 		 * Step 4: If the entry is in flux, wait for this to resolve.
8520 		 */
8521 
8522 		if (entry->in_transition) {
8523 			wait_result_t wait_result;
8524 
8525 in_transition:
8526 			/*
8527 			 * Another thread is wiring/unwiring this entry.
8528 			 * Let the other thread know we are waiting.
8529 			 */
8530 
8531 			entry->needs_wakeup = TRUE;
8532 
8533 			/*
8534 			 * wake up anybody waiting on entries that we have
8535 			 * already unwired/deleted.
8536 			 */
8537 			if (state & VMDS_NEEDS_WAKEUP) {
8538 				vm_map_entry_wakeup(map);
8539 				state &= ~VMDS_NEEDS_WAKEUP;
8540 			}
8541 
8542 			wait_result = vm_map_entry_wait(map, interruptible);
8543 
8544 			if (interruptible &&
8545 			    wait_result == THREAD_INTERRUPTED) {
8546 				/*
8547 				 * We do not clear the needs_wakeup flag,
8548 				 * since we cannot tell if we were the only one.
8549 				 */
8550 				ret.kmr_return = KERN_ABORTED;
8551 				return ret;
8552 			}
8553 
8554 			/*
8555 			 * The entry could have been clipped or it
8556 			 * may not exist anymore.  Look it up again.
8557 			 */
8558 			state |= VMDS_NEEDS_LOOKUP;
8559 			continue;
8560 		}
8561 
8562 
8563 		/*
8564 		 * Step 5: Handle wiring
8565 		 */
8566 
8567 		if (entry->wired_count) {
8568 			struct vm_map_entry tmp_entry;
8569 			boolean_t           user_wire;
8570 			unsigned int        last_timestamp;
8571 
8572 			user_wire = entry->user_wired_count > 0;
8573 
8574 			/*
8575 			 *      Remove a kernel wiring if requested
8576 			 */
8577 			if (flags & VM_MAP_REMOVE_KUNWIRE) {
8578 				entry->wired_count--;
8579 				vme_btref_consider_and_put(entry);
8580 			}
8581 
8582 			/*
8583 			 *	Remove all user wirings for proper accounting
8584 			 */
8585 			while (entry->user_wired_count) {
8586 				subtract_wire_counts(map, entry, user_wire);
8587 			}
8588 
8589 			/*
8590 			 * All our DMA I/O operations in IOKit are currently
8591 			 * done by wiring through the map entries of the task
8592 			 * requesting the I/O.
8593 			 *
8594 			 * Because of this, we must always wait for kernel wirings
8595 			 * to go away on the entries before deleting them.
8596 			 *
8597 			 * Any caller who wants to actually remove a kernel wiring
8598 			 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8599 			 * properly remove one wiring instead of blasting through
8600 			 * them all.
8601 			 */
8602 			if (entry->wired_count != 0) {
8603 				assert(map != kernel_map);
8604 				/*
8605 				 * Cannot continue.  Typical case is when
8606 				 * a user thread has physical io pending on
8607 				 * on this page.  Either wait for the
8608 				 * kernel wiring to go away or return an
8609 				 * error.
8610 				 */
8611 				wait_result_t wait_result;
8612 
8613 				entry->needs_wakeup = TRUE;
8614 				wait_result = vm_map_entry_wait(map,
8615 				    interruptible);
8616 
8617 				if (interruptible &&
8618 				    wait_result == THREAD_INTERRUPTED) {
8619 					/*
8620 					 * We do not clear the
8621 					 * needs_wakeup flag, since we
8622 					 * cannot tell if we were the
8623 					 * only one.
8624 					 */
8625 					ret.kmr_return = KERN_ABORTED;
8626 					return ret;
8627 				}
8628 
8629 
8630 				/*
8631 				 * The entry could have been clipped or
8632 				 * it may not exist anymore.  Look it
8633 				 * up again.
8634 				 */
8635 				state |= VMDS_NEEDS_LOOKUP;
8636 				continue;
8637 			}
8638 
8639 			/*
8640 			 * We can unlock the map now.
8641 			 *
8642 			 * The entry might be split once we unlock the map,
8643 			 * but we need the range as defined by this entry
8644 			 * to be stable. So we must make a local copy.
8645 			 *
8646 			 * The underlying objects do not change during clips,
8647 			 * and the in_transition state guarentees existence
8648 			 * of the entry.
8649 			 */
8650 			last_timestamp = map->timestamp;
8651 			entry->in_transition = TRUE;
8652 			tmp_entry = *entry;
8653 			vm_map_unlock(map);
8654 
8655 			if (tmp_entry.is_sub_map) {
8656 				vm_map_t sub_map;
8657 				vm_map_offset_t sub_start, sub_end;
8658 				pmap_t pmap;
8659 				vm_map_offset_t pmap_addr;
8660 
8661 
8662 				sub_map = VME_SUBMAP(&tmp_entry);
8663 				sub_start = VME_OFFSET(&tmp_entry);
8664 				sub_end = sub_start + (tmp_entry.vme_end -
8665 				    tmp_entry.vme_start);
8666 				if (tmp_entry.use_pmap) {
8667 					pmap = sub_map->pmap;
8668 					pmap_addr = tmp_entry.vme_start;
8669 				} else {
8670 					pmap = map->pmap;
8671 					pmap_addr = tmp_entry.vme_start;
8672 				}
8673 				(void) vm_map_unwire_nested(sub_map,
8674 				    sub_start, sub_end,
8675 				    user_wire,
8676 				    pmap, pmap_addr);
8677 			} else {
8678 				vm_map_offset_t entry_end = tmp_entry.vme_end;
8679 				vm_map_offset_t max_end;
8680 
8681 				if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8682 					max_end = end - VM_MAP_PAGE_SIZE(map);
8683 					if (entry_end > max_end) {
8684 						entry_end = max_end;
8685 					}
8686 				}
8687 
8688 				if (tmp_entry.vme_kernel_object) {
8689 					pmap_protect_options(
8690 						map->pmap,
8691 						tmp_entry.vme_start,
8692 						entry_end,
8693 						VM_PROT_NONE,
8694 						PMAP_OPTIONS_REMOVE,
8695 						NULL);
8696 				}
8697 				vm_fault_unwire(map, &tmp_entry,
8698 				    tmp_entry.vme_kernel_object, map->pmap,
8699 				    tmp_entry.vme_start, entry_end);
8700 			}
8701 
8702 			vm_map_lock(map);
8703 
8704 			/*
8705 			 * Unwiring happened, we can now go back to deleting
8706 			 * them (after we clear the in_transition bit for the range).
8707 			 */
8708 			if (last_timestamp + 1 != map->timestamp) {
8709 				state |= VMDS_NEEDS_LOOKUP;
8710 			}
8711 			clear_in_transition_end = tmp_entry.vme_end;
8712 			continue;
8713 		}
8714 
8715 		assert(entry->wired_count == 0);
8716 		assert(entry->user_wired_count == 0);
8717 
8718 
8719 		/*
8720 		 * Step 6: Entry is unwired and ready for us to delete !
8721 		 */
8722 
8723 		if (!entry->vme_permanent) {
8724 			/*
8725 			 * Typical case: the entry really shouldn't be permanent
8726 			 */
8727 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8728 		    (entry->protection & VM_PROT_EXECUTE) &&
8729 		    developer_mode_state()) {
8730 			/*
8731 			 * Allow debuggers to undo executable mappings
8732 			 * when developer mode is on.
8733 			 */
8734 #if 0
8735 			printf("FBDP %d[%s] removing permanent executable entry "
8736 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8737 			    proc_selfpid(),
8738 			    (current_task()->bsd_info
8739 			    ? proc_name_address(current_task()->bsd_info)
8740 			    : "?"), entry,
8741 			    (uint64_t)entry->vme_start,
8742 			    (uint64_t)entry->vme_end,
8743 			    entry->protection,
8744 			    entry->max_protection);
8745 #endif
8746 			entry->vme_permanent = FALSE;
8747 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8748 #if 0
8749 			printf("FBDP %d[%s] removing permanent entry "
8750 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8751 			    proc_selfpid(),
8752 			    (current_task()->bsd_info
8753 			    ? proc_name_address(current_task()->bsd_info)
8754 			    : "?"), entry,
8755 			    (uint64_t)entry->vme_start,
8756 			    (uint64_t)entry->vme_end,
8757 			    entry->protection,
8758 			    entry->max_protection);
8759 #endif
8760 			entry->vme_permanent = FALSE;
8761 #if CODE_SIGNING_MONITOR
8762 		} else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8763 			entry->vme_permanent = FALSE;
8764 
8765 			printf("%d[%s] %s(0x%llx,0x%llx): "
8766 			    "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8767 			    "prot 0x%x/0x%x\n",
8768 			    proc_selfpid(),
8769 			    (get_bsdtask_info(current_task())
8770 			    ? proc_name_address(get_bsdtask_info(current_task()))
8771 			    : "?"),
8772 			    __FUNCTION__,
8773 			    (uint64_t)start,
8774 			    (uint64_t)end,
8775 			    (uint64_t)entry->vme_start,
8776 			    (uint64_t)entry->vme_end,
8777 			    entry->protection,
8778 			    entry->max_protection);
8779 #endif
8780 		} else {
8781 			DTRACE_VM6(vm_map_delete_permanent,
8782 			    vm_map_entry_t, entry,
8783 			    vm_map_offset_t, entry->vme_start,
8784 			    vm_map_offset_t, entry->vme_end,
8785 			    vm_prot_t, entry->protection,
8786 			    vm_prot_t, entry->max_protection,
8787 			    int, VME_ALIAS(entry));
8788 		}
8789 
8790 		if (entry->is_sub_map) {
8791 			assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8792 			    "map %p (%d) entry %p submap %p (%d)\n",
8793 			    map, VM_MAP_PAGE_SHIFT(map), entry,
8794 			    VME_SUBMAP(entry),
8795 			    VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8796 			if (entry->use_pmap) {
8797 #ifndef NO_NESTED_PMAP
8798 				int pmap_flags;
8799 
8800 				if (map->terminated) {
8801 					/*
8802 					 * This is the final cleanup of the
8803 					 * address space being terminated.
8804 					 * No new mappings are expected and
8805 					 * we don't really need to unnest the
8806 					 * shared region (and lose the "global"
8807 					 * pmap mappings, if applicable).
8808 					 *
8809 					 * Tell the pmap layer that we're
8810 					 * "clean" wrt nesting.
8811 					 */
8812 					pmap_flags = PMAP_UNNEST_CLEAN;
8813 				} else {
8814 					/*
8815 					 * We're unmapping part of the nested
8816 					 * shared region, so we can't keep the
8817 					 * nested pmap.
8818 					 */
8819 					pmap_flags = 0;
8820 				}
8821 				pmap_unnest_options(
8822 					map->pmap,
8823 					(addr64_t)entry->vme_start,
8824 					entry->vme_end - entry->vme_start,
8825 					pmap_flags);
8826 #endif  /* NO_NESTED_PMAP */
8827 				if (map->mapped_in_other_pmaps &&
8828 				    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8829 					/* clean up parent map/maps */
8830 					vm_map_submap_pmap_clean(
8831 						map, entry->vme_start,
8832 						entry->vme_end,
8833 						VME_SUBMAP(entry),
8834 						VME_OFFSET(entry));
8835 				}
8836 			} else {
8837 				vm_map_submap_pmap_clean(
8838 					map, entry->vme_start, entry->vme_end,
8839 					VME_SUBMAP(entry),
8840 					VME_OFFSET(entry));
8841 			}
8842 		} else if (entry->vme_kernel_object ||
8843 		    VME_OBJECT(entry) == compressor_object) {
8844 			/*
8845 			 * nothing to do
8846 			 */
8847 		} else if (map->mapped_in_other_pmaps &&
8848 		    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8849 			vm_object_pmap_protect_options(
8850 				VME_OBJECT(entry), VME_OFFSET(entry),
8851 				entry->vme_end - entry->vme_start,
8852 				PMAP_NULL,
8853 				PAGE_SIZE,
8854 				entry->vme_start,
8855 				VM_PROT_NONE,
8856 				PMAP_OPTIONS_REMOVE);
8857 		} else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8858 		    (state & VMDS_KERNEL_PMAP)) {
8859 			/* Remove translations associated
8860 			 * with this range unless the entry
8861 			 * does not have an object, or
8862 			 * it's the kernel map or a descendant
8863 			 * since the platform could potentially
8864 			 * create "backdoor" mappings invisible
8865 			 * to the VM. It is expected that
8866 			 * objectless, non-kernel ranges
8867 			 * do not have such VM invisible
8868 			 * translations.
8869 			 */
8870 			vm_map_address_t remove_start = entry->vme_start;
8871 			vm_map_address_t remove_end = entry->vme_end;
8872 #if MACH_ASSERT
8873 			/*
8874 			 * Prevent panics in pmap_remove() from some vm test code
8875 			 * which uses virtual address ranges that pmap disallows.
8876 			 */
8877 			if (thread_get_test_option(test_option_vm_map_clamp_pmap_remove)) {
8878 				vm_map_clamp_to_pmap(map, &remove_start, &remove_end);
8879 			}
8880 #endif /* MACH_ASSERT */
8881 			pmap_remove(map->pmap, remove_start, remove_end);
8882 		}
8883 
8884 #if DEBUG
8885 		/*
8886 		 * All pmap mappings for this map entry must have been
8887 		 * cleared by now.
8888 		 */
8889 		assert(pmap_is_empty(map->pmap,
8890 		    entry->vme_start,
8891 		    entry->vme_end));
8892 #endif /* DEBUG */
8893 
8894 		if (entry->iokit_acct) {
8895 			/* alternate accounting */
8896 			DTRACE_VM4(vm_map_iokit_unmapped_region,
8897 			    vm_map_t, map,
8898 			    vm_map_offset_t, entry->vme_start,
8899 			    vm_map_offset_t, entry->vme_end,
8900 			    int, VME_ALIAS(entry));
8901 			vm_map_iokit_unmapped_region(map,
8902 			    (entry->vme_end -
8903 			    entry->vme_start));
8904 			entry->iokit_acct = FALSE;
8905 			entry->use_pmap = FALSE;
8906 		}
8907 
8908 		/* move "s" forward */
8909 		s    = entry->vme_end;
8910 		next = entry->vme_next;
8911 		if (!entry->map_aligned) {
8912 			vm_map_offset_t rounded_s;
8913 
8914 			/*
8915 			 * Skip artificial gap due to mis-aligned entry
8916 			 * on devices with a page size smaller than the
8917 			 * map's page size (i.e. 16k task on a 4k device).
8918 			 */
8919 			rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8920 			if (next == vm_map_to_entry(map)) {
8921 				s = rounded_s;
8922 			} else if (s < rounded_s) {
8923 				s = MIN(rounded_s, next->vme_start);
8924 			}
8925 		}
8926 		ret.kmr_size += s - entry->vme_start;
8927 
8928 		if (entry->vme_permanent) {
8929 			/*
8930 			 * A permanent entry can not be removed, so leave it
8931 			 * in place but remove all access permissions.
8932 			 */
8933 			if (__improbable(vm_log_map_delete_permanent_prot_none)) {
8934 				printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8935 				    __FUNCTION__, __LINE__,
8936 				    proc_selfpid(),
8937 				    (get_bsdtask_info(current_task())
8938 				    ? proc_name_address(get_bsdtask_info(current_task()))
8939 				    : "?"),
8940 				    map,
8941 				    entry,
8942 				    (uint64_t)entry->vme_start,
8943 				    (uint64_t)entry->vme_end,
8944 				    entry->is_sub_map,
8945 				    entry->protection,
8946 				    entry->max_protection);
8947 			}
8948 			DTRACE_VM6(vm_map_delete_permanent_prot_none,
8949 			    vm_map_entry_t, entry,
8950 			    vm_map_offset_t, entry->vme_start,
8951 			    vm_map_offset_t, entry->vme_end,
8952 			    vm_prot_t, entry->protection,
8953 			    vm_prot_t, entry->max_protection,
8954 			    int, VME_ALIAS(entry));
8955 			entry->protection = VM_PROT_NONE;
8956 			entry->max_protection = VM_PROT_NONE;
8957 #ifdef __arm64e__
8958 			entry->used_for_tpro = FALSE;
8959 #endif
8960 		} else {
8961 			vm_map_entry_zap(map, entry, zap_list);
8962 		}
8963 
8964 		entry = next;
8965 		next  = VM_MAP_ENTRY_NULL;
8966 
8967 		if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8968 			unsigned int last_timestamp = map->timestamp++;
8969 
8970 			if (lck_rw_lock_yield_exclusive(&map->lock,
8971 			    LCK_RW_YIELD_ANY_WAITER)) {
8972 				if (last_timestamp != map->timestamp + 1) {
8973 					state |= VMDS_NEEDS_LOOKUP;
8974 				}
8975 			} else {
8976 				/* we didn't yield, undo our change */
8977 				map->timestamp--;
8978 			}
8979 		}
8980 	}
8981 
8982 	if (map->wait_for_space) {
8983 		thread_wakeup((event_t) map);
8984 	}
8985 
8986 	if (state & VMDS_NEEDS_WAKEUP) {
8987 		vm_map_entry_wakeup(map);
8988 	}
8989 
8990 out:
8991 	if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8992 		__vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8993 	}
8994 
8995 	if (state & VMDS_KERNEL_KMEMPTR) {
8996 		kmem_free_space(start, end, range_id, &slot);
8997 	}
8998 
8999 	if (state & VMDS_FOUND_GAP) {
9000 		DTRACE_VM3(kern_vm_deallocate_gap,
9001 		    vm_map_offset_t, gap_start,
9002 		    vm_map_offset_t, save_start,
9003 		    vm_map_offset_t, save_end);
9004 		if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
9005 			ret.kmr_return = KERN_INVALID_VALUE;
9006 		} else {
9007 			vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
9008 		}
9009 	}
9010 
9011 	return ret;
9012 }
9013 
9014 kmem_return_t
vm_map_remove_and_unlock(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)9015 vm_map_remove_and_unlock(
9016 	vm_map_t        map,
9017 	vm_map_offset_t start,
9018 	vm_map_offset_t end,
9019 	vmr_flags_t     flags,
9020 	kmem_guard_t    guard)
9021 {
9022 	kmem_return_t ret;
9023 	VM_MAP_ZAP_DECLARE(zap);
9024 
9025 	ret = vm_map_delete(map, start, end, flags, guard, &zap);
9026 	vm_map_unlock(map);
9027 
9028 	vm_map_zap_dispose(&zap);
9029 
9030 	return ret;
9031 }
9032 
9033 /*
9034  *	vm_map_remove_guard:
9035  *
9036  *	Remove the given address range from the target map.
9037  *	This is the exported form of vm_map_delete.
9038  */
9039 kmem_return_t
vm_map_remove_guard(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)9040 vm_map_remove_guard(
9041 	vm_map_t        map,
9042 	vm_map_offset_t start,
9043 	vm_map_offset_t end,
9044 	vmr_flags_t     flags,
9045 	kmem_guard_t    guard)
9046 {
9047 	vm_map_lock(map);
9048 	return vm_map_remove_and_unlock(map, start, end, flags, guard);
9049 }
9050 
9051 
9052 /*
9053  *  vm_map_setup:
9054  *
9055  *  Perform any required setup on a new task's map. Must be called before the task
9056  *  is enabled for IPC access, since after this point other threads may be able
9057  *  to look up the task port and make VM API calls.
9058  */
9059 void
vm_map_setup(vm_map_t map,task_t task)9060 vm_map_setup(vm_map_t map, task_t task)
9061 {
9062 	/*
9063 	 * map does NOT take a reference on owning_task. If the map has terminated,
9064 	 * it is possible that the pointer is NULL, so reads of owning_task must
9065 	 * happen under the map lock and explicitly check for NULL.
9066 	 */
9067 	vm_map_lock(map);
9068 	assert(!map->owning_task);
9069 	map->owning_task = task;
9070 	vm_map_unlock(map);
9071 #if CONFIG_DEFERRED_RECLAIM
9072 	vm_deferred_reclamation_metadata_t vdrm = task->deferred_reclamation_metadata;
9073 	if (vdrm) {
9074 		vm_deferred_reclamation_task_fork_register(vdrm);
9075 	}
9076 #endif /* CONFIG_DEFERRED_RECLAIM */
9077 }
9078 
9079 /*
9080  *	vm_map_terminate:
9081  *
9082  *	Clean out a task's map.
9083  */
9084 kern_return_t
vm_map_terminate(vm_map_t map)9085 vm_map_terminate(
9086 	vm_map_t        map)
9087 {
9088 	vm_map_lock(map);
9089 	map->terminated = TRUE;
9090 	map->owning_task = NULL;
9091 	vm_map_disable_hole_optimization(map);
9092 	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
9093 	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
9094 	return KERN_SUCCESS;
9095 }
9096 
9097 /*
9098  *	Routine:	vm_map_copy_allocate
9099  *
9100  *	Description:
9101  *		Allocates and initializes a map copy object.
9102  */
9103 static vm_map_copy_t
vm_map_copy_allocate(uint16_t type)9104 vm_map_copy_allocate(uint16_t type)
9105 {
9106 	vm_map_copy_t new_copy;
9107 
9108 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
9109 	new_copy->type = type;
9110 	if (type == VM_MAP_COPY_ENTRY_LIST) {
9111 		new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
9112 		vm_map_store_init(&new_copy->cpy_hdr);
9113 	}
9114 	return new_copy;
9115 }
9116 
9117 /*
9118  *	Routine:	vm_map_copy_discard
9119  *
9120  *	Description:
9121  *		Dispose of a map copy object (returned by
9122  *		vm_map_copyin).
9123  */
9124 void
vm_map_copy_discard(vm_map_copy_t copy)9125 vm_map_copy_discard(
9126 	vm_map_copy_t   copy)
9127 {
9128 	if (copy == VM_MAP_COPY_NULL) {
9129 		return;
9130 	}
9131 
9132 	/*
9133 	 * Assert that the vm_map_copy is coming from the right
9134 	 * zone and hasn't been forged
9135 	 */
9136 	vm_map_copy_require(copy);
9137 
9138 	switch (copy->type) {
9139 	case VM_MAP_COPY_ENTRY_LIST:
9140 		while (vm_map_copy_first_entry(copy) !=
9141 		    vm_map_copy_to_entry(copy)) {
9142 			vm_map_entry_t  entry = vm_map_copy_first_entry(copy);
9143 
9144 			vm_map_copy_entry_unlink(copy, entry);
9145 			if (entry->is_sub_map) {
9146 				vm_map_deallocate(VME_SUBMAP(entry));
9147 			} else {
9148 				vm_object_deallocate(VME_OBJECT(entry));
9149 			}
9150 			vm_map_copy_entry_dispose(entry);
9151 		}
9152 		break;
9153 	case VM_MAP_COPY_KERNEL_BUFFER:
9154 
9155 		/*
9156 		 * The vm_map_copy_t and possibly the data buffer were
9157 		 * allocated by a single call to kalloc_data(), i.e. the
9158 		 * vm_map_copy_t was not allocated out of the zone.
9159 		 */
9160 		if (copy->size > msg_ool_size_small || copy->offset) {
9161 			panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9162 			    (long long)copy->size, (long long)copy->offset);
9163 		}
9164 		kfree_data(copy->cpy_kdata, copy->size);
9165 	}
9166 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9167 }
9168 
9169 #if XNU_PLATFORM_MacOSX
9170 
9171 __exported
9172 extern vm_map_copy_t vm_map_copy_copy(vm_map_copy_t copy);
9173 
9174 /*
9175  *	Routine:	vm_map_copy_copy
9176  *
9177  *	Description:
9178  *			Move the information in a map copy object to
9179  *			a new map copy object, leaving the old one
9180  *			empty.
9181  *
9182  *			This is used by kernel routines that need
9183  *			to look at out-of-line data (in copyin form)
9184  *			before deciding whether to return SUCCESS.
9185  *			If the routine returns FAILURE, the original
9186  *			copy object will be deallocated; therefore,
9187  *			these routines must make a copy of the copy
9188  *			object and leave the original empty so that
9189  *			deallocation will not fail.
9190  */
9191 vm_map_copy_t
vm_map_copy_copy(vm_map_copy_t copy)9192 vm_map_copy_copy(
9193 	vm_map_copy_t   copy)
9194 {
9195 	vm_map_copy_t   new_copy;
9196 
9197 	if (copy == VM_MAP_COPY_NULL) {
9198 		return VM_MAP_COPY_NULL;
9199 	}
9200 
9201 	/*
9202 	 * Assert that the vm_map_copy is coming from the right
9203 	 * zone and hasn't been forged
9204 	 */
9205 	vm_map_copy_require(copy);
9206 
9207 	/*
9208 	 * Allocate a new copy object, and copy the information
9209 	 * from the old one into it.
9210 	 */
9211 
9212 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9213 	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
9214 #if __has_feature(ptrauth_calls)
9215 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9216 		new_copy->cpy_kdata = copy->cpy_kdata;
9217 	}
9218 #endif
9219 
9220 	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
9221 		/*
9222 		 * The links in the entry chain must be
9223 		 * changed to point to the new copy object.
9224 		 */
9225 		vm_map_copy_first_entry(copy)->vme_prev
9226 		        = vm_map_copy_to_entry(new_copy);
9227 		vm_map_copy_last_entry(copy)->vme_next
9228 		        = vm_map_copy_to_entry(new_copy);
9229 	}
9230 
9231 	/*
9232 	 * Change the old copy object into one that contains
9233 	 * nothing to be deallocated.
9234 	 */
9235 	bzero(copy, sizeof(struct vm_map_copy));
9236 	copy->type = VM_MAP_COPY_KERNEL_BUFFER;
9237 
9238 	/*
9239 	 * Return the new object.
9240 	 */
9241 	return new_copy;
9242 }
9243 
9244 #endif /* XNU_PLATFORM_MacOSX */
9245 
9246 static boolean_t
vm_map_entry_is_overwritable(vm_map_t dst_map __unused,vm_map_entry_t entry)9247 vm_map_entry_is_overwritable(
9248 	vm_map_t        dst_map __unused,
9249 	vm_map_entry_t  entry)
9250 {
9251 	if (!(entry->protection & VM_PROT_WRITE)) {
9252 		/* can't overwrite if not writable */
9253 		return FALSE;
9254 	}
9255 #if !__x86_64__
9256 	if (entry->used_for_jit &&
9257 	    vm_map_cs_enforcement(dst_map) &&
9258 	    !dst_map->cs_debugged) {
9259 		/*
9260 		 * Can't overwrite a JIT region while cs_enforced
9261 		 * and not cs_debugged.
9262 		 */
9263 		return FALSE;
9264 	}
9265 
9266 #if __arm64e__
9267 	/* Do not allow overwrite HW assisted TPRO entries */
9268 	if (entry->used_for_tpro) {
9269 		return FALSE;
9270 	}
9271 #endif /* __arm64e__ */
9272 
9273 	if (entry->vme_permanent) {
9274 		if (entry->is_sub_map) {
9275 			/*
9276 			 * We can't tell if the submap contains "permanent"
9277 			 * entries within the range targeted by the caller.
9278 			 * The caller will have to check for that with
9279 			 * vm_map_overwrite_submap_recurse() for example.
9280 			 */
9281 		} else {
9282 			/*
9283 			 * Do not allow overwriting of a "permanent"
9284 			 * entry.
9285 			 */
9286 			DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9287 			    vm_map_entry_t, entry,
9288 			    vm_map_offset_t, entry->vme_start,
9289 			    vm_map_offset_t, entry->vme_end,
9290 			    vm_prot_t, entry->protection,
9291 			    vm_prot_t, entry->max_protection,
9292 			    int, VME_ALIAS(entry));
9293 			return FALSE;
9294 		}
9295 	}
9296 #endif /* !__x86_64__ */
9297 
9298 	if (entry->is_sub_map) {
9299 		/* remember not to assume every entry has a VM object... */
9300 	}
9301 
9302 
9303 	return TRUE;
9304 }
9305 
9306 static kern_return_t
vm_map_overwrite_submap_recurse(vm_map_t dst_map,vm_map_offset_t dst_addr,vm_map_size_t dst_size)9307 vm_map_overwrite_submap_recurse(
9308 	vm_map_t        dst_map,
9309 	vm_map_offset_t dst_addr,
9310 	vm_map_size_t   dst_size)
9311 {
9312 	vm_map_offset_t dst_end;
9313 	vm_map_entry_t  tmp_entry;
9314 	vm_map_entry_t  entry;
9315 	kern_return_t   result;
9316 	boolean_t       encountered_sub_map = FALSE;
9317 
9318 
9319 
9320 	/*
9321 	 *	Verify that the destination is all writeable
9322 	 *	initially.  We have to trunc the destination
9323 	 *	address and round the copy size or we'll end up
9324 	 *	splitting entries in strange ways.
9325 	 */
9326 
9327 	dst_end = vm_map_round_page(dst_addr + dst_size,
9328 	    VM_MAP_PAGE_MASK(dst_map));
9329 	vm_map_lock(dst_map);
9330 
9331 start_pass_1:
9332 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9333 		vm_map_unlock(dst_map);
9334 		return KERN_INVALID_ADDRESS;
9335 	}
9336 
9337 	vm_map_clip_start(dst_map,
9338 	    tmp_entry,
9339 	    vm_map_trunc_page(dst_addr,
9340 	    VM_MAP_PAGE_MASK(dst_map)));
9341 	if (tmp_entry->is_sub_map) {
9342 		/* clipping did unnest if needed */
9343 		assert(!tmp_entry->use_pmap);
9344 	}
9345 
9346 	for (entry = tmp_entry;;) {
9347 		vm_map_entry_t  next;
9348 
9349 		next = entry->vme_next;
9350 		while (entry->is_sub_map) {
9351 			vm_map_offset_t sub_start;
9352 			vm_map_offset_t sub_end;
9353 			vm_map_offset_t local_end;
9354 			vm_map_t        sub_map;
9355 
9356 			if (entry->in_transition) {
9357 				/*
9358 				 * Say that we are waiting, and wait for entry.
9359 				 */
9360 				entry->needs_wakeup = TRUE;
9361 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9362 
9363 				goto start_pass_1;
9364 			}
9365 
9366 			encountered_sub_map = TRUE;
9367 			sub_start = VME_OFFSET(entry);
9368 
9369 			if (entry->vme_end < dst_end) {
9370 				sub_end = entry->vme_end;
9371 			} else {
9372 				sub_end = dst_end;
9373 			}
9374 			sub_end -= entry->vme_start;
9375 			sub_end += VME_OFFSET(entry);
9376 			local_end = entry->vme_end;
9377 			sub_map = VME_SUBMAP(entry);
9378 			vm_map_reference(sub_map);
9379 			vm_map_unlock(dst_map);
9380 
9381 			result = vm_map_overwrite_submap_recurse(
9382 				sub_map,
9383 				sub_start,
9384 				sub_end - sub_start);
9385 
9386 			vm_map_deallocate(sub_map);
9387 			sub_map = VM_MAP_NULL;
9388 
9389 			if (result != KERN_SUCCESS) {
9390 				return result;
9391 			}
9392 			if (dst_end <= entry->vme_end) {
9393 				return KERN_SUCCESS;
9394 			}
9395 			vm_map_lock(dst_map);
9396 			if (!vm_map_lookup_entry(dst_map, local_end,
9397 			    &tmp_entry)) {
9398 				vm_map_unlock(dst_map);
9399 				return KERN_INVALID_ADDRESS;
9400 			}
9401 			entry = tmp_entry;
9402 			next = entry->vme_next;
9403 		}
9404 		assert(!entry->is_sub_map);
9405 
9406 		if (!(entry->protection & VM_PROT_WRITE)) {
9407 			vm_map_unlock(dst_map);
9408 			return KERN_PROTECTION_FAILURE;
9409 		}
9410 
9411 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9412 			vm_map_unlock(dst_map);
9413 			return KERN_PROTECTION_FAILURE;
9414 		}
9415 
9416 		/*
9417 		 *	If the entry is in transition, we must wait
9418 		 *	for it to exit that state.  Anything could happen
9419 		 *	when we unlock the map, so start over.
9420 		 */
9421 		if (entry->in_transition) {
9422 			/*
9423 			 * Say that we are waiting, and wait for entry.
9424 			 */
9425 			entry->needs_wakeup = TRUE;
9426 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9427 
9428 			goto start_pass_1;
9429 		}
9430 
9431 /*
9432  *		our range is contained completely within this map entry
9433  */
9434 		if (dst_end <= entry->vme_end) {
9435 			vm_map_unlock(dst_map);
9436 			return KERN_SUCCESS;
9437 		}
9438 /*
9439  *		check that range specified is contiguous region
9440  */
9441 		if ((next == vm_map_to_entry(dst_map)) ||
9442 		    (next->vme_start != entry->vme_end)) {
9443 			vm_map_unlock(dst_map);
9444 			return KERN_INVALID_ADDRESS;
9445 		}
9446 
9447 		/*
9448 		 *	Check for permanent objects in the destination.
9449 		 */
9450 		assert(!entry->is_sub_map);
9451 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9452 		    ((!VME_OBJECT(entry)->internal) ||
9453 		    (VME_OBJECT(entry)->true_share))) {
9454 			if (encountered_sub_map) {
9455 				vm_map_unlock(dst_map);
9456 				return KERN_FAILURE;
9457 			}
9458 		}
9459 
9460 
9461 		entry = next;
9462 	}/* for */
9463 	vm_map_unlock(dst_map);
9464 	return KERN_SUCCESS;
9465 }
9466 
9467 /*
9468  *	Routine:	vm_map_copy_overwrite
9469  *
9470  *	Description:
9471  *		Copy the memory described by the map copy
9472  *		object (copy; returned by vm_map_copyin) onto
9473  *		the specified destination region (dst_map, dst_addr).
9474  *		The destination must be writeable.
9475  *
9476  *		Unlike vm_map_copyout, this routine actually
9477  *		writes over previously-mapped memory.  If the
9478  *		previous mapping was to a permanent (user-supplied)
9479  *		memory object, it is preserved.
9480  *
9481  *		The attributes (protection and inheritance) of the
9482  *		destination region are preserved.
9483  *
9484  *		If successful, consumes the copy object.
9485  *		Otherwise, the caller is responsible for it.
9486  *
9487  *	Implementation notes:
9488  *		To overwrite aligned temporary virtual memory, it is
9489  *		sufficient to remove the previous mapping and insert
9490  *		the new copy.  This replacement is done either on
9491  *		the whole region (if no permanent virtual memory
9492  *		objects are embedded in the destination region) or
9493  *		in individual map entries.
9494  *
9495  *		To overwrite permanent virtual memory , it is necessary
9496  *		to copy each page, as the external memory management
9497  *		interface currently does not provide any optimizations.
9498  *
9499  *		Unaligned memory also has to be copied.  It is possible
9500  *		to use 'vm_trickery' to copy the aligned data.  This is
9501  *		not done but not hard to implement.
9502  *
9503  *		Once a page of permanent memory has been overwritten,
9504  *		it is impossible to interrupt this function; otherwise,
9505  *		the call would be neither atomic nor location-independent.
9506  *		The kernel-state portion of a user thread must be
9507  *		interruptible.
9508  *
9509  *		It may be expensive to forward all requests that might
9510  *		overwrite permanent memory (vm_write, vm_copy) to
9511  *		uninterruptible kernel threads.  This routine may be
9512  *		called by interruptible threads; however, success is
9513  *		not guaranteed -- if the request cannot be performed
9514  *		atomically and interruptibly, an error indication is
9515  *		returned.
9516  *
9517  *		Callers of this function must call vm_map_copy_require on
9518  *		previously created vm_map_copy_t or pass a newly created
9519  *		one to ensure that it hasn't been forged.
9520  */
9521 static kern_return_t
vm_map_copy_overwrite_nested(vm_map_t dst_map,vm_map_address_t dst_addr,vm_map_copy_t copy,boolean_t interruptible,pmap_t pmap,boolean_t discard_on_success)9522 vm_map_copy_overwrite_nested(
9523 	vm_map_t                dst_map,
9524 	vm_map_address_t        dst_addr,
9525 	vm_map_copy_t           copy,
9526 	boolean_t               interruptible,
9527 	pmap_t                  pmap,
9528 	boolean_t               discard_on_success)
9529 {
9530 	vm_map_offset_t         dst_end;
9531 	vm_map_entry_t          tmp_entry;
9532 	vm_map_entry_t          entry;
9533 	kern_return_t           kr;
9534 	boolean_t               aligned = TRUE;
9535 	boolean_t               contains_permanent_objects = FALSE;
9536 	boolean_t               encountered_sub_map = FALSE;
9537 	vm_map_offset_t         base_addr;
9538 	vm_map_size_t           copy_size;
9539 	vm_map_size_t           total_size;
9540 	uint16_t                copy_page_shift;
9541 
9542 	/*
9543 	 *	Check for special kernel buffer allocated
9544 	 *	by new_ipc_kmsg_copyin.
9545 	 */
9546 
9547 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9548 		kr = vm_map_copyout_kernel_buffer(
9549 			dst_map, &dst_addr,
9550 			copy, copy->size, TRUE,
9551 			discard_on_success);
9552 		return kr;
9553 	}
9554 
9555 	/*
9556 	 *      Only works for entry lists at the moment.  Will
9557 	 *	support page lists later.
9558 	 */
9559 
9560 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9561 
9562 	if (copy->size == 0) {
9563 		if (discard_on_success) {
9564 			vm_map_copy_discard(copy);
9565 		}
9566 		return KERN_SUCCESS;
9567 	}
9568 
9569 	copy_page_shift = copy->cpy_hdr.page_shift;
9570 
9571 	/*
9572 	 *	Verify that the destination is all writeable
9573 	 *	initially.  We have to trunc the destination
9574 	 *	address and round the copy size or we'll end up
9575 	 *	splitting entries in strange ways.
9576 	 */
9577 
9578 	if (!VM_MAP_PAGE_ALIGNED(copy->size,
9579 	    VM_MAP_PAGE_MASK(dst_map)) ||
9580 	    !VM_MAP_PAGE_ALIGNED(copy->offset,
9581 	    VM_MAP_PAGE_MASK(dst_map)) ||
9582 	    !VM_MAP_PAGE_ALIGNED(dst_addr,
9583 	    VM_MAP_PAGE_MASK(dst_map)) ||
9584 	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9585 		aligned = FALSE;
9586 		dst_end = vm_map_round_page(dst_addr + copy->size,
9587 		    VM_MAP_PAGE_MASK(dst_map));
9588 	} else {
9589 		dst_end = dst_addr + copy->size;
9590 	}
9591 
9592 	vm_map_lock(dst_map);
9593 
9594 	/* LP64todo - remove this check when vm_map_commpage64()
9595 	 * no longer has to stuff in a map_entry for the commpage
9596 	 * above the map's max_offset.
9597 	 */
9598 	if (dst_addr >= dst_map->max_offset) {
9599 		vm_map_unlock(dst_map);
9600 		return KERN_INVALID_ADDRESS;
9601 	}
9602 
9603 start_pass_1:
9604 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9605 		vm_map_unlock(dst_map);
9606 		return KERN_INVALID_ADDRESS;
9607 	}
9608 	vm_map_clip_start(dst_map,
9609 	    tmp_entry,
9610 	    vm_map_trunc_page(dst_addr,
9611 	    VM_MAP_PAGE_MASK(dst_map)));
9612 	for (entry = tmp_entry;;) {
9613 		vm_map_entry_t  next = entry->vme_next;
9614 
9615 		while (entry->is_sub_map) {
9616 			vm_map_offset_t sub_start;
9617 			vm_map_offset_t sub_end;
9618 			vm_map_offset_t local_end;
9619 
9620 			if (entry->in_transition) {
9621 				/*
9622 				 * Say that we are waiting, and wait for entry.
9623 				 */
9624 				entry->needs_wakeup = TRUE;
9625 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9626 
9627 				goto start_pass_1;
9628 			}
9629 
9630 			local_end = entry->vme_end;
9631 			if (!(entry->needs_copy)) {
9632 				vm_map_t sub_map = VM_MAP_NULL;
9633 
9634 				/* if needs_copy we are a COW submap */
9635 				/* in such a case we just replace so */
9636 				/* there is no need for the follow-  */
9637 				/* ing check.                        */
9638 				encountered_sub_map = TRUE;
9639 				sub_start = VME_OFFSET(entry);
9640 
9641 				if (entry->vme_end < dst_end) {
9642 					sub_end = entry->vme_end;
9643 				} else {
9644 					sub_end = dst_end;
9645 				}
9646 				sub_end -= entry->vme_start;
9647 				sub_end += VME_OFFSET(entry);
9648 				sub_map = VME_SUBMAP(entry);
9649 				vm_map_reference(sub_map);
9650 				vm_map_unlock(dst_map);
9651 
9652 				kr = vm_map_overwrite_submap_recurse(
9653 					sub_map,
9654 					sub_start,
9655 					sub_end - sub_start);
9656 
9657 				vm_map_deallocate(sub_map);
9658 				sub_map = VM_MAP_NULL;
9659 				if (kr != KERN_SUCCESS) {
9660 					return kr;
9661 				}
9662 				vm_map_lock(dst_map);
9663 			}
9664 
9665 			if (dst_end <= entry->vme_end) {
9666 				goto start_overwrite;
9667 			}
9668 			if (!vm_map_lookup_entry(dst_map, local_end,
9669 			    &entry)) {
9670 				vm_map_unlock(dst_map);
9671 				return KERN_INVALID_ADDRESS;
9672 			}
9673 			next = entry->vme_next;
9674 		}
9675 		assert(!entry->is_sub_map);
9676 
9677 		if (!(entry->protection & VM_PROT_WRITE)) {
9678 			vm_map_unlock(dst_map);
9679 			return KERN_PROTECTION_FAILURE;
9680 		}
9681 
9682 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9683 			vm_map_unlock(dst_map);
9684 			return KERN_PROTECTION_FAILURE;
9685 		}
9686 
9687 		/*
9688 		 *	If the entry is in transition, we must wait
9689 		 *	for it to exit that state.  Anything could happen
9690 		 *	when we unlock the map, so start over.
9691 		 */
9692 		if (entry->in_transition) {
9693 			/*
9694 			 * Say that we are waiting, and wait for entry.
9695 			 */
9696 			entry->needs_wakeup = TRUE;
9697 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9698 
9699 			goto start_pass_1;
9700 		}
9701 
9702 /*
9703  *		our range is contained completely within this map entry
9704  */
9705 		if (dst_end <= entry->vme_end) {
9706 			break;
9707 		}
9708 /*
9709  *		check that range specified is contiguous region
9710  */
9711 		if ((next == vm_map_to_entry(dst_map)) ||
9712 		    (next->vme_start != entry->vme_end)) {
9713 			vm_map_unlock(dst_map);
9714 			return KERN_INVALID_ADDRESS;
9715 		}
9716 
9717 
9718 		/*
9719 		 *	Check for permanent objects in the destination.
9720 		 */
9721 		assert(!entry->is_sub_map);
9722 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9723 		    ((!VME_OBJECT(entry)->internal) ||
9724 		    (VME_OBJECT(entry)->true_share))) {
9725 			contains_permanent_objects = TRUE;
9726 		}
9727 
9728 		entry = next;
9729 	}/* for */
9730 
9731 start_overwrite:
9732 	/*
9733 	 *	If there are permanent objects in the destination, then
9734 	 *	the copy cannot be interrupted.
9735 	 */
9736 
9737 	if (interruptible && contains_permanent_objects) {
9738 		vm_map_unlock(dst_map);
9739 		return KERN_FAILURE;   /* XXX */
9740 	}
9741 
9742 	/*
9743 	 *
9744 	 *	Make a second pass, overwriting the data
9745 	 *	At the beginning of each loop iteration,
9746 	 *	the next entry to be overwritten is "tmp_entry"
9747 	 *	(initially, the value returned from the lookup above),
9748 	 *	and the starting address expected in that entry
9749 	 *	is "start".
9750 	 */
9751 
9752 	total_size = copy->size;
9753 	if (encountered_sub_map) {
9754 		copy_size = 0;
9755 		/* re-calculate tmp_entry since we've had the map */
9756 		/* unlocked */
9757 		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9758 			vm_map_unlock(dst_map);
9759 			return KERN_INVALID_ADDRESS;
9760 		}
9761 	} else {
9762 		copy_size = copy->size;
9763 	}
9764 
9765 	base_addr = dst_addr;
9766 	while (TRUE) {
9767 		/* deconstruct the copy object and do in parts */
9768 		/* only in sub_map, interruptable case */
9769 		vm_map_entry_t  copy_entry;
9770 		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
9771 		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
9772 		int             nentries;
9773 		int             remaining_entries = 0;
9774 		vm_map_offset_t new_offset = 0;
9775 
9776 		for (entry = tmp_entry; copy_size == 0;) {
9777 			vm_map_entry_t  next;
9778 
9779 			next = entry->vme_next;
9780 
9781 			/* tmp_entry and base address are moved along */
9782 			/* each time we encounter a sub-map.  Otherwise */
9783 			/* entry can outpase tmp_entry, and the copy_size */
9784 			/* may reflect the distance between them */
9785 			/* if the current entry is found to be in transition */
9786 			/* we will start over at the beginning or the last */
9787 			/* encounter of a submap as dictated by base_addr */
9788 			/* we will zero copy_size accordingly. */
9789 			if (entry->in_transition) {
9790 				/*
9791 				 * Say that we are waiting, and wait for entry.
9792 				 */
9793 				entry->needs_wakeup = TRUE;
9794 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9795 
9796 				if (!vm_map_lookup_entry(dst_map, base_addr,
9797 				    &tmp_entry)) {
9798 					vm_map_unlock(dst_map);
9799 					return KERN_INVALID_ADDRESS;
9800 				}
9801 				copy_size = 0;
9802 				entry = tmp_entry;
9803 				continue;
9804 			}
9805 			if (entry->is_sub_map) {
9806 				vm_map_offset_t sub_start;
9807 				vm_map_offset_t sub_end;
9808 				vm_map_offset_t local_end;
9809 				vm_map_t        sub_map = VM_MAP_NULL;
9810 				bool            use_pmap;
9811 
9812 				if (entry->needs_copy) {
9813 					/* if this is a COW submap */
9814 					/* just back the range with a */
9815 					/* anonymous entry */
9816 					assert(!entry->vme_permanent);
9817 					if (entry->vme_end < dst_end) {
9818 						sub_end = entry->vme_end;
9819 					} else {
9820 						sub_end = dst_end;
9821 					}
9822 					if (entry->vme_start < base_addr) {
9823 						sub_start = base_addr;
9824 					} else {
9825 						sub_start = entry->vme_start;
9826 					}
9827 					vm_map_clip_end(
9828 						dst_map, entry, sub_end);
9829 					vm_map_clip_start(
9830 						dst_map, entry, sub_start);
9831 					assert(!entry->use_pmap);
9832 					assert(!entry->iokit_acct);
9833 					entry->use_pmap = TRUE;
9834 					vm_map_deallocate(VME_SUBMAP(entry));
9835 					assert(!entry->vme_permanent);
9836 					VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9837 					VME_OFFSET_SET(entry, 0);
9838 					entry->is_shared = FALSE;
9839 					entry->needs_copy = FALSE;
9840 					entry->protection = VM_PROT_DEFAULT;
9841 					entry->max_protection = VM_PROT_ALL;
9842 					entry->wired_count = 0;
9843 					entry->user_wired_count = 0;
9844 					if (entry->inheritance
9845 					    == VM_INHERIT_SHARE) {
9846 						entry->inheritance = VM_INHERIT_COPY;
9847 					}
9848 					continue;
9849 				}
9850 				/* first take care of any non-sub_map */
9851 				/* entries to send */
9852 				if (base_addr < entry->vme_start) {
9853 					/* stuff to send */
9854 					copy_size =
9855 					    entry->vme_start - base_addr;
9856 					break;
9857 				}
9858 				sub_start = VME_OFFSET(entry);
9859 
9860 				if (entry->vme_end < dst_end) {
9861 					sub_end = entry->vme_end;
9862 				} else {
9863 					sub_end = dst_end;
9864 				}
9865 				sub_end -= entry->vme_start;
9866 				sub_end += VME_OFFSET(entry);
9867 				local_end = entry->vme_end;
9868 				use_pmap = entry->use_pmap;
9869 				sub_map = VME_SUBMAP(entry);
9870 				vm_map_reference(sub_map);
9871 				vm_map_unlock(dst_map);
9872 				copy_size = sub_end - sub_start;
9873 
9874 				/* adjust the copy object */
9875 				if (total_size > copy_size) {
9876 					vm_map_size_t   local_size = 0;
9877 					vm_map_size_t   entry_size;
9878 
9879 					nentries = 1;
9880 					new_offset = copy->offset;
9881 					copy_entry = vm_map_copy_first_entry(copy);
9882 					while (copy_entry !=
9883 					    vm_map_copy_to_entry(copy)) {
9884 						entry_size = copy_entry->vme_end -
9885 						    copy_entry->vme_start;
9886 						if ((local_size < copy_size) &&
9887 						    ((local_size + entry_size)
9888 						    >= copy_size)) {
9889 							vm_map_copy_clip_end(copy,
9890 							    copy_entry,
9891 							    copy_entry->vme_start +
9892 							    (copy_size - local_size));
9893 							entry_size = copy_entry->vme_end -
9894 							    copy_entry->vme_start;
9895 							local_size += entry_size;
9896 							new_offset += entry_size;
9897 						}
9898 						if (local_size >= copy_size) {
9899 							next_copy = copy_entry->vme_next;
9900 							copy_entry->vme_next =
9901 							    vm_map_copy_to_entry(copy);
9902 							previous_prev =
9903 							    copy->cpy_hdr.links.prev;
9904 							copy->cpy_hdr.links.prev = copy_entry;
9905 							copy->size = copy_size;
9906 							remaining_entries =
9907 							    copy->cpy_hdr.nentries;
9908 							remaining_entries -= nentries;
9909 							copy->cpy_hdr.nentries = nentries;
9910 							break;
9911 						} else {
9912 							local_size += entry_size;
9913 							new_offset += entry_size;
9914 							nentries++;
9915 						}
9916 						copy_entry = copy_entry->vme_next;
9917 					}
9918 				}
9919 
9920 				if ((use_pmap) && (pmap == NULL)) {
9921 					kr = vm_map_copy_overwrite_nested(
9922 						sub_map,
9923 						sub_start,
9924 						copy,
9925 						interruptible,
9926 						sub_map->pmap,
9927 						TRUE);
9928 				} else if (pmap != NULL) {
9929 					kr = vm_map_copy_overwrite_nested(
9930 						sub_map,
9931 						sub_start,
9932 						copy,
9933 						interruptible, pmap,
9934 						TRUE);
9935 				} else {
9936 					kr = vm_map_copy_overwrite_nested(
9937 						sub_map,
9938 						sub_start,
9939 						copy,
9940 						interruptible,
9941 						dst_map->pmap,
9942 						TRUE);
9943 				}
9944 
9945 				vm_map_deallocate(sub_map);
9946 				sub_map = VM_MAP_NULL;
9947 
9948 				if (kr != KERN_SUCCESS) {
9949 					if (next_copy != NULL) {
9950 						copy->cpy_hdr.nentries +=
9951 						    remaining_entries;
9952 						copy->cpy_hdr.links.prev->vme_next =
9953 						    next_copy;
9954 						copy->cpy_hdr.links.prev
9955 						        = previous_prev;
9956 						copy->size = total_size;
9957 					}
9958 					return kr;
9959 				}
9960 				if (dst_end <= local_end) {
9961 					return KERN_SUCCESS;
9962 				}
9963 				/* otherwise copy no longer exists, it was */
9964 				/* destroyed after successful copy_overwrite */
9965 				copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9966 				copy->offset = new_offset;
9967 				copy->cpy_hdr.page_shift = copy_page_shift;
9968 
9969 				total_size -= copy_size;
9970 				copy_size = 0;
9971 				/* put back remainder of copy in container */
9972 				if (next_copy != NULL) {
9973 					copy->cpy_hdr.nentries = remaining_entries;
9974 					copy->cpy_hdr.links.next = next_copy;
9975 					copy->cpy_hdr.links.prev = previous_prev;
9976 					copy->size = total_size;
9977 					next_copy->vme_prev =
9978 					    vm_map_copy_to_entry(copy);
9979 					next_copy = NULL;
9980 				}
9981 				base_addr = local_end;
9982 				vm_map_lock(dst_map);
9983 				if (!vm_map_lookup_entry(dst_map,
9984 				    local_end, &tmp_entry)) {
9985 					vm_map_unlock(dst_map);
9986 					return KERN_INVALID_ADDRESS;
9987 				}
9988 				entry = tmp_entry;
9989 				continue;
9990 			}
9991 			assert(!entry->is_sub_map);
9992 
9993 			if (dst_end <= entry->vme_end) {
9994 				copy_size = dst_end - base_addr;
9995 				break;
9996 			}
9997 
9998 			if ((next == vm_map_to_entry(dst_map)) ||
9999 			    (next->vme_start != entry->vme_end)) {
10000 				vm_map_unlock(dst_map);
10001 				return KERN_INVALID_ADDRESS;
10002 			}
10003 
10004 			entry = next;
10005 		}/* for */
10006 
10007 		next_copy = NULL;
10008 		nentries = 1;
10009 
10010 		/* adjust the copy object */
10011 		if (total_size > copy_size) {
10012 			vm_map_size_t   local_size = 0;
10013 			vm_map_size_t   entry_size;
10014 
10015 			new_offset = copy->offset;
10016 			copy_entry = vm_map_copy_first_entry(copy);
10017 			while (copy_entry != vm_map_copy_to_entry(copy)) {
10018 				entry_size = copy_entry->vme_end -
10019 				    copy_entry->vme_start;
10020 				if ((local_size < copy_size) &&
10021 				    ((local_size + entry_size)
10022 				    >= copy_size)) {
10023 					vm_map_copy_clip_end(copy, copy_entry,
10024 					    copy_entry->vme_start +
10025 					    (copy_size - local_size));
10026 					entry_size = copy_entry->vme_end -
10027 					    copy_entry->vme_start;
10028 					local_size += entry_size;
10029 					new_offset += entry_size;
10030 				}
10031 				if (local_size >= copy_size) {
10032 					next_copy = copy_entry->vme_next;
10033 					copy_entry->vme_next =
10034 					    vm_map_copy_to_entry(copy);
10035 					previous_prev =
10036 					    copy->cpy_hdr.links.prev;
10037 					copy->cpy_hdr.links.prev = copy_entry;
10038 					copy->size = copy_size;
10039 					remaining_entries =
10040 					    copy->cpy_hdr.nentries;
10041 					remaining_entries -= nentries;
10042 					copy->cpy_hdr.nentries = nentries;
10043 					break;
10044 				} else {
10045 					local_size += entry_size;
10046 					new_offset += entry_size;
10047 					nentries++;
10048 				}
10049 				copy_entry = copy_entry->vme_next;
10050 			}
10051 		}
10052 
10053 		if (aligned) {
10054 			pmap_t  local_pmap;
10055 
10056 			if (pmap) {
10057 				local_pmap = pmap;
10058 			} else {
10059 				local_pmap = dst_map->pmap;
10060 			}
10061 
10062 			if ((kr =  vm_map_copy_overwrite_aligned(
10063 				    dst_map, tmp_entry, copy,
10064 				    base_addr, local_pmap)) != KERN_SUCCESS) {
10065 				if (next_copy != NULL) {
10066 					copy->cpy_hdr.nentries +=
10067 					    remaining_entries;
10068 					copy->cpy_hdr.links.prev->vme_next =
10069 					    next_copy;
10070 					copy->cpy_hdr.links.prev =
10071 					    previous_prev;
10072 					copy->size += copy_size;
10073 				}
10074 				return kr;
10075 			}
10076 			vm_map_unlock(dst_map);
10077 		} else {
10078 			/*
10079 			 * Performance gain:
10080 			 *
10081 			 * if the copy and dst address are misaligned but the same
10082 			 * offset within the page we can copy_not_aligned the
10083 			 * misaligned parts and copy aligned the rest.  If they are
10084 			 * aligned but len is unaligned we simply need to copy
10085 			 * the end bit unaligned.  We'll need to split the misaligned
10086 			 * bits of the region in this case !
10087 			 */
10088 			/* ALWAYS UNLOCKS THE dst_map MAP */
10089 			kr = vm_map_copy_overwrite_unaligned(
10090 				dst_map,
10091 				tmp_entry,
10092 				copy,
10093 				base_addr,
10094 				discard_on_success);
10095 			if (kr != KERN_SUCCESS) {
10096 				if (next_copy != NULL) {
10097 					copy->cpy_hdr.nentries +=
10098 					    remaining_entries;
10099 					copy->cpy_hdr.links.prev->vme_next =
10100 					    next_copy;
10101 					copy->cpy_hdr.links.prev =
10102 					    previous_prev;
10103 					copy->size += copy_size;
10104 				}
10105 				return kr;
10106 			}
10107 		}
10108 		total_size -= copy_size;
10109 		if (total_size == 0) {
10110 			break;
10111 		}
10112 		base_addr += copy_size;
10113 		copy_size = 0;
10114 		copy->offset = new_offset;
10115 		if (next_copy != NULL) {
10116 			copy->cpy_hdr.nentries = remaining_entries;
10117 			copy->cpy_hdr.links.next = next_copy;
10118 			copy->cpy_hdr.links.prev = previous_prev;
10119 			next_copy->vme_prev = vm_map_copy_to_entry(copy);
10120 			copy->size = total_size;
10121 		}
10122 		vm_map_lock(dst_map);
10123 		while (TRUE) {
10124 			if (!vm_map_lookup_entry(dst_map,
10125 			    base_addr, &tmp_entry)) {
10126 				vm_map_unlock(dst_map);
10127 				return KERN_INVALID_ADDRESS;
10128 			}
10129 			if (tmp_entry->in_transition) {
10130 				entry->needs_wakeup = TRUE;
10131 				vm_map_entry_wait(dst_map, THREAD_UNINT);
10132 			} else {
10133 				break;
10134 			}
10135 		}
10136 		vm_map_clip_start(dst_map,
10137 		    tmp_entry,
10138 		    vm_map_trunc_page(base_addr,
10139 		    VM_MAP_PAGE_MASK(dst_map)));
10140 
10141 		entry = tmp_entry;
10142 	} /* while */
10143 
10144 	/*
10145 	 *	Throw away the vm_map_copy object
10146 	 */
10147 	if (discard_on_success) {
10148 		vm_map_copy_discard(copy);
10149 	}
10150 
10151 	return KERN_SUCCESS;
10152 }/* vm_map_copy_overwrite */
10153 
10154 static __attribute__((always_inline, warn_unused_result))
10155 kern_return_t
vm_map_copy_addr_size_sanitize(vm_map_t map,vm_map_offset_ut addr_u,vm_map_size_ut size_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * addr,vm_map_offset_t * end,vm_map_size_t * size)10156 vm_map_copy_addr_size_sanitize(
10157 	vm_map_t                map,
10158 	vm_map_offset_ut        addr_u,
10159 	vm_map_size_ut          size_u,
10160 	vm_sanitize_caller_t    vm_sanitize_caller,
10161 	vm_map_offset_t        *addr,
10162 	vm_map_offset_t        *end,
10163 	vm_map_size_t          *size)
10164 {
10165 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
10166 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
10167 	    VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
10168 
10169 	return vm_sanitize_addr_size(addr_u, size_u,
10170 	           vm_sanitize_caller, map,
10171 	           flags,
10172 	           addr, end, size);
10173 }
10174 
10175 kern_return_t
vm_map_copy_overwrite(vm_map_t dst_map,vm_map_offset_ut dst_addr_u,vm_map_copy_t copy,vm_map_size_ut copy_size_u,boolean_t interruptible)10176 vm_map_copy_overwrite(
10177 	vm_map_t                dst_map,
10178 	vm_map_offset_ut        dst_addr_u,
10179 	vm_map_copy_t           copy,
10180 	vm_map_size_ut          copy_size_u,
10181 	boolean_t               interruptible)
10182 {
10183 	vm_map_offset_t dst_addr, dst_end;
10184 	vm_map_size_t   copy_size;
10185 	vm_map_size_t   head_size, tail_size;
10186 	vm_map_copy_t   head_copy, tail_copy;
10187 	vm_map_offset_t head_addr, tail_addr;
10188 	vm_map_entry_t  entry;
10189 	kern_return_t   kr;
10190 	vm_map_offset_t effective_page_mask, effective_page_size;
10191 	uint16_t        copy_page_shift;
10192 
10193 	head_size = 0;
10194 	tail_size = 0;
10195 	head_copy = NULL;
10196 	tail_copy = NULL;
10197 	head_addr = 0;
10198 	tail_addr = 0;
10199 
10200 	/*
10201 	 *	Check for null copy object.
10202 	 */
10203 	if (copy == VM_MAP_COPY_NULL) {
10204 		return KERN_SUCCESS;
10205 	}
10206 
10207 	/*
10208 	 * Sanitize any input parameters that are addr/size/prot/inherit
10209 	 */
10210 	kr = vm_map_copy_addr_size_sanitize(
10211 		dst_map,
10212 		dst_addr_u,
10213 		copy_size_u,
10214 		VM_SANITIZE_CALLER_VM_MAP_COPY_OVERWRITE,
10215 		&dst_addr,
10216 		&dst_end,
10217 		&copy_size);
10218 	if (__improbable(kr != KERN_SUCCESS)) {
10219 		return vm_sanitize_get_kr(kr);
10220 	}
10221 
10222 	/*
10223 	 * Assert that the vm_map_copy is coming from the right
10224 	 * zone and hasn't been forged
10225 	 */
10226 	vm_map_copy_require(copy);
10227 
10228 	if (interruptible ||
10229 	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
10230 		/*
10231 		 * We can't split the "copy" map if we're interruptible
10232 		 * or if we don't have a "copy" map...
10233 		 */
10234 blunt_copy:
10235 		kr = vm_map_copy_overwrite_nested(dst_map,
10236 		    dst_addr,
10237 		    copy,
10238 		    interruptible,
10239 		    (pmap_t) NULL,
10240 		    TRUE);
10241 		if (kr) {
10242 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
10243 		}
10244 		return kr;
10245 	}
10246 
10247 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
10248 	if (copy_page_shift < PAGE_SHIFT ||
10249 	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10250 		goto blunt_copy;
10251 	}
10252 
10253 	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10254 		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
10255 	} else {
10256 		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
10257 		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
10258 		    effective_page_mask);
10259 	}
10260 	effective_page_size = effective_page_mask + 1;
10261 
10262 	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
10263 		/*
10264 		 * Too small to bother with optimizing...
10265 		 */
10266 		goto blunt_copy;
10267 	}
10268 
10269 	if ((dst_addr & effective_page_mask) !=
10270 	    (copy->offset & effective_page_mask)) {
10271 		/*
10272 		 * Incompatible mis-alignment of source and destination...
10273 		 */
10274 		goto blunt_copy;
10275 	}
10276 
10277 	/*
10278 	 * Proper alignment or identical mis-alignment at the beginning.
10279 	 * Let's try and do a small unaligned copy first (if needed)
10280 	 * and then an aligned copy for the rest.
10281 	 */
10282 	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
10283 		head_addr = dst_addr;
10284 		head_size = (effective_page_size -
10285 		    (copy->offset & effective_page_mask));
10286 		head_size = MIN(head_size, copy_size);
10287 	}
10288 	if (!vm_map_page_aligned(copy->offset + copy_size,
10289 	    effective_page_mask)) {
10290 		/*
10291 		 * Mis-alignment at the end.
10292 		 * Do an aligned copy up to the last page and
10293 		 * then an unaligned copy for the remaining bytes.
10294 		 */
10295 		tail_size = ((copy->offset + copy_size) &
10296 		    effective_page_mask);
10297 		tail_size = MIN(tail_size, copy_size);
10298 		tail_addr = dst_addr + copy_size - tail_size;
10299 		assert(tail_addr >= head_addr + head_size);
10300 	}
10301 	assert(head_size + tail_size <= copy_size);
10302 
10303 	if (head_size + tail_size == copy_size) {
10304 		/*
10305 		 * It's all unaligned, no optimization possible...
10306 		 */
10307 		goto blunt_copy;
10308 	}
10309 
10310 	/*
10311 	 * Can't optimize if there are any submaps in the
10312 	 * destination due to the way we free the "copy" map
10313 	 * progressively in vm_map_copy_overwrite_nested()
10314 	 * in that case.
10315 	 */
10316 	vm_map_lock_read(dst_map);
10317 	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
10318 		vm_map_unlock_read(dst_map);
10319 		goto blunt_copy;
10320 	}
10321 	for (;
10322 	    (entry != vm_map_to_entry(dst_map) &&
10323 	    entry->vme_start < dst_addr + copy_size);
10324 	    entry = entry->vme_next) {
10325 		if (entry->is_sub_map) {
10326 			vm_map_unlock_read(dst_map);
10327 			goto blunt_copy;
10328 		}
10329 	}
10330 	vm_map_unlock_read(dst_map);
10331 
10332 	if (head_size) {
10333 		/*
10334 		 * Unaligned copy of the first "head_size" bytes, to reach
10335 		 * a page boundary.
10336 		 */
10337 
10338 		/*
10339 		 * Extract "head_copy" out of "copy".
10340 		 */
10341 		head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10342 		head_copy->cpy_hdr.entries_pageable =
10343 		    copy->cpy_hdr.entries_pageable;
10344 		head_copy->cpy_hdr.page_shift = copy_page_shift;
10345 
10346 		entry = vm_map_copy_first_entry(copy);
10347 		if (entry->vme_end < copy->offset + head_size) {
10348 			head_size = entry->vme_end - copy->offset;
10349 		}
10350 
10351 		head_copy->offset = copy->offset;
10352 		head_copy->size = head_size;
10353 		copy->offset += head_size;
10354 		copy->size -= head_size;
10355 		copy_size -= head_size;
10356 		assert(copy_size > 0);
10357 
10358 		vm_map_copy_clip_end(copy, entry, copy->offset);
10359 		vm_map_copy_entry_unlink(copy, entry);
10360 		vm_map_copy_entry_link(head_copy,
10361 		    vm_map_copy_to_entry(head_copy),
10362 		    entry);
10363 
10364 		/*
10365 		 * Do the unaligned copy.
10366 		 */
10367 		kr = vm_map_copy_overwrite_nested(dst_map,
10368 		    head_addr,
10369 		    head_copy,
10370 		    interruptible,
10371 		    (pmap_t) NULL,
10372 		    FALSE);
10373 		if (kr != KERN_SUCCESS) {
10374 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
10375 			goto done;
10376 		}
10377 	}
10378 
10379 	if (tail_size) {
10380 		/*
10381 		 * Extract "tail_copy" out of "copy".
10382 		 */
10383 		tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10384 		tail_copy->cpy_hdr.entries_pageable =
10385 		    copy->cpy_hdr.entries_pageable;
10386 		tail_copy->cpy_hdr.page_shift = copy_page_shift;
10387 
10388 		tail_copy->offset = copy->offset + copy_size - tail_size;
10389 		tail_copy->size = tail_size;
10390 
10391 		copy->size -= tail_size;
10392 		copy_size -= tail_size;
10393 		assert(copy_size > 0);
10394 
10395 		entry = vm_map_copy_last_entry(copy);
10396 		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
10397 		entry = vm_map_copy_last_entry(copy);
10398 		vm_map_copy_entry_unlink(copy, entry);
10399 		vm_map_copy_entry_link(tail_copy,
10400 		    vm_map_copy_last_entry(tail_copy),
10401 		    entry);
10402 	}
10403 
10404 	/*
10405 	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
10406 	 * we want to avoid TOCTOU issues w.r.t copy->size but
10407 	 * we don't need to change vm_map_copy_overwrite_nested()
10408 	 * and all other vm_map_copy_overwrite variants.
10409 	 *
10410 	 * So we assign the original copy_size that was passed into
10411 	 * this routine back to copy.
10412 	 *
10413 	 * This use of local 'copy_size' passed into this routine is
10414 	 * to try and protect against TOCTOU attacks where the kernel
10415 	 * has been exploited. We don't expect this to be an issue
10416 	 * during normal system operation.
10417 	 */
10418 	assertf(copy->size == copy_size,
10419 	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
10420 	copy->size = copy_size;
10421 
10422 	/*
10423 	 * Copy most (or possibly all) of the data.
10424 	 */
10425 	kr = vm_map_copy_overwrite_nested(dst_map,
10426 	    dst_addr + head_size,
10427 	    copy,
10428 	    interruptible,
10429 	    (pmap_t) NULL,
10430 	    FALSE);
10431 	if (kr != KERN_SUCCESS) {
10432 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
10433 		goto done;
10434 	}
10435 
10436 	if (tail_size) {
10437 		kr = vm_map_copy_overwrite_nested(dst_map,
10438 		    tail_addr,
10439 		    tail_copy,
10440 		    interruptible,
10441 		    (pmap_t) NULL,
10442 		    FALSE);
10443 		if (kr) {
10444 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
10445 		}
10446 	}
10447 
10448 done:
10449 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
10450 	if (kr == KERN_SUCCESS) {
10451 		/*
10452 		 * Discard all the copy maps.
10453 		 */
10454 		if (head_copy) {
10455 			vm_map_copy_discard(head_copy);
10456 			head_copy = NULL;
10457 		}
10458 		vm_map_copy_discard(copy);
10459 		if (tail_copy) {
10460 			vm_map_copy_discard(tail_copy);
10461 			tail_copy = NULL;
10462 		}
10463 	} else {
10464 		/*
10465 		 * Re-assemble the original copy map.
10466 		 */
10467 		if (head_copy) {
10468 			entry = vm_map_copy_first_entry(head_copy);
10469 			vm_map_copy_entry_unlink(head_copy, entry);
10470 			vm_map_copy_entry_link(copy,
10471 			    vm_map_copy_to_entry(copy),
10472 			    entry);
10473 			copy->offset -= head_size;
10474 			copy->size += head_size;
10475 			vm_map_copy_discard(head_copy);
10476 			head_copy = NULL;
10477 		}
10478 		if (tail_copy) {
10479 			entry = vm_map_copy_last_entry(tail_copy);
10480 			vm_map_copy_entry_unlink(tail_copy, entry);
10481 			vm_map_copy_entry_link(copy,
10482 			    vm_map_copy_last_entry(copy),
10483 			    entry);
10484 			copy->size += tail_size;
10485 			vm_map_copy_discard(tail_copy);
10486 			tail_copy = NULL;
10487 		}
10488 	}
10489 	return kr;
10490 }
10491 
10492 
10493 /*
10494  *	Routine: vm_map_copy_overwrite_unaligned	[internal use only]
10495  *
10496  *	Decription:
10497  *	Physically copy unaligned data
10498  *
10499  *	Implementation:
10500  *	Unaligned parts of pages have to be physically copied.  We use
10501  *	a modified form of vm_fault_copy (which understands none-aligned
10502  *	page offsets and sizes) to do the copy.  We attempt to copy as
10503  *	much memory in one go as possibly, however vm_fault_copy copies
10504  *	within 1 memory object so we have to find the smaller of "amount left"
10505  *	"source object data size" and "target object data size".  With
10506  *	unaligned data we don't need to split regions, therefore the source
10507  *	(copy) object should be one map entry, the target range may be split
10508  *	over multiple map entries however.  In any event we are pessimistic
10509  *	about these assumptions.
10510  *
10511  *	Callers of this function must call vm_map_copy_require on
10512  *	previously created vm_map_copy_t or pass a newly created
10513  *	one to ensure that it hasn't been forged.
10514  *
10515  *	Assumptions:
10516  *	dst_map is locked on entry and is return locked on success,
10517  *	unlocked on error.
10518  */
10519 
10520 static kern_return_t
vm_map_copy_overwrite_unaligned(vm_map_t dst_map,vm_map_entry_t entry,vm_map_copy_t copy,vm_map_offset_t start,boolean_t discard_on_success)10521 vm_map_copy_overwrite_unaligned(
10522 	vm_map_t        dst_map,
10523 	vm_map_entry_t  entry,
10524 	vm_map_copy_t   copy,
10525 	vm_map_offset_t start,
10526 	boolean_t       discard_on_success)
10527 {
10528 	vm_map_entry_t          copy_entry;
10529 	vm_map_entry_t          copy_entry_next;
10530 	vm_map_version_t        version;
10531 	vm_object_t             dst_object;
10532 	vm_object_offset_t      dst_offset;
10533 	vm_object_offset_t      src_offset;
10534 	vm_object_offset_t      entry_offset;
10535 	vm_map_offset_t         entry_end;
10536 	vm_map_size_t           src_size,
10537 	    dst_size,
10538 	    copy_size,
10539 	    amount_left;
10540 	kern_return_t           kr = KERN_SUCCESS;
10541 
10542 
10543 	copy_entry = vm_map_copy_first_entry(copy);
10544 
10545 	vm_map_lock_write_to_read(dst_map);
10546 
10547 	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
10548 	amount_left = copy->size;
10549 /*
10550  *	unaligned so we never clipped this entry, we need the offset into
10551  *	the vm_object not just the data.
10552  */
10553 	while (amount_left > 0) {
10554 		if (entry == vm_map_to_entry(dst_map)) {
10555 			vm_map_unlock_read(dst_map);
10556 			return KERN_INVALID_ADDRESS;
10557 		}
10558 
10559 		/* "start" must be within the current map entry */
10560 		assert((start >= entry->vme_start) && (start < entry->vme_end));
10561 
10562 		/*
10563 		 *	Check protection again
10564 		 */
10565 		if (!(entry->protection & VM_PROT_WRITE)) {
10566 			vm_map_unlock_read(dst_map);
10567 			return KERN_PROTECTION_FAILURE;
10568 		}
10569 		if (entry->is_sub_map) {
10570 			/* not implemented... */
10571 			vm_map_unlock_read(dst_map);
10572 			return KERN_INVALID_ARGUMENT;
10573 		}
10574 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10575 			vm_map_unlock_read(dst_map);
10576 			return KERN_PROTECTION_FAILURE;
10577 		}
10578 		/*
10579 		 *	If the entry is in transition, we must wait
10580 		 *	for it to exit that state.  Anything could happen
10581 		 *	when we unlock the map, so start over.
10582 		 */
10583 		if (entry->in_transition) {
10584 			/*
10585 			 * Say that we are waiting, and wait for entry.
10586 			 */
10587 			entry->needs_wakeup = TRUE;
10588 			vm_map_entry_wait(dst_map, THREAD_UNINT);
10589 
10590 			goto RetryLookup;
10591 		}
10592 
10593 		dst_offset = start - entry->vme_start;
10594 
10595 		dst_size = entry->vme_end - start;
10596 
10597 		src_size = copy_entry->vme_end -
10598 		    (copy_entry->vme_start + src_offset);
10599 
10600 		if (dst_size < src_size) {
10601 /*
10602  *			we can only copy dst_size bytes before
10603  *			we have to get the next destination entry
10604  */
10605 			copy_size = dst_size;
10606 		} else {
10607 /*
10608  *			we can only copy src_size bytes before
10609  *			we have to get the next source copy entry
10610  */
10611 			copy_size = src_size;
10612 		}
10613 
10614 		if (copy_size > amount_left) {
10615 			copy_size = amount_left;
10616 		}
10617 /*
10618  *		Entry needs copy, create a shadow shadow object for
10619  *		Copy on write region.
10620  */
10621 		assert(!entry->is_sub_map);
10622 		if (entry->needs_copy) {
10623 			if (vm_map_lock_read_to_write(dst_map)) {
10624 				vm_map_lock_read(dst_map);
10625 				goto RetryLookup;
10626 			}
10627 			VME_OBJECT_SHADOW(entry,
10628 			    (vm_map_size_t)(entry->vme_end
10629 			    - entry->vme_start),
10630 			    vm_map_always_shadow(dst_map));
10631 			entry->needs_copy = FALSE;
10632 			vm_map_lock_write_to_read(dst_map);
10633 		}
10634 		dst_object = VME_OBJECT(entry);
10635 /*
10636  *		unlike with the virtual (aligned) copy we're going
10637  *		to fault on it therefore we need a target object.
10638  */
10639 		if (dst_object == VM_OBJECT_NULL) {
10640 			if (vm_map_lock_read_to_write(dst_map)) {
10641 				vm_map_lock_read(dst_map);
10642 				goto RetryLookup;
10643 			}
10644 			dst_object = vm_object_allocate((vm_map_size_t)
10645 			    entry->vme_end - entry->vme_start,
10646 			    dst_map->serial_id);
10647 			VME_OBJECT_SET(entry, dst_object, false, 0);
10648 			VME_OFFSET_SET(entry, 0);
10649 			assert(entry->use_pmap);
10650 			vm_map_lock_write_to_read(dst_map);
10651 		}
10652 /*
10653  *		Take an object reference and unlock map. The "entry" may
10654  *		disappear or change when the map is unlocked.
10655  */
10656 		vm_object_reference(dst_object);
10657 		version.main_timestamp = dst_map->timestamp;
10658 		entry_offset = VME_OFFSET(entry);
10659 		entry_end = entry->vme_end;
10660 		vm_map_unlock_read(dst_map);
10661 /*
10662  *		Copy as much as possible in one pass
10663  */
10664 		kr = vm_fault_copy(
10665 			VME_OBJECT(copy_entry),
10666 			VME_OFFSET(copy_entry) + src_offset,
10667 			&copy_size,
10668 			dst_object,
10669 			entry_offset + dst_offset,
10670 			dst_map,
10671 			&version,
10672 			THREAD_UNINT );
10673 
10674 		start += copy_size;
10675 		src_offset += copy_size;
10676 		amount_left -= copy_size;
10677 /*
10678  *		Release the object reference
10679  */
10680 		vm_object_deallocate(dst_object);
10681 /*
10682  *		If a hard error occurred, return it now
10683  */
10684 		if (kr != KERN_SUCCESS) {
10685 			return kr;
10686 		}
10687 
10688 		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
10689 		    || amount_left == 0) {
10690 /*
10691  *			all done with this copy entry, dispose.
10692  */
10693 			copy_entry_next = copy_entry->vme_next;
10694 
10695 			if (discard_on_success) {
10696 				vm_map_copy_entry_unlink(copy, copy_entry);
10697 				assert(!copy_entry->is_sub_map);
10698 				vm_object_deallocate(VME_OBJECT(copy_entry));
10699 				vm_map_copy_entry_dispose(copy_entry);
10700 			}
10701 
10702 			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
10703 			    amount_left) {
10704 /*
10705  *				not finished copying but run out of source
10706  */
10707 				return KERN_INVALID_ADDRESS;
10708 			}
10709 
10710 			copy_entry = copy_entry_next;
10711 
10712 			src_offset = 0;
10713 		}
10714 
10715 		if (amount_left == 0) {
10716 			return KERN_SUCCESS;
10717 		}
10718 
10719 		vm_map_lock_read(dst_map);
10720 		if (version.main_timestamp == dst_map->timestamp) {
10721 			if (start == entry_end) {
10722 /*
10723  *				destination region is split.  Use the version
10724  *				information to avoid a lookup in the normal
10725  *				case.
10726  */
10727 				entry = entry->vme_next;
10728 /*
10729  *				should be contiguous. Fail if we encounter
10730  *				a hole in the destination.
10731  */
10732 				if (start != entry->vme_start) {
10733 					vm_map_unlock_read(dst_map);
10734 					return KERN_INVALID_ADDRESS;
10735 				}
10736 			}
10737 		} else {
10738 /*
10739  *			Map version check failed.
10740  *			we must lookup the entry because somebody
10741  *			might have changed the map behind our backs.
10742  */
10743 RetryLookup:
10744 			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
10745 				vm_map_unlock_read(dst_map);
10746 				return KERN_INVALID_ADDRESS;
10747 			}
10748 		}
10749 	}/* while */
10750 
10751 	return KERN_SUCCESS;
10752 }/* vm_map_copy_overwrite_unaligned */
10753 
10754 /*
10755  *	Routine: vm_map_copy_overwrite_aligned	[internal use only]
10756  *
10757  *	Description:
10758  *	Does all the vm_trickery possible for whole pages.
10759  *
10760  *	Implementation:
10761  *
10762  *	If there are no permanent objects in the destination,
10763  *	and the source and destination map entry zones match,
10764  *	and the destination map entry is not shared,
10765  *	then the map entries can be deleted and replaced
10766  *	with those from the copy.  The following code is the
10767  *	basic idea of what to do, but there are lots of annoying
10768  *	little details about getting protection and inheritance
10769  *	right.  Should add protection, inheritance, and sharing checks
10770  *	to the above pass and make sure that no wiring is involved.
10771  *
10772  *	Callers of this function must call vm_map_copy_require on
10773  *	previously created vm_map_copy_t or pass a newly created
10774  *	one to ensure that it hasn't been forged.
10775  */
10776 
10777 int vm_map_copy_overwrite_aligned_src_not_internal = 0;
10778 int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
10779 int vm_map_copy_overwrite_aligned_src_large = 0;
10780 
10781 static kern_return_t
vm_map_copy_overwrite_aligned(vm_map_t dst_map,vm_map_entry_t tmp_entry,vm_map_copy_t copy,vm_map_offset_t start,__unused pmap_t pmap)10782 vm_map_copy_overwrite_aligned(
10783 	vm_map_t        dst_map,
10784 	vm_map_entry_t  tmp_entry,
10785 	vm_map_copy_t   copy,
10786 	vm_map_offset_t start,
10787 	__unused pmap_t pmap)
10788 {
10789 	vm_object_t     object;
10790 	vm_map_entry_t  copy_entry;
10791 	vm_map_size_t   copy_size;
10792 	vm_map_size_t   size;
10793 	vm_map_entry_t  entry;
10794 
10795 	while ((copy_entry = vm_map_copy_first_entry(copy))
10796 	    != vm_map_copy_to_entry(copy)) {
10797 		copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10798 
10799 		entry = tmp_entry;
10800 
10801 		if (entry->is_sub_map) {
10802 			/* unnested when clipped earlier */
10803 			assert(!entry->use_pmap);
10804 		}
10805 		if (entry == vm_map_to_entry(dst_map)) {
10806 			vm_map_unlock(dst_map);
10807 			return KERN_INVALID_ADDRESS;
10808 		}
10809 		size = (entry->vme_end - entry->vme_start);
10810 		/*
10811 		 *	Make sure that no holes popped up in the
10812 		 *	address map, and that the protection is
10813 		 *	still valid, in case the map was unlocked
10814 		 *	earlier.
10815 		 */
10816 
10817 		if ((entry->vme_start != start) || ((entry->is_sub_map)
10818 		    && !entry->needs_copy)) {
10819 			vm_map_unlock(dst_map);
10820 			return KERN_INVALID_ADDRESS;
10821 		}
10822 		assert(entry != vm_map_to_entry(dst_map));
10823 
10824 		/*
10825 		 *	Check protection again
10826 		 */
10827 
10828 		if (!(entry->protection & VM_PROT_WRITE)) {
10829 			vm_map_unlock(dst_map);
10830 			return KERN_PROTECTION_FAILURE;
10831 		}
10832 
10833 		if (entry->is_sub_map) {
10834 			/* not properly implemented */
10835 			vm_map_unlock(dst_map);
10836 			return KERN_PROTECTION_FAILURE;
10837 		}
10838 
10839 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10840 			vm_map_unlock(dst_map);
10841 			return KERN_PROTECTION_FAILURE;
10842 		}
10843 
10844 		/*
10845 		 *	If the entry is in transition, we must wait
10846 		 *	for it to exit that state.  Anything could happen
10847 		 *	when we unlock the map, so start over.
10848 		 */
10849 		if (entry->in_transition) {
10850 			/*
10851 			 * Say that we are waiting, and wait for entry.
10852 			 */
10853 			entry->needs_wakeup = TRUE;
10854 			vm_map_entry_wait(dst_map, THREAD_UNINT);
10855 
10856 			goto RetryLookup;
10857 		}
10858 
10859 		/*
10860 		 *	Adjust to source size first
10861 		 */
10862 
10863 		if (copy_size < size) {
10864 			if (entry->map_aligned &&
10865 			    !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10866 			    VM_MAP_PAGE_MASK(dst_map))) {
10867 				/* no longer map-aligned */
10868 				entry->map_aligned = FALSE;
10869 			}
10870 			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10871 			size = copy_size;
10872 		}
10873 
10874 		/*
10875 		 *	Adjust to destination size
10876 		 */
10877 
10878 		if (size < copy_size) {
10879 			vm_map_copy_clip_end(copy, copy_entry,
10880 			    copy_entry->vme_start + size);
10881 			copy_size = size;
10882 		}
10883 
10884 		assert((entry->vme_end - entry->vme_start) == size);
10885 		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10886 		assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10887 
10888 		/*
10889 		 *	If the destination contains temporary unshared memory,
10890 		 *	we can perform the copy by throwing it away and
10891 		 *	installing the source data.
10892 		 *
10893 		 *	Exceptions for mappings with special semantics:
10894 		 *	+ "permanent" entries,
10895 		 *	+ JIT regions,
10896 		 *	+ TPRO regions,
10897 		 *      + pmap-specific protection policies,
10898 		 *	+ VM objects with COPY_NONE copy strategy.
10899 		 */
10900 
10901 		object = VME_OBJECT(entry);
10902 		if ((!entry->is_shared &&
10903 		    !entry->vme_permanent &&
10904 		    !entry->used_for_jit &&
10905 #if __arm64e__
10906 		    !entry->used_for_tpro &&
10907 #endif /* __arm64e__ */
10908 		    !(entry->protection & VM_PROT_EXECUTE) &&
10909 		    !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
10910 		    ((object == VM_OBJECT_NULL) ||
10911 		    (object->internal &&
10912 		    !object->true_share &&
10913 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
10914 		    entry->needs_copy) {
10915 			vm_object_t     old_object = VME_OBJECT(entry);
10916 			vm_object_offset_t      old_offset = VME_OFFSET(entry);
10917 			vm_object_offset_t      offset;
10918 
10919 			assert(!entry->is_sub_map);
10920 			/*
10921 			 * Ensure that the source and destination aren't
10922 			 * identical
10923 			 */
10924 			if (old_object == VME_OBJECT(copy_entry) &&
10925 			    old_offset == VME_OFFSET(copy_entry)) {
10926 				vm_map_copy_entry_unlink(copy, copy_entry);
10927 				vm_map_copy_entry_dispose(copy_entry);
10928 
10929 				if (old_object != VM_OBJECT_NULL) {
10930 					vm_object_deallocate(old_object);
10931 				}
10932 
10933 				start = tmp_entry->vme_end;
10934 				tmp_entry = tmp_entry->vme_next;
10935 				continue;
10936 			}
10937 
10938 #if XNU_TARGET_OS_OSX
10939 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10940 #define __TRADEOFF1_COPY_SIZE (128 * 1024)      /* 128 KB */
10941 			if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10942 			    VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10943 			    copy_size <= __TRADEOFF1_COPY_SIZE) {
10944 				/*
10945 				 * Virtual vs. Physical copy tradeoff #1.
10946 				 *
10947 				 * Copying only a few pages out of a large
10948 				 * object:  do a physical copy instead of
10949 				 * a virtual copy, to avoid possibly keeping
10950 				 * the entire large object alive because of
10951 				 * those few copy-on-write pages.
10952 				 */
10953 				vm_map_copy_overwrite_aligned_src_large++;
10954 				goto slow_copy;
10955 			}
10956 #endif /* XNU_TARGET_OS_OSX */
10957 
10958 			if ((dst_map->pmap != kernel_pmap) &&
10959 			    (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10960 			    (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10961 				vm_object_t new_object, new_shadow;
10962 
10963 				/*
10964 				 * We're about to map something over a mapping
10965 				 * established by malloc()...
10966 				 */
10967 				new_object = VME_OBJECT(copy_entry);
10968 				if (new_object != VM_OBJECT_NULL) {
10969 					vm_object_lock_shared(new_object);
10970 				}
10971 				while (new_object != VM_OBJECT_NULL &&
10972 #if XNU_TARGET_OS_OSX
10973 				    !new_object->true_share &&
10974 				    new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10975 #endif /* XNU_TARGET_OS_OSX */
10976 				    new_object->internal) {
10977 					new_shadow = new_object->shadow;
10978 					if (new_shadow == VM_OBJECT_NULL) {
10979 						break;
10980 					}
10981 					vm_object_lock_shared(new_shadow);
10982 					vm_object_unlock(new_object);
10983 					new_object = new_shadow;
10984 				}
10985 				if (new_object != VM_OBJECT_NULL) {
10986 					if (!new_object->internal) {
10987 						/*
10988 						 * The new mapping is backed
10989 						 * by an external object.  We
10990 						 * don't want malloc'ed memory
10991 						 * to be replaced with such a
10992 						 * non-anonymous mapping, so
10993 						 * let's go off the optimized
10994 						 * path...
10995 						 */
10996 						vm_map_copy_overwrite_aligned_src_not_internal++;
10997 						vm_object_unlock(new_object);
10998 						goto slow_copy;
10999 					}
11000 #if XNU_TARGET_OS_OSX
11001 					if (new_object->true_share ||
11002 					    new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
11003 						/*
11004 						 * Same if there's a "true_share"
11005 						 * object in the shadow chain, or
11006 						 * an object with a non-default
11007 						 * (SYMMETRIC) copy strategy.
11008 						 */
11009 						vm_map_copy_overwrite_aligned_src_not_symmetric++;
11010 						vm_object_unlock(new_object);
11011 						goto slow_copy;
11012 					}
11013 #endif /* XNU_TARGET_OS_OSX */
11014 					vm_object_unlock(new_object);
11015 				}
11016 				/*
11017 				 * The new mapping is still backed by
11018 				 * anonymous (internal) memory, so it's
11019 				 * OK to substitute it for the original
11020 				 * malloc() mapping.
11021 				 */
11022 			}
11023 
11024 			if (old_object != VM_OBJECT_NULL) {
11025 				assert(!entry->vme_permanent);
11026 				if (entry->is_sub_map) {
11027 					if (entry->use_pmap) {
11028 #ifndef NO_NESTED_PMAP
11029 						pmap_unnest(dst_map->pmap,
11030 						    (addr64_t)entry->vme_start,
11031 						    entry->vme_end - entry->vme_start);
11032 #endif  /* NO_NESTED_PMAP */
11033 						if (dst_map->mapped_in_other_pmaps) {
11034 							/* clean up parent */
11035 							/* map/maps */
11036 							vm_map_submap_pmap_clean(
11037 								dst_map, entry->vme_start,
11038 								entry->vme_end,
11039 								VME_SUBMAP(entry),
11040 								VME_OFFSET(entry));
11041 						}
11042 					} else {
11043 						vm_map_submap_pmap_clean(
11044 							dst_map, entry->vme_start,
11045 							entry->vme_end,
11046 							VME_SUBMAP(entry),
11047 							VME_OFFSET(entry));
11048 					}
11049 					vm_map_deallocate(VME_SUBMAP(entry));
11050 				} else {
11051 					if (dst_map->mapped_in_other_pmaps) {
11052 						vm_object_pmap_protect_options(
11053 							VME_OBJECT(entry),
11054 							VME_OFFSET(entry),
11055 							entry->vme_end
11056 							- entry->vme_start,
11057 							PMAP_NULL,
11058 							PAGE_SIZE,
11059 							entry->vme_start,
11060 							VM_PROT_NONE,
11061 							PMAP_OPTIONS_REMOVE);
11062 					} else {
11063 						pmap_remove_options(
11064 							dst_map->pmap,
11065 							(addr64_t)(entry->vme_start),
11066 							(addr64_t)(entry->vme_end),
11067 							PMAP_OPTIONS_REMOVE);
11068 					}
11069 					vm_object_deallocate(old_object);
11070 				}
11071 			}
11072 
11073 			if (entry->iokit_acct) {
11074 				/* keep using iokit accounting */
11075 				entry->use_pmap = FALSE;
11076 			} else {
11077 				/* use pmap accounting */
11078 				entry->use_pmap = TRUE;
11079 			}
11080 			assert(!entry->vme_permanent);
11081 			VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
11082 			object = VME_OBJECT(entry);
11083 			entry->needs_copy = copy_entry->needs_copy;
11084 			entry->wired_count = 0;
11085 			entry->user_wired_count = 0;
11086 			offset = VME_OFFSET(copy_entry);
11087 			VME_OFFSET_SET(entry, offset);
11088 
11089 			vm_map_copy_entry_unlink(copy, copy_entry);
11090 			vm_map_copy_entry_dispose(copy_entry);
11091 
11092 			/*
11093 			 * we could try to push pages into the pmap at this point, BUT
11094 			 * this optimization only saved on average 2 us per page if ALL
11095 			 * the pages in the source were currently mapped
11096 			 * and ALL the pages in the dest were touched, if there were fewer
11097 			 * than 2/3 of the pages touched, this optimization actually cost more cycles
11098 			 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
11099 			 */
11100 
11101 			/*
11102 			 *	Set up for the next iteration.  The map
11103 			 *	has not been unlocked, so the next
11104 			 *	address should be at the end of this
11105 			 *	entry, and the next map entry should be
11106 			 *	the one following it.
11107 			 */
11108 
11109 			start = tmp_entry->vme_end;
11110 			tmp_entry = tmp_entry->vme_next;
11111 		} else {
11112 			vm_map_version_t        version;
11113 			vm_object_t             dst_object;
11114 			vm_object_offset_t      dst_offset;
11115 			kern_return_t           r;
11116 
11117 slow_copy:
11118 			if (entry->needs_copy) {
11119 				VME_OBJECT_SHADOW(entry,
11120 				    (entry->vme_end -
11121 				    entry->vme_start),
11122 				    vm_map_always_shadow(dst_map));
11123 				entry->needs_copy = FALSE;
11124 			}
11125 
11126 			dst_object = VME_OBJECT(entry);
11127 			dst_offset = VME_OFFSET(entry);
11128 
11129 			/*
11130 			 *	Take an object reference, and record
11131 			 *	the map version information so that the
11132 			 *	map can be safely unlocked.
11133 			 */
11134 
11135 			if (dst_object == VM_OBJECT_NULL) {
11136 				/*
11137 				 * We would usually have just taken the
11138 				 * optimized path above if the destination
11139 				 * object has not been allocated yet.  But we
11140 				 * now disable that optimization if the copy
11141 				 * entry's object is not backed by anonymous
11142 				 * memory to avoid replacing malloc'ed
11143 				 * (i.e. re-usable) anonymous memory with a
11144 				 * not-so-anonymous mapping.
11145 				 * So we have to handle this case here and
11146 				 * allocate a new VM object for this map entry.
11147 				 */
11148 				dst_object = vm_object_allocate(
11149 					entry->vme_end - entry->vme_start,
11150 					dst_map->serial_id
11151 					);
11152 				dst_offset = 0;
11153 				VME_OBJECT_SET(entry, dst_object, false, 0);
11154 				VME_OFFSET_SET(entry, dst_offset);
11155 				assert(entry->use_pmap);
11156 			}
11157 
11158 			vm_object_reference(dst_object);
11159 
11160 			/* account for unlock bumping up timestamp */
11161 			version.main_timestamp = dst_map->timestamp + 1;
11162 
11163 			vm_map_unlock(dst_map);
11164 
11165 			/*
11166 			 *	Copy as much as possible in one pass
11167 			 */
11168 
11169 			copy_size = size;
11170 			r = vm_fault_copy(
11171 				VME_OBJECT(copy_entry),
11172 				VME_OFFSET(copy_entry),
11173 				&copy_size,
11174 				dst_object,
11175 				dst_offset,
11176 				dst_map,
11177 				&version,
11178 				THREAD_UNINT );
11179 
11180 			/*
11181 			 *	Release the object reference
11182 			 */
11183 
11184 			vm_object_deallocate(dst_object);
11185 
11186 			/*
11187 			 *	If a hard error occurred, return it now
11188 			 */
11189 
11190 			if (r != KERN_SUCCESS) {
11191 				return r;
11192 			}
11193 
11194 			if (copy_size != 0) {
11195 				/*
11196 				 *	Dispose of the copied region
11197 				 */
11198 
11199 				vm_map_copy_clip_end(copy, copy_entry,
11200 				    copy_entry->vme_start + copy_size);
11201 				vm_map_copy_entry_unlink(copy, copy_entry);
11202 				vm_object_deallocate(VME_OBJECT(copy_entry));
11203 				vm_map_copy_entry_dispose(copy_entry);
11204 			}
11205 
11206 			/*
11207 			 *	Pick up in the destination map where we left off.
11208 			 *
11209 			 *	Use the version information to avoid a lookup
11210 			 *	in the normal case.
11211 			 */
11212 
11213 			start += copy_size;
11214 			vm_map_lock(dst_map);
11215 			if (version.main_timestamp == dst_map->timestamp &&
11216 			    copy_size != 0) {
11217 				/* We can safely use saved tmp_entry value */
11218 
11219 				if (tmp_entry->map_aligned &&
11220 				    !VM_MAP_PAGE_ALIGNED(
11221 					    start,
11222 					    VM_MAP_PAGE_MASK(dst_map))) {
11223 					/* no longer map-aligned */
11224 					tmp_entry->map_aligned = FALSE;
11225 				}
11226 				vm_map_clip_end(dst_map, tmp_entry, start);
11227 				tmp_entry = tmp_entry->vme_next;
11228 			} else {
11229 				/* Must do lookup of tmp_entry */
11230 
11231 RetryLookup:
11232 				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
11233 					vm_map_unlock(dst_map);
11234 					return KERN_INVALID_ADDRESS;
11235 				}
11236 				if (tmp_entry->map_aligned &&
11237 				    !VM_MAP_PAGE_ALIGNED(
11238 					    start,
11239 					    VM_MAP_PAGE_MASK(dst_map))) {
11240 					/* no longer map-aligned */
11241 					tmp_entry->map_aligned = FALSE;
11242 				}
11243 				vm_map_clip_start(dst_map, tmp_entry, start);
11244 			}
11245 		}
11246 	}/* while */
11247 
11248 	return KERN_SUCCESS;
11249 }/* vm_map_copy_overwrite_aligned */
11250 
11251 /*
11252  *	Routine: vm_map_copyin_kernel_buffer [internal use only]
11253  *
11254  *	Description:
11255  *		Copy in data to a kernel buffer from space in the
11256  *		source map. The original space may be optionally
11257  *		deallocated.
11258  *
11259  *		If successful, returns a new copy object.
11260  */
11261 static kern_return_t
vm_map_copyin_kernel_buffer(vm_map_t src_map,vm_map_offset_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)11262 vm_map_copyin_kernel_buffer(
11263 	vm_map_t        src_map,
11264 	vm_map_offset_t src_addr,
11265 	vm_map_size_t   len,
11266 	boolean_t       src_destroy,
11267 	vm_map_copy_t   *copy_result)
11268 {
11269 	kern_return_t kr;
11270 	vm_map_copy_t copy;
11271 	void *kdata;
11272 
11273 	if (len > msg_ool_size_small) {
11274 		return KERN_INVALID_ARGUMENT;
11275 	}
11276 
11277 	kdata = kalloc_data(len, Z_WAITOK);
11278 	if (kdata == NULL) {
11279 		return KERN_RESOURCE_SHORTAGE;
11280 	}
11281 	kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
11282 	if (kr != KERN_SUCCESS) {
11283 		kfree_data(kdata, len);
11284 		return kr;
11285 	}
11286 
11287 	copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11288 	copy->cpy_kdata = kdata;
11289 	copy->size = len;
11290 	copy->offset = 0;
11291 
11292 	if (src_destroy) {
11293 		vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11294 
11295 		if (src_map == kernel_map) {
11296 			flags |= VM_MAP_REMOVE_KUNWIRE;
11297 		}
11298 
11299 		(void)vm_map_remove_guard(src_map,
11300 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11301 		    vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11302 		    flags, KMEM_GUARD_NONE);
11303 	}
11304 
11305 	*copy_result = copy;
11306 	return KERN_SUCCESS;
11307 }
11308 
11309 /*
11310  *	Routine: vm_map_copyout_kernel_buffer	[internal use only]
11311  *
11312  *	Description:
11313  *		Copy out data from a kernel buffer into space in the
11314  *		destination map. The space may be otpionally dynamically
11315  *		allocated.
11316  *
11317  *		If successful, consumes the copy object.
11318  *		Otherwise, the caller is responsible for it.
11319  *
11320  *		Callers of this function must call vm_map_copy_require on
11321  *		previously created vm_map_copy_t or pass a newly created
11322  *		one to ensure that it hasn't been forged.
11323  */
11324 static int vm_map_copyout_kernel_buffer_failures = 0;
11325 static kern_return_t
vm_map_copyout_kernel_buffer(vm_map_t map,vm_map_address_t * addr,vm_map_copy_t copy,vm_map_size_t copy_size,boolean_t overwrite,boolean_t consume_on_success)11326 vm_map_copyout_kernel_buffer(
11327 	vm_map_t                map,
11328 	vm_map_address_t        *addr,  /* IN/OUT */
11329 	vm_map_copy_t           copy,
11330 	vm_map_size_t           copy_size,
11331 	boolean_t               overwrite,
11332 	boolean_t               consume_on_success)
11333 {
11334 	kern_return_t kr = KERN_SUCCESS;
11335 	thread_t thread = current_thread();
11336 
11337 	assert(copy->size == copy_size);
11338 
11339 	/*
11340 	 * check for corrupted vm_map_copy structure
11341 	 */
11342 	if (copy_size > msg_ool_size_small || copy->offset) {
11343 		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
11344 		    (long long)copy->size, (long long)copy->offset);
11345 	}
11346 
11347 	if (!overwrite) {
11348 		/*
11349 		 * Allocate space in the target map for the data
11350 		 */
11351 		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11352 
11353 		if (map == kernel_map) {
11354 			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
11355 		}
11356 
11357 		*addr = 0;
11358 		kr = vm_map_enter(map,
11359 		    addr,
11360 		    vm_map_round_page(copy_size,
11361 		    VM_MAP_PAGE_MASK(map)),
11362 		    (vm_map_offset_t) 0,
11363 		    vmk_flags,
11364 		    VM_OBJECT_NULL,
11365 		    (vm_object_offset_t) 0,
11366 		    FALSE,
11367 		    VM_PROT_DEFAULT,
11368 		    VM_PROT_ALL,
11369 		    VM_INHERIT_DEFAULT);
11370 		if (kr != KERN_SUCCESS) {
11371 			return kr;
11372 		}
11373 #if KASAN
11374 		if (map->pmap == kernel_pmap) {
11375 			kasan_notify_address(*addr, copy->size);
11376 		}
11377 #endif
11378 	}
11379 
11380 	/*
11381 	 * Copyout the data from the kernel buffer to the target map.
11382 	 */
11383 	if (thread->map == map) {
11384 		/*
11385 		 * If the target map is the current map, just do
11386 		 * the copy.
11387 		 */
11388 		assert((vm_size_t)copy_size == copy_size);
11389 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11390 			kr = KERN_INVALID_ADDRESS;
11391 		}
11392 	} else {
11393 		vm_map_switch_context_t switch_ctx;
11394 
11395 		/*
11396 		 * If the target map is another map, assume the
11397 		 * target's address space identity for the duration
11398 		 * of the copy.
11399 		 */
11400 		vm_map_reference(map);
11401 		switch_ctx = vm_map_switch_to(map);
11402 
11403 		assert((vm_size_t)copy_size == copy_size);
11404 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11405 			vm_map_copyout_kernel_buffer_failures++;
11406 			kr = KERN_INVALID_ADDRESS;
11407 		}
11408 
11409 		vm_map_switch_back(switch_ctx);
11410 		vm_map_deallocate(map);
11411 	}
11412 
11413 	if (kr != KERN_SUCCESS) {
11414 		/* the copy failed, clean up */
11415 		if (!overwrite) {
11416 			/*
11417 			 * Deallocate the space we allocated in the target map.
11418 			 */
11419 			(void) vm_map_remove(map,
11420 			    vm_map_trunc_page(*addr,
11421 			    VM_MAP_PAGE_MASK(map)),
11422 			    vm_map_round_page((*addr +
11423 			    vm_map_round_page(copy_size,
11424 			    VM_MAP_PAGE_MASK(map))),
11425 			    VM_MAP_PAGE_MASK(map)));
11426 			*addr = 0;
11427 		}
11428 	} else {
11429 		/* copy was successful, dicard the copy structure */
11430 		if (consume_on_success) {
11431 			kfree_data(copy->cpy_kdata, copy_size);
11432 			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11433 		}
11434 	}
11435 
11436 	return kr;
11437 }
11438 
11439 /*
11440  *	Routine:	vm_map_copy_insert      [internal use only]
11441  *
11442  *	Description:
11443  *		Link a copy chain ("copy") into a map at the
11444  *		specified location (after "where").
11445  *
11446  *		Callers of this function must call vm_map_copy_require on
11447  *		previously created vm_map_copy_t or pass a newly created
11448  *		one to ensure that it hasn't been forged.
11449  *	Side effects:
11450  *		The copy chain is destroyed.
11451  */
11452 static void
vm_map_copy_insert(vm_map_t map,vm_map_entry_t after_where,vm_map_copy_t copy)11453 vm_map_copy_insert(
11454 	vm_map_t        map,
11455 	vm_map_entry_t  after_where,
11456 	vm_map_copy_t   copy)
11457 {
11458 	vm_map_entry_t  entry;
11459 
11460 	while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11461 		entry = vm_map_copy_first_entry(copy);
11462 		vm_map_copy_entry_unlink(copy, entry);
11463 		vm_map_store_entry_link(map, after_where, entry,
11464 		    VM_MAP_KERNEL_FLAGS_NONE);
11465 		after_where = entry;
11466 	}
11467 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11468 }
11469 
11470 /*
11471  * Callers of this function must call vm_map_copy_require on
11472  * previously created vm_map_copy_t or pass a newly created
11473  * one to ensure that it hasn't been forged.
11474  */
11475 void
vm_map_copy_remap(vm_map_t map,vm_map_entry_t where,vm_map_copy_t copy,vm_map_offset_t adjustment,vm_prot_t cur_prot,vm_prot_t max_prot,vm_inherit_t inheritance)11476 vm_map_copy_remap(
11477 	vm_map_t        map,
11478 	vm_map_entry_t  where,
11479 	vm_map_copy_t   copy,
11480 	vm_map_offset_t adjustment,
11481 	vm_prot_t       cur_prot,
11482 	vm_prot_t       max_prot,
11483 	vm_inherit_t    inheritance)
11484 {
11485 	vm_map_entry_t  copy_entry, new_entry;
11486 
11487 	for (copy_entry = vm_map_copy_first_entry(copy);
11488 	    copy_entry != vm_map_copy_to_entry(copy);
11489 	    copy_entry = copy_entry->vme_next) {
11490 		/* get a new VM map entry for the map */
11491 		new_entry = vm_map_entry_create(map);
11492 		/* copy the "copy entry" to the new entry */
11493 		vm_map_entry_copy(map, new_entry, copy_entry);
11494 		/* adjust "start" and "end" */
11495 		new_entry->vme_start += adjustment;
11496 		new_entry->vme_end += adjustment;
11497 		/* clear some attributes */
11498 		new_entry->inheritance = inheritance;
11499 		new_entry->protection = cur_prot;
11500 		new_entry->max_protection = max_prot;
11501 		new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11502 		/* take an extra reference on the entry's "object" */
11503 		if (new_entry->is_sub_map) {
11504 			assert(!new_entry->use_pmap); /* not nested */
11505 			vm_map_reference(VME_SUBMAP(new_entry));
11506 		} else {
11507 			vm_object_reference(VME_OBJECT(new_entry));
11508 		}
11509 		/* insert the new entry in the map */
11510 		vm_map_store_entry_link(map, where, new_entry,
11511 		    VM_MAP_KERNEL_FLAGS_NONE);
11512 		/* continue inserting the "copy entries" after the new entry */
11513 		where = new_entry;
11514 	}
11515 }
11516 
11517 
11518 /*
11519  * Returns true if *size matches (or is in the range of) copy->size.
11520  * Upon returning true, the *size field is updated with the actual size of the
11521  * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11522  */
11523 boolean_t
vm_map_copy_validate_size(vm_map_t dst_map,vm_map_copy_t copy,vm_map_size_t * size)11524 vm_map_copy_validate_size(
11525 	vm_map_t                dst_map,
11526 	vm_map_copy_t           copy,
11527 	vm_map_size_t           *size)
11528 {
11529 	if (copy == VM_MAP_COPY_NULL) {
11530 		return FALSE;
11531 	}
11532 
11533 	/*
11534 	 * Assert that the vm_map_copy is coming from the right
11535 	 * zone and hasn't been forged
11536 	 */
11537 	vm_map_copy_require(copy);
11538 
11539 	vm_map_size_t copy_sz = copy->size;
11540 	vm_map_size_t sz = *size;
11541 	switch (copy->type) {
11542 	case VM_MAP_COPY_KERNEL_BUFFER:
11543 		if (sz == copy_sz) {
11544 			return TRUE;
11545 		}
11546 		break;
11547 	case VM_MAP_COPY_ENTRY_LIST:
11548 		/*
11549 		 * potential page-size rounding prevents us from exactly
11550 		 * validating this flavor of vm_map_copy, but we can at least
11551 		 * assert that it's within a range.
11552 		 */
11553 		if (copy_sz >= sz &&
11554 		    copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11555 			*size = copy_sz;
11556 			return TRUE;
11557 		}
11558 		break;
11559 	default:
11560 		break;
11561 	}
11562 	return FALSE;
11563 }
11564 
11565 static kern_return_t
vm_map_copyout_internal(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_ut copy_size_u,boolean_t consume_on_success,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)11566 vm_map_copyout_internal(
11567 	vm_map_t                dst_map,
11568 	vm_map_address_t       *dst_addr,      /* OUT */
11569 	vm_map_copy_t           copy,
11570 	vm_map_size_ut          copy_size_u,
11571 	boolean_t               consume_on_success,
11572 	vm_prot_t               cur_protection,
11573 	vm_prot_t               max_protection,
11574 	vm_inherit_t            inheritance)
11575 {
11576 	vm_map_size_t           size, copy_size;
11577 	vm_map_size_t           adjustment;
11578 	vm_map_offset_t         start;
11579 	vm_object_offset_t      vm_copy_start;
11580 	vm_map_entry_t          last;
11581 	vm_map_entry_t          entry;
11582 	vm_map_copy_t           original_copy;
11583 	kern_return_t           kr;
11584 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11585 
11586 	/*
11587 	 *	Check for null copy object.
11588 	 */
11589 
11590 	if (copy == VM_MAP_COPY_NULL) {
11591 		*dst_addr = 0;
11592 		return KERN_SUCCESS;
11593 	}
11594 
11595 	/*
11596 	 * Assert that the vm_map_copy is coming from the right
11597 	 * zone and hasn't been forged
11598 	 */
11599 	vm_map_copy_require(copy);
11600 
11601 	if (!VM_SANITIZE_UNSAFE_IS_EQUAL(copy_size_u, copy->size)) {
11602 		*dst_addr = 0;
11603 		ktriage_record(thread_tid(current_thread()),
11604 		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11605 		    KDBG_TRIAGE_RESERVED,
11606 		    KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR),
11607 		    KERN_FAILURE /* arg */);
11608 		return KERN_FAILURE;
11609 	}
11610 	copy_size = copy->size;
11611 
11612 	/*
11613 	 *	Check for special kernel buffer allocated
11614 	 *	by new_ipc_kmsg_copyin.
11615 	 */
11616 
11617 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11618 		kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11619 		    copy, copy_size, FALSE,
11620 		    consume_on_success);
11621 		if (kr) {
11622 			ktriage_record(thread_tid(current_thread()),
11623 			    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11624 			    KDBG_TRIAGE_RESERVED,
11625 			    KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
11626 		}
11627 		return kr;
11628 	}
11629 
11630 
11631 	original_copy = copy;
11632 	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11633 		vm_map_copy_t target_copy;
11634 		vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11635 
11636 		target_copy = VM_MAP_COPY_NULL;
11637 		DEBUG4K_ADJUST("adjusting...\n");
11638 		kr = vm_map_copy_adjust_to_target(
11639 			copy,
11640 			0, /* offset */
11641 			copy->size, /* size */
11642 			dst_map,
11643 			TRUE, /* copy */
11644 			&target_copy,
11645 			&overmap_start,
11646 			&overmap_end,
11647 			&trimmed_start);
11648 		if (kr != KERN_SUCCESS) {
11649 			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11650 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
11651 			return kr;
11652 		}
11653 		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11654 		if (target_copy != copy) {
11655 			copy = target_copy;
11656 		}
11657 		copy_size = copy->size;
11658 	}
11659 
11660 	/*
11661 	 *	Find space for the data
11662 	 */
11663 
11664 	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11665 	    VM_MAP_COPY_PAGE_MASK(copy));
11666 	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11667 	    VM_MAP_COPY_PAGE_MASK(copy))
11668 	    - vm_copy_start;
11669 
11670 	vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map, size);
11671 
11672 	vm_map_lock(dst_map);
11673 	kr = vm_map_locate_space_anywhere(dst_map, size, 0, vmk_flags,
11674 	    &start, &last);
11675 	if (kr != KERN_SUCCESS) {
11676 		vm_map_unlock(dst_map);
11677 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
11678 		return kr;
11679 	}
11680 
11681 	adjustment = start - vm_copy_start;
11682 	if (!consume_on_success) {
11683 		/*
11684 		 * We're not allowed to consume "copy", so we'll have to
11685 		 * copy its map entries into the destination map below.
11686 		 * No need to re-allocate map entries from the correct
11687 		 * (pageable or not) zone, since we'll get new map entries
11688 		 * during the transfer.
11689 		 * We'll also adjust the map entries's "start" and "end"
11690 		 * during the transfer, to keep "copy"'s entries consistent
11691 		 * with its "offset".
11692 		 */
11693 		goto after_adjustments;
11694 	}
11695 
11696 	/*
11697 	 *	Since we're going to just drop the map
11698 	 *	entries from the copy into the destination
11699 	 *	map, they must come from the same pool.
11700 	 */
11701 
11702 	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11703 		/*
11704 		 * Mismatches occur when dealing with the default
11705 		 * pager.
11706 		 */
11707 		vm_map_entry_t  next, new;
11708 
11709 		/*
11710 		 * Find the zone that the copies were allocated from
11711 		 */
11712 
11713 		entry = vm_map_copy_first_entry(copy);
11714 
11715 		/*
11716 		 * Reinitialize the copy so that vm_map_copy_entry_link
11717 		 * will work.
11718 		 */
11719 		vm_map_store_copy_reset(copy, entry);
11720 		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11721 
11722 		/*
11723 		 * Copy each entry.
11724 		 */
11725 		while (entry != vm_map_copy_to_entry(copy)) {
11726 			new = vm_map_copy_entry_create(copy);
11727 			vm_map_entry_copy_full(new, entry);
11728 			new->vme_no_copy_on_read = FALSE;
11729 			assert(!new->iokit_acct);
11730 			if (new->is_sub_map) {
11731 				/* clr address space specifics */
11732 				new->use_pmap = FALSE;
11733 			}
11734 			vm_map_copy_entry_link(copy,
11735 			    vm_map_copy_last_entry(copy),
11736 			    new);
11737 			next = entry->vme_next;
11738 			vm_map_entry_dispose(entry);
11739 			entry = next;
11740 		}
11741 	}
11742 
11743 	/*
11744 	 *	Adjust the addresses in the copy chain, and
11745 	 *	reset the region attributes.
11746 	 */
11747 
11748 	for (entry = vm_map_copy_first_entry(copy);
11749 	    entry != vm_map_copy_to_entry(copy);
11750 	    entry = entry->vme_next) {
11751 		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11752 			/*
11753 			 * We're injecting this copy entry into a map that
11754 			 * has the standard page alignment, so clear
11755 			 * "map_aligned" (which might have been inherited
11756 			 * from the original map entry).
11757 			 */
11758 			entry->map_aligned = FALSE;
11759 		}
11760 
11761 		entry->vme_start += adjustment;
11762 		entry->vme_end += adjustment;
11763 
11764 		if (entry->map_aligned) {
11765 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11766 			    VM_MAP_PAGE_MASK(dst_map)));
11767 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11768 			    VM_MAP_PAGE_MASK(dst_map)));
11769 		}
11770 
11771 		entry->inheritance = VM_INHERIT_DEFAULT;
11772 		entry->protection = VM_PROT_DEFAULT;
11773 		entry->max_protection = VM_PROT_ALL;
11774 		entry->behavior = VM_BEHAVIOR_DEFAULT;
11775 
11776 		/*
11777 		 * If the entry is now wired,
11778 		 * map the pages into the destination map.
11779 		 */
11780 		if (entry->wired_count != 0) {
11781 			vm_map_offset_t va;
11782 			vm_object_offset_t       offset;
11783 			vm_object_t object;
11784 			vm_prot_t prot;
11785 			int     type_of_fault;
11786 			uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
11787 
11788 			/* TODO4K would need to use actual page size */
11789 			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11790 
11791 			object = VME_OBJECT(entry);
11792 			offset = VME_OFFSET(entry);
11793 			va = entry->vme_start;
11794 
11795 			pmap_pageable(dst_map->pmap,
11796 			    entry->vme_start,
11797 			    entry->vme_end,
11798 			    TRUE);
11799 
11800 			while (va < entry->vme_end) {
11801 				vm_page_t       m;
11802 				struct vm_object_fault_info fault_info = {
11803 					.interruptible = THREAD_UNINT,
11804 				};
11805 
11806 				/*
11807 				 * Look up the page in the object.
11808 				 * Assert that the page will be found in the
11809 				 * top object:
11810 				 * either
11811 				 *	the object was newly created by
11812 				 *	vm_object_copy_slowly, and has
11813 				 *	copies of all of the pages from
11814 				 *	the source object
11815 				 * or
11816 				 *	the object was moved from the old
11817 				 *	map entry; because the old map
11818 				 *	entry was wired, all of the pages
11819 				 *	were in the top-level object.
11820 				 *	(XXX not true if we wire pages for
11821 				 *	 reading)
11822 				 */
11823 				vm_object_lock(object);
11824 
11825 				m = vm_page_lookup(object, offset);
11826 				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11827 				    m->vmp_absent) {
11828 					panic("vm_map_copyout: wiring %p", m);
11829 				}
11830 
11831 				prot = entry->protection;
11832 
11833 				if (override_nx(dst_map, VME_ALIAS(entry)) &&
11834 				    prot) {
11835 					prot |= VM_PROT_EXECUTE;
11836 				}
11837 
11838 				type_of_fault = DBG_CACHE_HIT_FAULT;
11839 
11840 				fault_info.user_tag = VME_ALIAS(entry);
11841 				fault_info.pmap_options = 0;
11842 				if (entry->iokit_acct ||
11843 				    (!entry->is_sub_map && !entry->use_pmap)) {
11844 					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11845 				}
11846 				if (entry->vme_xnu_user_debug &&
11847 				    !VM_PAGE_OBJECT(m)->code_signed) {
11848 					/*
11849 					 * Modified code-signed executable
11850 					 * region: this page does not belong
11851 					 * to a code-signed VM object, so it
11852 					 * must have been copied and should
11853 					 * therefore be typed XNU_USER_DEBUG
11854 					 * rather than XNU_USER_EXEC.
11855 					 */
11856 					fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
11857 				}
11858 
11859 				vm_fault_enter(m,
11860 				    dst_map->pmap,
11861 				    va,
11862 				    PAGE_SIZE, 0,
11863 				    prot,
11864 				    prot,
11865 				    VM_PAGE_WIRED(m),
11866 				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
11867 				    &fault_info,
11868 				    NULL,             /* need_retry */
11869 				    &type_of_fault,
11870 				    &object_lock_type); /*Exclusive mode lock. Will remain unchanged.*/
11871 
11872 				vm_object_unlock(object);
11873 
11874 				offset += PAGE_SIZE_64;
11875 				va += PAGE_SIZE;
11876 			}
11877 		}
11878 	}
11879 
11880 after_adjustments:
11881 
11882 	/*
11883 	 *	Correct the page alignment for the result
11884 	 */
11885 
11886 	*dst_addr = start + (copy->offset - vm_copy_start);
11887 
11888 #if KASAN
11889 	kasan_notify_address(*dst_addr, size);
11890 #endif
11891 
11892 	/*
11893 	 *	Update the hints and the map size
11894 	 */
11895 
11896 	if (consume_on_success) {
11897 		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11898 	} else {
11899 		SAVE_HINT_MAP_WRITE(dst_map, last);
11900 	}
11901 
11902 	dst_map->size += size;
11903 
11904 	/*
11905 	 *	Link in the copy
11906 	 */
11907 
11908 	if (consume_on_success) {
11909 		vm_map_copy_insert(dst_map, last, copy);
11910 		if (copy != original_copy) {
11911 			vm_map_copy_discard(original_copy);
11912 			original_copy = VM_MAP_COPY_NULL;
11913 		}
11914 	} else {
11915 		vm_map_copy_remap(dst_map, last, copy, adjustment,
11916 		    cur_protection, max_protection,
11917 		    inheritance);
11918 		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11919 			vm_map_copy_discard(copy);
11920 			copy = original_copy;
11921 		}
11922 	}
11923 
11924 
11925 	vm_map_unlock(dst_map);
11926 
11927 	/*
11928 	 * XXX	If wiring_required, call vm_map_pageable
11929 	 */
11930 
11931 	return KERN_SUCCESS;
11932 }
11933 
11934 /*
11935  *	Routine:	vm_map_copyout_size
11936  *
11937  *	Description:
11938  *		Copy out a copy chain ("copy") into newly-allocated
11939  *		space in the destination map. Uses a prevalidated
11940  *		size for the copy object (vm_map_copy_validate_size).
11941  *
11942  *		If successful, consumes the copy object.
11943  *		Otherwise, the caller is responsible for it.
11944  */
11945 kern_return_t
vm_map_copyout_size(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_ut copy_size)11946 vm_map_copyout_size(
11947 	vm_map_t                dst_map,
11948 	vm_map_address_t       *dst_addr,      /* OUT */
11949 	vm_map_copy_t           copy,
11950 	vm_map_size_ut          copy_size)
11951 {
11952 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11953 	           TRUE,                     /* consume_on_success */
11954 	           VM_PROT_DEFAULT,
11955 	           VM_PROT_ALL,
11956 	           VM_INHERIT_DEFAULT);
11957 }
11958 
11959 /*
11960  *	Routine:	vm_map_copyout
11961  *
11962  *	Description:
11963  *		Copy out a copy chain ("copy") into newly-allocated
11964  *		space in the destination map.
11965  *
11966  *		If successful, consumes the copy object.
11967  *		Otherwise, the caller is responsible for it.
11968  */
11969 kern_return_t
vm_map_copyout(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy)11970 vm_map_copyout(
11971 	vm_map_t                dst_map,
11972 	vm_map_address_t       *dst_addr,      /* OUT */
11973 	vm_map_copy_t           copy)
11974 {
11975 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11976 	           TRUE,                     /* consume_on_success */
11977 	           VM_PROT_DEFAULT,
11978 	           VM_PROT_ALL,
11979 	           VM_INHERIT_DEFAULT);
11980 }
11981 
11982 /*
11983  *	Routine:	vm_map_copyin
11984  *
11985  *	Description:
11986  *		see vm_map_copyin_common.  Exported via Unsupported.exports.
11987  *
11988  */
11989 kern_return_t
vm_map_copyin(vm_map_t src_map,vm_map_address_ut src_addr,vm_map_size_ut len,boolean_t src_destroy,vm_map_copy_t * copy_result)11990 vm_map_copyin(
11991 	vm_map_t                src_map,
11992 	vm_map_address_ut       src_addr,
11993 	vm_map_size_ut          len,
11994 	boolean_t               src_destroy,
11995 	vm_map_copy_t          *copy_result)   /* OUT */
11996 {
11997 	return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11998 	           FALSE, copy_result, FALSE);
11999 }
12000 
12001 /*
12002  *	Routine:	vm_map_copyin_common
12003  *
12004  *	Description:
12005  *		Copy the specified region (src_addr, len) from the
12006  *		source address space (src_map), possibly removing
12007  *		the region from the source address space (src_destroy).
12008  *
12009  *	Returns:
12010  *		A vm_map_copy_t object (copy_result), suitable for
12011  *		insertion into another address space (using vm_map_copyout),
12012  *		copying over another address space region (using
12013  *		vm_map_copy_overwrite).  If the copy is unused, it
12014  *		should be destroyed (using vm_map_copy_discard).
12015  *
12016  *	In/out conditions:
12017  *		The source map should not be locked on entry.
12018  */
12019 
12020 typedef struct submap_map {
12021 	vm_map_t        parent_map;
12022 	vm_map_offset_t base_start;
12023 	vm_map_offset_t base_end;
12024 	vm_map_size_t   base_len;
12025 	struct submap_map *next;
12026 } submap_map_t;
12027 
12028 kern_return_t
vm_map_copyin_common(vm_map_t src_map,vm_map_address_ut src_addr,vm_map_size_ut len,boolean_t src_destroy,__unused boolean_t src_volatile,vm_map_copy_t * copy_result,boolean_t use_maxprot)12029 vm_map_copyin_common(
12030 	vm_map_t                src_map,
12031 	vm_map_address_ut       src_addr,
12032 	vm_map_size_ut          len,
12033 	boolean_t               src_destroy,
12034 	__unused boolean_t      src_volatile,
12035 	vm_map_copy_t          *copy_result,   /* OUT */
12036 	boolean_t               use_maxprot)
12037 {
12038 	int flags;
12039 
12040 	flags = 0;
12041 	if (src_destroy) {
12042 		flags |= VM_MAP_COPYIN_SRC_DESTROY;
12043 	}
12044 	if (use_maxprot) {
12045 		flags |= VM_MAP_COPYIN_USE_MAXPROT;
12046 	}
12047 	return vm_map_copyin_internal(src_map,
12048 	           src_addr,
12049 	           len,
12050 	           flags,
12051 	           copy_result);
12052 }
12053 
12054 static __attribute__((always_inline, warn_unused_result))
12055 kern_return_t
vm_map_copyin_sanitize(vm_map_t src_map,vm_map_address_ut src_addr_u,vm_map_size_ut len_u,vm_map_offset_t * src_start,vm_map_offset_t * src_end,vm_map_size_t * len,vm_map_offset_t * src_addr_unaligned)12056 vm_map_copyin_sanitize(
12057 	vm_map_t                src_map,
12058 	vm_map_address_ut       src_addr_u,
12059 	vm_map_size_ut          len_u,
12060 	vm_map_offset_t        *src_start,
12061 	vm_map_offset_t        *src_end,
12062 	vm_map_size_t          *len,
12063 	vm_map_offset_t        *src_addr_unaligned)
12064 {
12065 	kern_return_t   kr;
12066 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS |
12067 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
12068 	    VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
12069 
12070 #if KASAN_TBI
12071 	if (vm_kernel_map_is_kernel(src_map)) {
12072 		flags |= VM_SANITIZE_FLAGS_CANONICALIZE;
12073 	}
12074 #endif /* KASAN_TBI */
12075 
12076 	kr = vm_sanitize_addr_size(src_addr_u, len_u,
12077 	    VM_SANITIZE_CALLER_VM_MAP_COPYIN,
12078 	    src_map,
12079 	    flags,
12080 	    src_start, src_end, len);
12081 	if (__improbable(kr != KERN_SUCCESS)) {
12082 		return kr;
12083 	}
12084 
12085 	/*
12086 	 *	Compute (page aligned) start and end of region
12087 	 */
12088 	*src_addr_unaligned  = *src_start; /* remember unaligned value */
12089 	*src_start = vm_map_trunc_page(*src_addr_unaligned,
12090 	    VM_MAP_PAGE_MASK(src_map));
12091 	*src_end   = vm_map_round_page(*src_end, VM_MAP_PAGE_MASK(src_map));
12092 
12093 
12094 	return KERN_SUCCESS;
12095 }
12096 
12097 kern_return_t
vm_map_copyin_internal(vm_map_t src_map,vm_map_address_ut src_addr_u,vm_map_size_ut len_u,int flags,vm_map_copy_t * copy_result)12098 vm_map_copyin_internal(
12099 	vm_map_t                src_map,
12100 	vm_map_address_ut       src_addr_u,
12101 	vm_map_size_ut          len_u,
12102 	int                     flags,
12103 	vm_map_copy_t          *copy_result)   /* OUT */
12104 {
12105 	vm_map_entry_t  tmp_entry;      /* Result of last map lookup --
12106 	                                 * in multi-level lookup, this
12107 	                                 * entry contains the actual
12108 	                                 * vm_object/offset.
12109 	                                 */
12110 	vm_map_entry_t  new_entry = VM_MAP_ENTRY_NULL;  /* Map entry for copy */
12111 
12112 	vm_map_offset_t src_start;      /* Start of current entry --
12113 	                                 * where copy is taking place now
12114 	                                 */
12115 	vm_map_offset_t src_end;        /* End of entire region to be
12116 	                                 * copied */
12117 	vm_map_offset_t src_addr_unaligned;
12118 	vm_map_offset_t src_base;
12119 	vm_map_size_t   len;
12120 	vm_map_t        base_map = src_map;
12121 	boolean_t       map_share = FALSE;
12122 	submap_map_t    *parent_maps = NULL;
12123 
12124 	vm_map_copy_t   copy;           /* Resulting copy */
12125 	vm_map_address_t copy_addr;
12126 	vm_map_size_t   copy_size;
12127 	boolean_t       src_destroy;
12128 	boolean_t       use_maxprot;
12129 	boolean_t       preserve_purgeable;
12130 	boolean_t       entry_was_shared;
12131 	vm_map_entry_t  saved_src_entry;
12132 	kern_return_t   kr;
12133 
12134 	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
12135 		return KERN_INVALID_ARGUMENT;
12136 	}
12137 
12138 	/*
12139 	 *	Check for copies of zero bytes.
12140 	 */
12141 	if (VM_SANITIZE_UNSAFE_IS_ZERO(len_u)) {
12142 		*copy_result = VM_MAP_COPY_NULL;
12143 		return KERN_SUCCESS;
12144 	}
12145 
12146 	/*
12147 	 * Sanitize any input parameters that are addr/size/prot/inherit
12148 	 */
12149 	kr = vm_map_copyin_sanitize(
12150 		src_map,
12151 		src_addr_u,
12152 		len_u,
12153 		&src_start,
12154 		&src_end,
12155 		&len,
12156 		&src_addr_unaligned);
12157 	if (__improbable(kr != KERN_SUCCESS)) {
12158 		return vm_sanitize_get_kr(kr);
12159 	}
12160 
12161 	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
12162 	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
12163 	preserve_purgeable =
12164 	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
12165 
12166 	/*
12167 	 * If the copy is sufficiently small, use a kernel buffer instead
12168 	 * of making a virtual copy.  The theory being that the cost of
12169 	 * setting up VM (and taking C-O-W faults) dominates the copy costs
12170 	 * for small regions.
12171 	 */
12172 	if ((len <= msg_ool_size_small) &&
12173 	    !use_maxprot &&
12174 	    !preserve_purgeable &&
12175 	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
12176 	    /*
12177 	     * Since the "msg_ool_size_small" threshold was increased and
12178 	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
12179 	     * address space limits, we revert to doing a virtual copy if the
12180 	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
12181 	     * of the commpage would now fail when it used to work.
12182 	     */
12183 	    (src_start >= vm_map_min(src_map) &&
12184 	    src_start < vm_map_max(src_map) &&
12185 	    src_end >= vm_map_min(src_map) &&
12186 	    src_end < vm_map_max(src_map))) {
12187 		return vm_map_copyin_kernel_buffer(src_map, src_addr_unaligned, len,
12188 		           src_destroy, copy_result);
12189 	}
12190 
12191 	/*
12192 	 *	Allocate a header element for the list.
12193 	 *
12194 	 *	Use the start and end in the header to
12195 	 *	remember the endpoints prior to rounding.
12196 	 */
12197 
12198 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12199 	copy->cpy_hdr.entries_pageable = TRUE;
12200 	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
12201 	copy->offset = src_addr_unaligned;
12202 	copy->size = len;
12203 
12204 	new_entry = vm_map_copy_entry_create(copy);
12205 
12206 #define RETURN(x)                                               \
12207 	MACRO_BEGIN                                             \
12208 	vm_map_unlock(src_map);                                 \
12209 	if(src_map != base_map)                                 \
12210 	        vm_map_deallocate(src_map);                     \
12211 	if (new_entry != VM_MAP_ENTRY_NULL)                     \
12212 	        vm_map_copy_entry_dispose(new_entry);           \
12213 	vm_map_copy_discard(copy);                              \
12214 	{                                                       \
12215 	        submap_map_t	*_ptr;                          \
12216                                                                 \
12217 	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
12218 	                parent_maps=parent_maps->next;          \
12219 	                if (_ptr->parent_map != base_map)       \
12220 	                        vm_map_deallocate(_ptr->parent_map);    \
12221 	                kfree_type(submap_map_t, _ptr);         \
12222 	        }                                               \
12223 	}                                                       \
12224 	MACRO_RETURN(x);                                        \
12225 	MACRO_END
12226 
12227 	/*
12228 	 *	Find the beginning of the region.
12229 	 */
12230 
12231 	vm_map_lock(src_map);
12232 
12233 	/*
12234 	 * Lookup the original "src_addr_unaligned" rather than the truncated
12235 	 * "src_start", in case "src_start" falls in a non-map-aligned
12236 	 * map entry *before* the map entry that contains "src_addr_unaligned"...
12237 	 */
12238 	if (!vm_map_lookup_entry(src_map, src_addr_unaligned, &tmp_entry)) {
12239 		RETURN(KERN_INVALID_ADDRESS);
12240 	}
12241 	if (!tmp_entry->is_sub_map) {
12242 		/*
12243 		 * ... but clip to the map-rounded "src_start" rather than
12244 		 * "src_addr_unaligned" to preserve map-alignment.  We'll adjust the
12245 		 * first copy entry at the end, if needed.
12246 		 */
12247 		vm_map_clip_start(src_map, tmp_entry, src_start);
12248 	}
12249 	if (src_start < tmp_entry->vme_start) {
12250 		/*
12251 		 * Move "src_start" up to the start of the
12252 		 * first map entry to copy.
12253 		 */
12254 		src_start = tmp_entry->vme_start;
12255 	}
12256 	/* set for later submap fix-up */
12257 	copy_addr = src_start;
12258 
12259 	/*
12260 	 *	Go through entries until we get to the end.
12261 	 */
12262 
12263 	while (TRUE) {
12264 		vm_map_entry_t  src_entry = tmp_entry;  /* Top-level entry */
12265 		vm_map_size_t   src_size;               /* Size of source
12266 		                                         * map entry (in both
12267 		                                         * maps)
12268 		                                         */
12269 
12270 		vm_object_t             src_object;     /* Object to copy */
12271 		vm_object_offset_t      src_offset;
12272 
12273 		vm_object_t             new_copy_object;/* vm_object_copy_* result */
12274 
12275 		boolean_t       src_needs_copy;         /* Should source map
12276 		                                         * be made read-only
12277 		                                         * for copy-on-write?
12278 		                                         */
12279 
12280 		boolean_t       new_entry_needs_copy;   /* Will new entry be COW? */
12281 
12282 		boolean_t       was_wired;              /* Was source wired? */
12283 		boolean_t       saved_used_for_jit;     /* Saved used_for_jit. */
12284 		vm_map_version_t version;               /* Version before locks
12285 		                                         * dropped to make copy
12286 		                                         */
12287 		kern_return_t   result;                 /* Return value from
12288 		                                         * copy_strategically.
12289 		                                         */
12290 		while (tmp_entry->is_sub_map) {
12291 			vm_map_size_t submap_len;
12292 			submap_map_t *ptr;
12293 
12294 			ptr = kalloc_type(submap_map_t, Z_WAITOK);
12295 			ptr->next = parent_maps;
12296 			parent_maps = ptr;
12297 			ptr->parent_map = src_map;
12298 			ptr->base_start = src_start;
12299 			ptr->base_end = src_end;
12300 			submap_len = tmp_entry->vme_end - src_start;
12301 			if (submap_len > (src_end - src_start)) {
12302 				submap_len = src_end - src_start;
12303 			}
12304 			ptr->base_len = submap_len;
12305 
12306 			src_start -= tmp_entry->vme_start;
12307 			src_start += VME_OFFSET(tmp_entry);
12308 			src_end = src_start + submap_len;
12309 			src_map = VME_SUBMAP(tmp_entry);
12310 			vm_map_lock(src_map);
12311 			/* keep an outstanding reference for all maps in */
12312 			/* the parents tree except the base map */
12313 			vm_map_reference(src_map);
12314 			vm_map_unlock(ptr->parent_map);
12315 			if (!vm_map_lookup_entry(
12316 				    src_map, src_start, &tmp_entry)) {
12317 				RETURN(KERN_INVALID_ADDRESS);
12318 			}
12319 			map_share = TRUE;
12320 			if (!tmp_entry->is_sub_map) {
12321 				vm_map_clip_start(src_map, tmp_entry, src_start);
12322 			}
12323 			src_entry = tmp_entry;
12324 		}
12325 		/* we are now in the lowest level submap... */
12326 
12327 		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
12328 		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
12329 			/* This is not, supported for now.In future */
12330 			/* we will need to detect the phys_contig   */
12331 			/* condition and then upgrade copy_slowly   */
12332 			/* to do physical copy from the device mem  */
12333 			/* based object. We can piggy-back off of   */
12334 			/* the was wired boolean to set-up the      */
12335 			/* proper handling */
12336 			RETURN(KERN_PROTECTION_FAILURE);
12337 		}
12338 		/*
12339 		 *	Create a new address map entry to hold the result.
12340 		 *	Fill in the fields from the appropriate source entries.
12341 		 *	We must unlock the source map to do this if we need
12342 		 *	to allocate a map entry.
12343 		 */
12344 		if (new_entry == VM_MAP_ENTRY_NULL) {
12345 			version.main_timestamp = src_map->timestamp;
12346 			vm_map_unlock(src_map);
12347 
12348 			new_entry = vm_map_copy_entry_create(copy);
12349 
12350 			vm_map_lock(src_map);
12351 			if ((version.main_timestamp + 1) != src_map->timestamp) {
12352 				if (!vm_map_lookup_entry(src_map, src_start,
12353 				    &tmp_entry)) {
12354 					RETURN(KERN_INVALID_ADDRESS);
12355 				}
12356 				if (!tmp_entry->is_sub_map) {
12357 					vm_map_clip_start(src_map, tmp_entry, src_start);
12358 				}
12359 				continue; /* restart w/ new tmp_entry */
12360 			}
12361 		}
12362 
12363 		/*
12364 		 *	Verify that the region can be read.
12365 		 */
12366 		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
12367 		    !use_maxprot) ||
12368 		    (src_entry->max_protection & VM_PROT_READ) == 0) {
12369 			RETURN(KERN_PROTECTION_FAILURE);
12370 		}
12371 
12372 		src_object = VME_OBJECT(src_entry);
12373 
12374 
12375 		/*
12376 		 *	Clip against the endpoints of the entire region.
12377 		 */
12378 
12379 		vm_map_clip_end(src_map, src_entry, src_end);
12380 
12381 		src_size = src_entry->vme_end - src_start;
12382 		src_offset = VME_OFFSET(src_entry);
12383 		was_wired = (src_entry->wired_count != 0);
12384 
12385 		vm_map_entry_copy(src_map, new_entry, src_entry);
12386 		if (new_entry->is_sub_map) {
12387 			/* clr address space specifics */
12388 			new_entry->use_pmap = FALSE;
12389 		} else {
12390 			/*
12391 			 * We're dealing with a copy-on-write operation,
12392 			 * so the resulting mapping should not inherit the
12393 			 * original mapping's accounting settings.
12394 			 * "iokit_acct" should have been cleared in
12395 			 * vm_map_entry_copy().
12396 			 * "use_pmap" should be reset to its default (TRUE)
12397 			 * so that the new mapping gets accounted for in
12398 			 * the task's memory footprint.
12399 			 */
12400 			assert(!new_entry->iokit_acct);
12401 			new_entry->use_pmap = TRUE;
12402 		}
12403 
12404 		/*
12405 		 *	Attempt non-blocking copy-on-write optimizations.
12406 		 */
12407 
12408 		/*
12409 		 * If we are destroying the source, and the object
12410 		 * is internal, we could move the object reference
12411 		 * from the source to the copy.  The copy is
12412 		 * copy-on-write only if the source is.
12413 		 * We make another reference to the object, because
12414 		 * destroying the source entry will deallocate it.
12415 		 *
12416 		 * This memory transfer has to be atomic, (to prevent
12417 		 * the VM object from being shared or copied while
12418 		 * it's being moved here), so we could only do this
12419 		 * if we won't have to unlock the VM map until the
12420 		 * original mapping has been fully removed.
12421 		 */
12422 
12423 RestartCopy:
12424 		if ((src_object == VM_OBJECT_NULL ||
12425 		    (!was_wired && !map_share && !tmp_entry->is_shared
12426 		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
12427 		    vm_object_copy_quickly(
12428 			    VME_OBJECT(new_entry),
12429 			    src_offset,
12430 			    src_size,
12431 			    &src_needs_copy,
12432 			    &new_entry_needs_copy)) {
12433 			new_entry->needs_copy = new_entry_needs_copy;
12434 
12435 			/*
12436 			 *	Handle copy-on-write obligations
12437 			 */
12438 
12439 			if (src_needs_copy && !tmp_entry->needs_copy) {
12440 				vm_prot_t prot;
12441 
12442 				prot = src_entry->protection & ~VM_PROT_WRITE;
12443 
12444 				if (override_nx(src_map, VME_ALIAS(src_entry))
12445 				    && prot) {
12446 					prot |= VM_PROT_EXECUTE;
12447 				}
12448 
12449 				vm_object_pmap_protect(
12450 					src_object,
12451 					src_offset,
12452 					src_size,
12453 					(src_entry->is_shared ?
12454 					PMAP_NULL
12455 					: src_map->pmap),
12456 					VM_MAP_PAGE_SIZE(src_map),
12457 					src_entry->vme_start,
12458 					prot);
12459 
12460 				assert(tmp_entry->wired_count == 0);
12461 				tmp_entry->needs_copy = TRUE;
12462 			}
12463 
12464 			/*
12465 			 *	The map has never been unlocked, so it's safe
12466 			 *	to move to the next entry rather than doing
12467 			 *	another lookup.
12468 			 */
12469 
12470 			goto CopySuccessful;
12471 		}
12472 
12473 		entry_was_shared = tmp_entry->is_shared;
12474 
12475 		/*
12476 		 *	Take an object reference, so that we may
12477 		 *	release the map lock(s).
12478 		 */
12479 
12480 		assert(src_object != VM_OBJECT_NULL);
12481 		vm_object_reference(src_object);
12482 
12483 		/*
12484 		 *	Record the timestamp for later verification.
12485 		 *	Unlock the map.
12486 		 */
12487 
12488 		version.main_timestamp = src_map->timestamp;
12489 		vm_map_unlock(src_map); /* Increments timestamp once! */
12490 		saved_src_entry = src_entry;
12491 		tmp_entry = VM_MAP_ENTRY_NULL;
12492 		src_entry = VM_MAP_ENTRY_NULL;
12493 
12494 		/*
12495 		 *	Perform the copy
12496 		 */
12497 
12498 		if (was_wired ||
12499 		    (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
12500 		    !(flags & VM_MAP_COPYIN_FORK)) ||
12501 		    (debug4k_no_cow_copyin &&
12502 		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12503 CopySlowly:
12504 			vm_object_lock(src_object);
12505 			result = vm_object_copy_slowly(
12506 				src_object,
12507 				src_offset,
12508 				src_size,
12509 				THREAD_UNINT,
12510 				&new_copy_object);
12511 			/* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12512 			saved_used_for_jit = new_entry->used_for_jit;
12513 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12514 			new_entry->used_for_jit = saved_used_for_jit;
12515 			VME_OFFSET_SET(new_entry,
12516 			    src_offset - vm_object_trunc_page(src_offset));
12517 			new_entry->needs_copy = FALSE;
12518 		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12519 		    (entry_was_shared || map_share)) {
12520 			vm_object_t new_object;
12521 
12522 			vm_object_lock_shared(src_object);
12523 			new_object = vm_object_copy_delayed(
12524 				src_object,
12525 				src_offset,
12526 				src_size,
12527 				TRUE);
12528 			if (new_object == VM_OBJECT_NULL) {
12529 				goto CopySlowly;
12530 			}
12531 
12532 			VME_OBJECT_SET(new_entry, new_object, false, 0);
12533 			assert(new_entry->wired_count == 0);
12534 			new_entry->needs_copy = TRUE;
12535 			assert(!new_entry->iokit_acct);
12536 			assert(new_object->purgable == VM_PURGABLE_DENY);
12537 			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12538 			result = KERN_SUCCESS;
12539 		} else {
12540 			vm_object_offset_t new_offset;
12541 			new_offset = VME_OFFSET(new_entry);
12542 			result = vm_object_copy_strategically(src_object,
12543 			    src_offset,
12544 			    src_size,
12545 			    (flags & VM_MAP_COPYIN_FORK),
12546 			    &new_copy_object,
12547 			    &new_offset,
12548 			    &new_entry_needs_copy);
12549 			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12550 			saved_used_for_jit = new_entry->used_for_jit;
12551 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12552 			new_entry->used_for_jit = saved_used_for_jit;
12553 			if (new_offset != VME_OFFSET(new_entry)) {
12554 				VME_OFFSET_SET(new_entry, new_offset);
12555 			}
12556 
12557 			new_entry->needs_copy = new_entry_needs_copy;
12558 		}
12559 
12560 		if (result == KERN_SUCCESS &&
12561 		    ((preserve_purgeable &&
12562 		    src_object->purgable != VM_PURGABLE_DENY) ||
12563 		    new_entry->used_for_jit)) {
12564 			/*
12565 			 * Purgeable objects should be COPY_NONE, true share;
12566 			 * this should be propogated to the copy.
12567 			 *
12568 			 * Also force mappings the pmap specially protects to
12569 			 * be COPY_NONE; trying to COW these mappings would
12570 			 * change the effective protections, which could have
12571 			 * side effects if the pmap layer relies on the
12572 			 * specified protections.
12573 			 */
12574 
12575 			vm_object_t     new_object;
12576 
12577 			new_object = VME_OBJECT(new_entry);
12578 			assert(new_object != src_object);
12579 			vm_object_lock(new_object);
12580 			assert(os_ref_get_count_raw(&new_object->ref_count) == 1);
12581 			assert(new_object->shadow == VM_OBJECT_NULL);
12582 			assert(new_object->vo_copy == VM_OBJECT_NULL);
12583 			assert(new_object->vo_owner == NULL);
12584 
12585 			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12586 
12587 			if (preserve_purgeable &&
12588 			    src_object->purgable != VM_PURGABLE_DENY) {
12589 				VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE);
12590 
12591 				/* start as non-volatile with no owner... */
12592 				VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE);
12593 				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12594 				/* ... and move to src_object's purgeable state */
12595 				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12596 					int state;
12597 					state = src_object->purgable;
12598 					vm_object_purgable_control(
12599 						new_object,
12600 						VM_PURGABLE_SET_STATE_FROM_KERNEL,
12601 						&state);
12602 				}
12603 				/* no pmap accounting for purgeable objects */
12604 				new_entry->use_pmap = FALSE;
12605 			}
12606 
12607 			vm_object_unlock(new_object);
12608 			new_object = VM_OBJECT_NULL;
12609 		}
12610 
12611 		/*
12612 		 *	Throw away the extra reference
12613 		 */
12614 
12615 		vm_object_deallocate(src_object);
12616 
12617 		if (result != KERN_SUCCESS &&
12618 		    result != KERN_MEMORY_RESTART_COPY) {
12619 			vm_map_lock(src_map);
12620 			RETURN(result);
12621 		}
12622 
12623 		/*
12624 		 *	Verify that the map has not substantially
12625 		 *	changed while the copy was being made.
12626 		 */
12627 
12628 		vm_map_lock(src_map);
12629 
12630 		if ((version.main_timestamp + 1) == src_map->timestamp) {
12631 			/* src_map hasn't changed: src_entry is still valid */
12632 			src_entry = saved_src_entry;
12633 			goto VerificationSuccessful;
12634 		}
12635 
12636 		/*
12637 		 *	Simple version comparison failed.
12638 		 *
12639 		 *	Retry the lookup and verify that the
12640 		 *	same object/offset are still present.
12641 		 *
12642 		 *	[Note: a memory manager that colludes with
12643 		 *	the calling task can detect that we have
12644 		 *	cheated.  While the map was unlocked, the
12645 		 *	mapping could have been changed and restored.]
12646 		 */
12647 
12648 		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12649 			if (result != KERN_MEMORY_RESTART_COPY) {
12650 				vm_object_deallocate(VME_OBJECT(new_entry));
12651 				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12652 				/* reset accounting state */
12653 				new_entry->iokit_acct = FALSE;
12654 				new_entry->use_pmap = TRUE;
12655 			}
12656 			RETURN(KERN_INVALID_ADDRESS);
12657 		}
12658 
12659 		src_entry = tmp_entry;
12660 		vm_map_clip_start(src_map, src_entry, src_start);
12661 
12662 		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12663 		    !use_maxprot) ||
12664 		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12665 			goto VerificationFailed;
12666 		}
12667 
12668 		if (src_entry->vme_end < new_entry->vme_end) {
12669 			/*
12670 			 * This entry might have been shortened
12671 			 * (vm_map_clip_end) or been replaced with
12672 			 * an entry that ends closer to "src_start"
12673 			 * than before.
12674 			 * Adjust "new_entry" accordingly; copying
12675 			 * less memory would be correct but we also
12676 			 * redo the copy (see below) if the new entry
12677 			 * no longer points at the same object/offset.
12678 			 */
12679 			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12680 			    VM_MAP_COPY_PAGE_MASK(copy)));
12681 			new_entry->vme_end = src_entry->vme_end;
12682 			src_size = new_entry->vme_end - src_start;
12683 		} else if (src_entry->vme_end > new_entry->vme_end) {
12684 			/*
12685 			 * This entry might have been extended
12686 			 * (vm_map_entry_simplify() or coalesce)
12687 			 * or been replaced with an entry that ends farther
12688 			 * from "src_start" than before.
12689 			 *
12690 			 * We've called vm_object_copy_*() only on
12691 			 * the previous <start:end> range, so we can't
12692 			 * just extend new_entry.  We have to re-do
12693 			 * the copy based on the new entry as if it was
12694 			 * pointing at a different object/offset (see
12695 			 * "Verification failed" below).
12696 			 */
12697 		}
12698 
12699 		if ((VME_OBJECT(src_entry) != src_object) ||
12700 		    (VME_OFFSET(src_entry) != src_offset) ||
12701 		    (src_entry->vme_end > new_entry->vme_end)) {
12702 			/*
12703 			 *	Verification failed.
12704 			 *
12705 			 *	Start over with this top-level entry.
12706 			 */
12707 
12708 VerificationFailed:     ;
12709 
12710 			vm_object_deallocate(VME_OBJECT(new_entry));
12711 			tmp_entry = src_entry;
12712 			continue;
12713 		}
12714 
12715 		/*
12716 		 *	Verification succeeded.
12717 		 */
12718 
12719 VerificationSuccessful:;
12720 
12721 		if (result == KERN_MEMORY_RESTART_COPY) {
12722 			goto RestartCopy;
12723 		}
12724 
12725 		/*
12726 		 *	Copy succeeded.
12727 		 */
12728 
12729 CopySuccessful: ;
12730 
12731 		/*
12732 		 *	Link in the new copy entry.
12733 		 */
12734 
12735 		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12736 		    new_entry);
12737 
12738 		/*
12739 		 *	Determine whether the entire region
12740 		 *	has been copied.
12741 		 */
12742 		src_base = src_start;
12743 		src_start = new_entry->vme_end;
12744 		new_entry = VM_MAP_ENTRY_NULL;
12745 		while ((src_start >= src_end) && (src_end != 0)) {
12746 			submap_map_t    *ptr;
12747 
12748 			if (src_map == base_map) {
12749 				/* back to the top */
12750 				break;
12751 			}
12752 
12753 			ptr = parent_maps;
12754 			assert(ptr != NULL);
12755 			parent_maps = parent_maps->next;
12756 
12757 			/* fix up the damage we did in that submap */
12758 			vm_map_simplify_range(src_map,
12759 			    src_base,
12760 			    src_end);
12761 
12762 			vm_map_unlock(src_map);
12763 			vm_map_deallocate(src_map);
12764 			vm_map_lock(ptr->parent_map);
12765 			src_map = ptr->parent_map;
12766 			src_base = ptr->base_start;
12767 			src_start = ptr->base_start + ptr->base_len;
12768 			src_end = ptr->base_end;
12769 			if (!vm_map_lookup_entry(src_map,
12770 			    src_start,
12771 			    &tmp_entry) &&
12772 			    (src_end > src_start)) {
12773 				RETURN(KERN_INVALID_ADDRESS);
12774 			}
12775 			kfree_type(submap_map_t, ptr);
12776 			if (parent_maps == NULL) {
12777 				map_share = FALSE;
12778 			}
12779 			src_entry = tmp_entry->vme_prev;
12780 		}
12781 
12782 		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12783 		    (src_start >= src_addr_unaligned + len) &&
12784 		    (src_addr_unaligned + len != 0)) {
12785 			/*
12786 			 * Stop copying now, even though we haven't reached
12787 			 * "src_end".  We'll adjust the end of the last copy
12788 			 * entry at the end, if needed.
12789 			 *
12790 			 * If src_map's aligment is different from the
12791 			 * system's page-alignment, there could be
12792 			 * extra non-map-aligned map entries between
12793 			 * the original (non-rounded) "src_addr_unaligned + len"
12794 			 * and the rounded "src_end".
12795 			 * We do not want to copy those map entries since
12796 			 * they're not part of the copied range.
12797 			 */
12798 			break;
12799 		}
12800 
12801 		if ((src_start >= src_end) && (src_end != 0)) {
12802 			break;
12803 		}
12804 
12805 		/*
12806 		 *	Verify that there are no gaps in the region
12807 		 */
12808 
12809 		tmp_entry = src_entry->vme_next;
12810 		if ((tmp_entry->vme_start != src_start) ||
12811 		    (tmp_entry == vm_map_to_entry(src_map))) {
12812 			RETURN(KERN_INVALID_ADDRESS);
12813 		}
12814 	}
12815 
12816 	/*
12817 	 * If the source should be destroyed, do it now, since the
12818 	 * copy was successful.
12819 	 */
12820 	if (src_destroy) {
12821 		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12822 
12823 		if (src_map == kernel_map) {
12824 			remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12825 		}
12826 		(void)vm_map_remove_and_unlock(src_map,
12827 		    vm_map_trunc_page(src_addr_unaligned, VM_MAP_PAGE_MASK(src_map)),
12828 		    src_end,
12829 		    remove_flags,
12830 		    KMEM_GUARD_NONE);
12831 	} else {
12832 		/* fix up the damage we did in the base map */
12833 		vm_map_simplify_range(
12834 			src_map,
12835 			vm_map_trunc_page(src_addr_unaligned,
12836 			VM_MAP_PAGE_MASK(src_map)),
12837 			vm_map_round_page(src_end,
12838 			VM_MAP_PAGE_MASK(src_map)));
12839 		vm_map_unlock(src_map);
12840 	}
12841 
12842 	tmp_entry = VM_MAP_ENTRY_NULL;
12843 
12844 	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12845 	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12846 		vm_map_offset_t original_start, original_offset, original_end;
12847 
12848 		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12849 
12850 		/* adjust alignment of first copy_entry's "vme_start" */
12851 		tmp_entry = vm_map_copy_first_entry(copy);
12852 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12853 			vm_map_offset_t adjustment;
12854 
12855 			original_start = tmp_entry->vme_start;
12856 			original_offset = VME_OFFSET(tmp_entry);
12857 
12858 			/* map-align the start of the first copy entry... */
12859 			adjustment = (tmp_entry->vme_start -
12860 			    vm_map_trunc_page(
12861 				    tmp_entry->vme_start,
12862 				    VM_MAP_PAGE_MASK(src_map)));
12863 			tmp_entry->vme_start -= adjustment;
12864 			VME_OFFSET_SET(tmp_entry,
12865 			    VME_OFFSET(tmp_entry) - adjustment);
12866 			copy_addr -= adjustment;
12867 			assert(tmp_entry->vme_start < tmp_entry->vme_end);
12868 			/* ... adjust for mis-aligned start of copy range */
12869 			adjustment =
12870 			    (vm_map_trunc_page(copy->offset,
12871 			    PAGE_MASK) -
12872 			    vm_map_trunc_page(copy->offset,
12873 			    VM_MAP_PAGE_MASK(src_map)));
12874 			if (adjustment) {
12875 				assert(page_aligned(adjustment));
12876 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12877 				tmp_entry->vme_start += adjustment;
12878 				VME_OFFSET_SET(tmp_entry,
12879 				    (VME_OFFSET(tmp_entry) +
12880 				    adjustment));
12881 				copy_addr += adjustment;
12882 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12883 			}
12884 
12885 			/*
12886 			 * Assert that the adjustments haven't exposed
12887 			 * more than was originally copied...
12888 			 */
12889 			assert(tmp_entry->vme_start >= original_start);
12890 			assert(VME_OFFSET(tmp_entry) >= original_offset);
12891 			/*
12892 			 * ... and that it did not adjust outside of a
12893 			 * a single 16K page.
12894 			 */
12895 			assert(vm_map_trunc_page(tmp_entry->vme_start,
12896 			    VM_MAP_PAGE_MASK(src_map)) ==
12897 			    vm_map_trunc_page(original_start,
12898 			    VM_MAP_PAGE_MASK(src_map)));
12899 		}
12900 
12901 		/* adjust alignment of last copy_entry's "vme_end" */
12902 		tmp_entry = vm_map_copy_last_entry(copy);
12903 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12904 			vm_map_offset_t adjustment;
12905 
12906 			original_end = tmp_entry->vme_end;
12907 
12908 			/* map-align the end of the last copy entry... */
12909 			tmp_entry->vme_end =
12910 			    vm_map_round_page(tmp_entry->vme_end,
12911 			    VM_MAP_PAGE_MASK(src_map));
12912 			/* ... adjust for mis-aligned end of copy range */
12913 			adjustment =
12914 			    (vm_map_round_page((copy->offset +
12915 			    copy->size),
12916 			    VM_MAP_PAGE_MASK(src_map)) -
12917 			    vm_map_round_page((copy->offset +
12918 			    copy->size),
12919 			    PAGE_MASK));
12920 			if (adjustment) {
12921 				assert(page_aligned(adjustment));
12922 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12923 				tmp_entry->vme_end -= adjustment;
12924 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12925 			}
12926 
12927 			/*
12928 			 * Assert that the adjustments haven't exposed
12929 			 * more than was originally copied...
12930 			 */
12931 			assert(tmp_entry->vme_end <= original_end);
12932 			/*
12933 			 * ... and that it did not adjust outside of a
12934 			 * a single 16K page.
12935 			 */
12936 			assert(vm_map_round_page(tmp_entry->vme_end,
12937 			    VM_MAP_PAGE_MASK(src_map)) ==
12938 			    vm_map_round_page(original_end,
12939 			    VM_MAP_PAGE_MASK(src_map)));
12940 		}
12941 	}
12942 
12943 	/* Fix-up start and end points in copy.  This is necessary */
12944 	/* when the various entries in the copy object were picked */
12945 	/* up from different sub-maps */
12946 
12947 	tmp_entry = vm_map_copy_first_entry(copy);
12948 	copy_size = 0; /* compute actual size */
12949 	while (tmp_entry != vm_map_copy_to_entry(copy)) {
12950 		assert(VM_MAP_PAGE_ALIGNED(
12951 			    copy_addr + (tmp_entry->vme_end -
12952 			    tmp_entry->vme_start),
12953 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12954 		assert(VM_MAP_PAGE_ALIGNED(
12955 			    copy_addr,
12956 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12957 
12958 		/*
12959 		 * The copy_entries will be injected directly into the
12960 		 * destination map and might not be "map aligned" there...
12961 		 */
12962 		tmp_entry->map_aligned = FALSE;
12963 
12964 		tmp_entry->vme_end = copy_addr +
12965 		    (tmp_entry->vme_end - tmp_entry->vme_start);
12966 		tmp_entry->vme_start = copy_addr;
12967 		assert(tmp_entry->vme_start < tmp_entry->vme_end);
12968 		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12969 		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12970 		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12971 	}
12972 
12973 	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12974 	    copy_size < copy->size) {
12975 		/*
12976 		 * The actual size of the VM map copy is smaller than what
12977 		 * was requested by the caller.  This must be because some
12978 		 * PAGE_SIZE-sized pages are missing at the end of the last
12979 		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12980 		 * The caller might not have been aware of those missing
12981 		 * pages and might not want to be aware of it, which is
12982 		 * fine as long as they don't try to access (and crash on)
12983 		 * those missing pages.
12984 		 * Let's adjust the size of the "copy", to avoid failing
12985 		 * in vm_map_copyout() or vm_map_copy_overwrite().
12986 		 */
12987 		assert(vm_map_round_page(copy_size,
12988 		    VM_MAP_PAGE_MASK(src_map)) ==
12989 		    vm_map_round_page(copy->size,
12990 		    VM_MAP_PAGE_MASK(src_map)));
12991 		copy->size = copy_size;
12992 	}
12993 
12994 	*copy_result = copy;
12995 	return KERN_SUCCESS;
12996 
12997 #undef  RETURN
12998 }
12999 
13000 kern_return_t
vm_map_copy_extract(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t do_copy,vm_map_copy_t * copy_result,vm_prot_t * cur_prot,vm_prot_t * max_prot,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)13001 vm_map_copy_extract(
13002 	vm_map_t                src_map,
13003 	vm_map_address_t        src_addr,
13004 	vm_map_size_t           len,
13005 	boolean_t               do_copy,
13006 	vm_map_copy_t           *copy_result,   /* OUT */
13007 	vm_prot_t               *cur_prot,      /* IN/OUT */
13008 	vm_prot_t               *max_prot,      /* IN/OUT */
13009 	vm_inherit_t            inheritance,
13010 	vm_map_kernel_flags_t   vmk_flags)
13011 {
13012 	vm_map_copy_t   copy;
13013 	kern_return_t   kr;
13014 	vm_prot_t required_cur_prot, required_max_prot;
13015 
13016 	/*
13017 	 *	Check for copies of zero bytes.
13018 	 */
13019 
13020 	if (len == 0) {
13021 		*copy_result = VM_MAP_COPY_NULL;
13022 		return KERN_SUCCESS;
13023 	}
13024 
13025 	/*
13026 	 *	Check that the end address doesn't overflow
13027 	 */
13028 	if (src_addr + len < src_addr) {
13029 		return KERN_INVALID_ADDRESS;
13030 	}
13031 	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
13032 		return KERN_INVALID_ADDRESS;
13033 	}
13034 
13035 	if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
13036 		DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
13037 	}
13038 
13039 	required_cur_prot = *cur_prot;
13040 	required_max_prot = *max_prot;
13041 
13042 	/*
13043 	 *	Allocate a header element for the list.
13044 	 *
13045 	 *	Use the start and end in the header to
13046 	 *	remember the endpoints prior to rounding.
13047 	 */
13048 
13049 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
13050 	copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
13051 	copy->offset = 0;
13052 	copy->size = len;
13053 
13054 	kr = vm_map_remap_extract(src_map,
13055 	    src_addr,
13056 	    len,
13057 	    do_copy,             /* copy */
13058 	    copy,
13059 	    cur_prot,            /* IN/OUT */
13060 	    max_prot,            /* IN/OUT */
13061 	    inheritance,
13062 	    vmk_flags);
13063 	if (kr != KERN_SUCCESS) {
13064 		vm_map_copy_discard(copy);
13065 		if ((kr == KERN_INVALID_ADDRESS ||
13066 		    kr == KERN_INVALID_ARGUMENT) &&
13067 		    src_map->terminated) {
13068 			/* tell the caller that this address space is gone */
13069 			kr = KERN_TERMINATED;
13070 		}
13071 		return kr;
13072 	}
13073 	if (required_cur_prot != VM_PROT_NONE) {
13074 		assert((*cur_prot & required_cur_prot) == required_cur_prot);
13075 		assert((*max_prot & required_max_prot) == required_max_prot);
13076 	}
13077 
13078 	*copy_result = copy;
13079 	return KERN_SUCCESS;
13080 }
13081 
13082 static void
vm_map_fork_share(vm_map_t old_map,vm_map_entry_t old_entry,vm_map_t new_map)13083 vm_map_fork_share(
13084 	vm_map_t        old_map,
13085 	vm_map_entry_t  old_entry,
13086 	vm_map_t        new_map)
13087 {
13088 	vm_object_t     object;
13089 	vm_map_entry_t  new_entry;
13090 
13091 	/*
13092 	 *	New sharing code.  New map entry
13093 	 *	references original object.  Internal
13094 	 *	objects use asynchronous copy algorithm for
13095 	 *	future copies.  First make sure we have
13096 	 *	the right object.  If we need a shadow,
13097 	 *	or someone else already has one, then
13098 	 *	make a new shadow and share it.
13099 	 */
13100 
13101 	if (!old_entry->is_sub_map) {
13102 		object = VME_OBJECT(old_entry);
13103 	}
13104 
13105 	if (old_entry->is_sub_map) {
13106 		assert(old_entry->wired_count == 0);
13107 #ifndef NO_NESTED_PMAP
13108 #if !PMAP_FORK_NEST
13109 		if (old_entry->use_pmap) {
13110 			kern_return_t   result;
13111 
13112 			result = pmap_nest(new_map->pmap,
13113 			    (VME_SUBMAP(old_entry))->pmap,
13114 			    (addr64_t)old_entry->vme_start,
13115 			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
13116 			if (result) {
13117 				panic("vm_map_fork_share: pmap_nest failed!");
13118 			}
13119 		}
13120 #endif /* !PMAP_FORK_NEST */
13121 #endif  /* NO_NESTED_PMAP */
13122 	} else if (object == VM_OBJECT_NULL) {
13123 		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
13124 		    old_entry->vme_start), old_map->serial_id);
13125 		VME_OFFSET_SET(old_entry, 0);
13126 		VME_OBJECT_SET(old_entry, object, false, 0);
13127 		old_entry->use_pmap = TRUE;
13128 //		assert(!old_entry->needs_copy);
13129 	} else if (object->copy_strategy !=
13130 	    MEMORY_OBJECT_COPY_SYMMETRIC) {
13131 		/*
13132 		 *	We are already using an asymmetric
13133 		 *	copy, and therefore we already have
13134 		 *	the right object.
13135 		 */
13136 
13137 		assert(!old_entry->needs_copy);
13138 	} else if (old_entry->needs_copy ||       /* case 1 */
13139 	    object->shadowed ||                 /* case 2 */
13140 	    (!object->true_share &&             /* case 3 */
13141 	    !old_entry->is_shared &&
13142 	    (object->vo_size >
13143 	    (vm_map_size_t)(old_entry->vme_end -
13144 	    old_entry->vme_start)))) {
13145 		bool is_writable;
13146 
13147 		/*
13148 		 *	We need to create a shadow.
13149 		 *	There are three cases here.
13150 		 *	In the first case, we need to
13151 		 *	complete a deferred symmetrical
13152 		 *	copy that we participated in.
13153 		 *	In the second and third cases,
13154 		 *	we need to create the shadow so
13155 		 *	that changes that we make to the
13156 		 *	object do not interfere with
13157 		 *	any symmetrical copies which
13158 		 *	have occured (case 2) or which
13159 		 *	might occur (case 3).
13160 		 *
13161 		 *	The first case is when we had
13162 		 *	deferred shadow object creation
13163 		 *	via the entry->needs_copy mechanism.
13164 		 *	This mechanism only works when
13165 		 *	only one entry points to the source
13166 		 *	object, and we are about to create
13167 		 *	a second entry pointing to the
13168 		 *	same object. The problem is that
13169 		 *	there is no way of mapping from
13170 		 *	an object to the entries pointing
13171 		 *	to it. (Deferred shadow creation
13172 		 *	works with one entry because occurs
13173 		 *	at fault time, and we walk from the
13174 		 *	entry to the object when handling
13175 		 *	the fault.)
13176 		 *
13177 		 *	The second case is when the object
13178 		 *	to be shared has already been copied
13179 		 *	with a symmetric copy, but we point
13180 		 *	directly to the object without
13181 		 *	needs_copy set in our entry. (This
13182 		 *	can happen because different ranges
13183 		 *	of an object can be pointed to by
13184 		 *	different entries. In particular,
13185 		 *	a single entry pointing to an object
13186 		 *	can be split by a call to vm_inherit,
13187 		 *	which, combined with task_create, can
13188 		 *	result in the different entries
13189 		 *	having different needs_copy values.)
13190 		 *	The shadowed flag in the object allows
13191 		 *	us to detect this case. The problem
13192 		 *	with this case is that if this object
13193 		 *	has or will have shadows, then we
13194 		 *	must not perform an asymmetric copy
13195 		 *	of this object, since such a copy
13196 		 *	allows the object to be changed, which
13197 		 *	will break the previous symmetrical
13198 		 *	copies (which rely upon the object
13199 		 *	not changing). In a sense, the shadowed
13200 		 *	flag says "don't change this object".
13201 		 *	We fix this by creating a shadow
13202 		 *	object for this object, and sharing
13203 		 *	that. This works because we are free
13204 		 *	to change the shadow object (and thus
13205 		 *	to use an asymmetric copy strategy);
13206 		 *	this is also semantically correct,
13207 		 *	since this object is temporary, and
13208 		 *	therefore a copy of the object is
13209 		 *	as good as the object itself. (This
13210 		 *	is not true for permanent objects,
13211 		 *	since the pager needs to see changes,
13212 		 *	which won't happen if the changes
13213 		 *	are made to a copy.)
13214 		 *
13215 		 *	The third case is when the object
13216 		 *	to be shared has parts sticking
13217 		 *	outside of the entry we're working
13218 		 *	with, and thus may in the future
13219 		 *	be subject to a symmetrical copy.
13220 		 *	(This is a preemptive version of
13221 		 *	case 2.)
13222 		 */
13223 		VME_OBJECT_SHADOW(old_entry,
13224 		    (vm_map_size_t) (old_entry->vme_end -
13225 		    old_entry->vme_start),
13226 		    vm_map_always_shadow(old_map));
13227 
13228 		/*
13229 		 *	If we're making a shadow for other than
13230 		 *	copy on write reasons, then we have
13231 		 *	to remove write permission.
13232 		 */
13233 
13234 		is_writable = false;
13235 		if (old_entry->protection & VM_PROT_WRITE) {
13236 			is_writable = true;
13237 #if __arm64e__
13238 		} else if (old_entry->used_for_tpro) {
13239 			is_writable = true;
13240 #endif /* __arm64e__ */
13241 		}
13242 		if (!old_entry->needs_copy && is_writable) {
13243 			vm_prot_t prot;
13244 
13245 			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13246 				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13247 				    __FUNCTION__, old_map, old_map->pmap,
13248 				    old_entry,
13249 				    (uint64_t)old_entry->vme_start,
13250 				    (uint64_t)old_entry->vme_end,
13251 				    old_entry->protection);
13252 			}
13253 
13254 			prot = old_entry->protection & ~VM_PROT_WRITE;
13255 
13256 			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13257 				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13258 				    __FUNCTION__, old_map, old_map->pmap,
13259 				    old_entry,
13260 				    (uint64_t)old_entry->vme_start,
13261 				    (uint64_t)old_entry->vme_end,
13262 				    prot);
13263 			}
13264 
13265 			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
13266 				prot |= VM_PROT_EXECUTE;
13267 			}
13268 
13269 
13270 			if (old_map->mapped_in_other_pmaps) {
13271 				vm_object_pmap_protect(
13272 					VME_OBJECT(old_entry),
13273 					VME_OFFSET(old_entry),
13274 					(old_entry->vme_end -
13275 					old_entry->vme_start),
13276 					PMAP_NULL,
13277 					PAGE_SIZE,
13278 					old_entry->vme_start,
13279 					prot);
13280 			} else {
13281 				pmap_protect(old_map->pmap,
13282 				    old_entry->vme_start,
13283 				    old_entry->vme_end,
13284 				    prot);
13285 			}
13286 		}
13287 
13288 		old_entry->needs_copy = FALSE;
13289 		object = VME_OBJECT(old_entry);
13290 	}
13291 
13292 
13293 	/*
13294 	 *	If object was using a symmetric copy strategy,
13295 	 *	change its copy strategy to the default
13296 	 *	asymmetric copy strategy, which is copy_delay
13297 	 *	in the non-norma case and copy_call in the
13298 	 *	norma case. Bump the reference count for the
13299 	 *	new entry.
13300 	 */
13301 
13302 	if (old_entry->is_sub_map) {
13303 		vm_map_reference(VME_SUBMAP(old_entry));
13304 	} else {
13305 		vm_object_lock(object);
13306 		vm_object_reference_locked(object);
13307 		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
13308 			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
13309 		}
13310 		vm_object_unlock(object);
13311 	}
13312 
13313 	/*
13314 	 *	Clone the entry, using object ref from above.
13315 	 *	Mark both entries as shared.
13316 	 */
13317 
13318 	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
13319 	vm_map_entry_copy(old_map, new_entry, old_entry);
13320 	old_entry->is_shared = TRUE;
13321 	new_entry->is_shared = TRUE;
13322 
13323 	/*
13324 	 * We're dealing with a shared mapping, so the resulting mapping
13325 	 * should inherit some of the original mapping's accounting settings.
13326 	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
13327 	 * "use_pmap" should stay the same as before (if it hasn't been reset
13328 	 * to TRUE when we cleared "iokit_acct").
13329 	 */
13330 	assert(!new_entry->iokit_acct);
13331 
13332 	/*
13333 	 *	If old entry's inheritence is VM_INHERIT_NONE,
13334 	 *	the new entry is for corpse fork, remove the
13335 	 *	write permission from the new entry.
13336 	 */
13337 	if (old_entry->inheritance == VM_INHERIT_NONE) {
13338 		new_entry->protection &= ~VM_PROT_WRITE;
13339 		new_entry->max_protection &= ~VM_PROT_WRITE;
13340 	}
13341 
13342 	/*
13343 	 *	Insert the entry into the new map -- we
13344 	 *	know we're inserting at the end of the new
13345 	 *	map.
13346 	 */
13347 
13348 	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
13349 	    VM_MAP_KERNEL_FLAGS_NONE);
13350 
13351 	/*
13352 	 *	Update the physical map
13353 	 */
13354 
13355 	if (old_entry->is_sub_map) {
13356 		/* Bill Angell pmap support goes here */
13357 	} else {
13358 		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
13359 		    old_entry->vme_end - old_entry->vme_start,
13360 		    old_entry->vme_start);
13361 	}
13362 }
13363 
13364 static boolean_t
vm_map_fork_copy(vm_map_t old_map,vm_map_entry_t * old_entry_p,vm_map_t new_map,int vm_map_copyin_flags)13365 vm_map_fork_copy(
13366 	vm_map_t        old_map,
13367 	vm_map_entry_t  *old_entry_p,
13368 	vm_map_t        new_map,
13369 	int             vm_map_copyin_flags)
13370 {
13371 	vm_map_entry_t old_entry = *old_entry_p;
13372 	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
13373 	vm_map_offset_t start = old_entry->vme_start;
13374 	vm_map_copy_t copy;
13375 	vm_map_entry_t last = vm_map_last_entry(new_map);
13376 
13377 	vm_map_unlock(old_map);
13378 	/*
13379 	 *	Use maxprot version of copyin because we
13380 	 *	care about whether this memory can ever
13381 	 *	be accessed, not just whether it's accessible
13382 	 *	right now.
13383 	 */
13384 	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
13385 	if (vm_map_copyin_internal(old_map, start, entry_size,
13386 	    vm_map_copyin_flags, &copy)
13387 	    != KERN_SUCCESS) {
13388 		/*
13389 		 *	The map might have changed while it
13390 		 *	was unlocked, check it again.  Skip
13391 		 *	any blank space or permanently
13392 		 *	unreadable region.
13393 		 */
13394 		vm_map_lock(old_map);
13395 		if (!vm_map_lookup_entry(old_map, start, &last) ||
13396 		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
13397 			last = last->vme_next;
13398 		}
13399 		*old_entry_p = last;
13400 
13401 		/*
13402 		 * XXX	For some error returns, want to
13403 		 * XXX	skip to the next element.  Note
13404 		 *	that INVALID_ADDRESS and
13405 		 *	PROTECTION_FAILURE are handled above.
13406 		 */
13407 
13408 		return FALSE;
13409 	}
13410 
13411 	/*
13412 	 * Assert that the vm_map_copy is coming from the right
13413 	 * zone and hasn't been forged
13414 	 */
13415 	vm_map_copy_require(copy);
13416 
13417 	/*
13418 	 *	Insert the copy into the new map
13419 	 */
13420 	vm_map_copy_insert(new_map, last, copy);
13421 
13422 	/*
13423 	 *	Pick up the traversal at the end of
13424 	 *	the copied region.
13425 	 */
13426 
13427 	vm_map_lock(old_map);
13428 	start += entry_size;
13429 	if (!vm_map_lookup_entry(old_map, start, &last)) {
13430 		last = last->vme_next;
13431 	} else {
13432 		if (last->vme_start == start) {
13433 			/*
13434 			 * No need to clip here and we don't
13435 			 * want to cause any unnecessary
13436 			 * unnesting...
13437 			 */
13438 		} else {
13439 			vm_map_clip_start(old_map, last, start);
13440 		}
13441 	}
13442 	*old_entry_p = last;
13443 
13444 	return TRUE;
13445 }
13446 
13447 #if PMAP_FORK_NEST
13448 #define PMAP_FORK_NEST_DEBUG 0
13449 static inline void
vm_map_fork_unnest(pmap_t new_pmap,vm_map_offset_t pre_nested_start,vm_map_offset_t pre_nested_end,vm_map_offset_t start,vm_map_offset_t end)13450 vm_map_fork_unnest(
13451 	pmap_t new_pmap,
13452 	vm_map_offset_t pre_nested_start,
13453 	vm_map_offset_t pre_nested_end,
13454 	vm_map_offset_t start,
13455 	vm_map_offset_t end)
13456 {
13457 	kern_return_t kr;
13458 	vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13459 
13460 	assertf(pre_nested_start <= pre_nested_end,
13461 	    "pre_nested start 0x%llx end 0x%llx",
13462 	    (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13463 	assertf(start <= end,
13464 	    "start 0x%llx end 0x%llx",
13465 	    (uint64_t) start, (uint64_t)end);
13466 
13467 	if (pre_nested_start == pre_nested_end) {
13468 		/* nothing was pre-nested: done */
13469 		return;
13470 	}
13471 	if (end <= pre_nested_start) {
13472 		/* fully before pre-nested range: done */
13473 		return;
13474 	}
13475 	if (start >= pre_nested_end) {
13476 		/* fully after pre-nested range: done */
13477 		return;
13478 	}
13479 	/* ignore parts of range outside of pre_nested range */
13480 	if (start < pre_nested_start) {
13481 		start = pre_nested_start;
13482 	}
13483 	if (end > pre_nested_end) {
13484 		end = pre_nested_end;
13485 	}
13486 	nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13487 	start_unnest = start & ~nesting_mask;
13488 	end_unnest = (end + nesting_mask) & ~nesting_mask;
13489 	kr = pmap_unnest(new_pmap,
13490 	    (addr64_t)start_unnest,
13491 	    (uint64_t)(end_unnest - start_unnest));
13492 #if PMAP_FORK_NEST_DEBUG
13493 	printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13494 #endif /* PMAP_FORK_NEST_DEBUG */
13495 	assertf(kr == KERN_SUCCESS,
13496 	    "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13497 	    (uint64_t)start, (uint64_t)end, new_pmap,
13498 	    (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13499 	    kr);
13500 }
13501 #endif /* PMAP_FORK_NEST */
13502 
13503 void
vm_map_inherit_limits(vm_map_t new_map,const struct _vm_map * old_map)13504 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13505 {
13506 	new_map->size_limit = old_map->size_limit;
13507 	new_map->data_limit = old_map->data_limit;
13508 	new_map->user_wire_limit = old_map->user_wire_limit;
13509 	new_map->reserved_regions = old_map->reserved_regions;
13510 }
13511 
13512 /*
13513  *	vm_map_fork:
13514  *
13515  *	Create and return a new map based on the old
13516  *	map, according to the inheritance values on the
13517  *	regions in that map and the options.
13518  *
13519  *	The source map must not be locked.
13520  */
13521 vm_map_t
vm_map_fork(ledger_t ledger,vm_map_t old_map,int options)13522 vm_map_fork(
13523 	ledger_t        ledger,
13524 	vm_map_t        old_map,
13525 	int             options)
13526 {
13527 	pmap_t          new_pmap;
13528 	vm_map_t        new_map;
13529 	vm_map_entry_t  old_entry;
13530 	vm_map_size_t   new_size = 0, entry_size;
13531 	vm_map_entry_t  new_entry;
13532 	boolean_t       src_needs_copy;
13533 	boolean_t       new_entry_needs_copy;
13534 	boolean_t       pmap_is64bit;
13535 	int             vm_map_copyin_flags;
13536 	vm_inherit_t    old_entry_inheritance;
13537 	int             map_create_options;
13538 	kern_return_t   footprint_collect_kr;
13539 
13540 	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13541 	    VM_MAP_FORK_PRESERVE_PURGEABLE |
13542 	    VM_MAP_FORK_CORPSE_FOOTPRINT |
13543 	    VM_MAP_FORK_SHARE_IF_OWNED)) {
13544 		/* unsupported option */
13545 		return VM_MAP_NULL;
13546 	}
13547 
13548 	pmap_is64bit =
13549 #if defined(__i386__) || defined(__x86_64__)
13550 	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13551 #elif defined(__arm64__)
13552 	    old_map->pmap->is_64bit;
13553 #else
13554 #error Unknown architecture.
13555 #endif
13556 
13557 	unsigned int pmap_flags = 0;
13558 	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13559 #if defined(HAS_APPLE_PAC)
13560 	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13561 #endif
13562 #if CONFIG_ROSETTA
13563 	pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13564 #endif
13565 #if PMAP_CREATE_FORCE_4K_PAGES
13566 	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13567 	    PAGE_SIZE != FOURK_PAGE_SIZE) {
13568 		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13569 	}
13570 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
13571 	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13572 	if (new_pmap == NULL) {
13573 		return VM_MAP_NULL;
13574 	}
13575 
13576 	vm_map_reference(old_map);
13577 	vm_map_lock(old_map);
13578 
13579 	/* Note that we're creating a map out of fork() */
13580 	map_create_options = VM_MAP_CREATE_VIA_FORK;
13581 	if (old_map->hdr.entries_pageable) {
13582 		map_create_options |= VM_MAP_CREATE_PAGEABLE;
13583 	}
13584 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13585 		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13586 		footprint_collect_kr = KERN_SUCCESS;
13587 	}
13588 	new_map = vm_map_create_options(new_pmap,
13589 	    old_map->min_offset,
13590 	    old_map->max_offset,
13591 	    map_create_options);
13592 
13593 	/* Inherit our parent's ID. */
13594 	vm_map_assign_serial(new_map, old_map->serial_id);
13595 
13596 	/* inherit cs_enforcement */
13597 	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13598 
13599 	vm_map_lock(new_map);
13600 	vm_commit_pagezero_status(new_map);
13601 	/* inherit the parent map's page size */
13602 	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13603 
13604 	/* inherit the parent rlimits */
13605 	vm_map_inherit_limits(new_map, old_map);
13606 
13607 #if CONFIG_MAP_RANGES
13608 	/* inherit the parent map's VM ranges */
13609 	vm_map_range_fork(new_map, old_map);
13610 #endif
13611 
13612 #if CODE_SIGNING_MONITOR
13613 	/* Prepare the monitor for the fork */
13614 	csm_fork_prepare(old_map->pmap, new_pmap);
13615 #endif
13616 
13617 #if PMAP_FORK_NEST
13618 	/*
13619 	 * Pre-nest the shared region's pmap.
13620 	 */
13621 	vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13622 	pmap_fork_nest(old_map->pmap, new_pmap,
13623 	    &pre_nested_start, &pre_nested_end);
13624 #if PMAP_FORK_NEST_DEBUG
13625 	printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13626 #endif /* PMAP_FORK_NEST_DEBUG */
13627 #endif /* PMAP_FORK_NEST */
13628 
13629 	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13630 		/*
13631 		 * Abort any corpse collection if the system is shutting down.
13632 		 */
13633 		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13634 		    get_system_inshutdown()) {
13635 #if PMAP_FORK_NEST
13636 			new_entry = vm_map_last_entry(new_map);
13637 			if (new_entry == vm_map_to_entry(new_map)) {
13638 				/* unnest all that was pre-nested */
13639 				vm_map_fork_unnest(new_pmap,
13640 				    pre_nested_start, pre_nested_end,
13641 				    vm_map_min(new_map), vm_map_max(new_map));
13642 			} else if (new_entry->vme_end < vm_map_max(new_map)) {
13643 				/* unnest hole at the end, if pre-nested */
13644 				vm_map_fork_unnest(new_pmap,
13645 				    pre_nested_start, pre_nested_end,
13646 				    new_entry->vme_end, vm_map_max(new_map));
13647 			}
13648 #endif /* PMAP_FORK_NEST */
13649 			vm_map_corpse_footprint_collect_done(new_map);
13650 			vm_map_unlock(new_map);
13651 			vm_map_unlock(old_map);
13652 			vm_map_deallocate(new_map);
13653 			vm_map_deallocate(old_map);
13654 			printf("Aborting corpse map due to system shutdown\n");
13655 			return VM_MAP_NULL;
13656 		}
13657 
13658 		entry_size = old_entry->vme_end - old_entry->vme_start;
13659 
13660 #if PMAP_FORK_NEST
13661 		/*
13662 		 * Undo any unnecessary pre-nesting.
13663 		 */
13664 		vm_map_offset_t prev_end;
13665 		if (old_entry == vm_map_first_entry(old_map)) {
13666 			prev_end = vm_map_min(old_map);
13667 		} else {
13668 			prev_end = old_entry->vme_prev->vme_end;
13669 		}
13670 		if (prev_end < old_entry->vme_start) {
13671 			/* unnest hole before this entry, if pre-nested */
13672 			vm_map_fork_unnest(new_pmap,
13673 			    pre_nested_start, pre_nested_end,
13674 			    prev_end, old_entry->vme_start);
13675 		}
13676 		if (old_entry->is_sub_map && old_entry->use_pmap) {
13677 			/* keep this entry nested in the child */
13678 #if PMAP_FORK_NEST_DEBUG
13679 			printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13680 #endif /* PMAP_FORK_NEST_DEBUG */
13681 		} else {
13682 			/* undo nesting for this entry, if pre-nested */
13683 			vm_map_fork_unnest(new_pmap,
13684 			    pre_nested_start, pre_nested_end,
13685 			    old_entry->vme_start, old_entry->vme_end);
13686 		}
13687 #endif /* PMAP_FORK_NEST */
13688 
13689 		old_entry_inheritance = old_entry->inheritance;
13690 
13691 		/*
13692 		 * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option
13693 		 * share VM_INHERIT_NONE entries that are not backed by a
13694 		 * device pager.
13695 		 */
13696 		if (old_entry_inheritance == VM_INHERIT_NONE &&
13697 		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13698 		    (old_entry->protection & VM_PROT_READ) &&
13699 		    !(!old_entry->is_sub_map &&
13700 		    VME_OBJECT(old_entry) != NULL &&
13701 		    VME_OBJECT(old_entry)->pager != NULL &&
13702 		    is_device_pager_ops(
13703 			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13704 			old_entry_inheritance = VM_INHERIT_SHARE;
13705 		}
13706 		if (old_entry_inheritance == VM_INHERIT_COPY &&
13707 		    (options & VM_MAP_FORK_SHARE_IF_OWNED) &&
13708 		    !old_entry->is_sub_map &&
13709 		    VME_OBJECT(old_entry) != VM_OBJECT_NULL) {
13710 			vm_object_t object;
13711 			task_t owner;
13712 			object = VME_OBJECT(old_entry);
13713 			owner = VM_OBJECT_OWNER(object);
13714 			if (owner != TASK_NULL &&
13715 			    owner->map == old_map) {
13716 				/*
13717 				 * This mapping points at a VM object owned
13718 				 * by the task being forked.
13719 				 * Some tools reporting memory accounting
13720 				 * info rely on the object ID, so share this
13721 				 * mapping instead of copying, to make the
13722 				 * corpse look exactly like the original
13723 				 * task in that respect.
13724 				 */
13725 				assert(object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC);
13726 				old_entry_inheritance = VM_INHERIT_SHARE;
13727 			}
13728 		}
13729 
13730 		if (old_entry_inheritance != VM_INHERIT_NONE &&
13731 		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13732 		    footprint_collect_kr == KERN_SUCCESS) {
13733 			/*
13734 			 * The corpse won't have old_map->pmap to query
13735 			 * footprint information, so collect that data now
13736 			 * and store it in new_map->vmmap_corpse_footprint
13737 			 * for later autopsy.
13738 			 */
13739 			footprint_collect_kr =
13740 			    vm_map_corpse_footprint_collect(old_map,
13741 			    old_entry,
13742 			    new_map);
13743 		}
13744 
13745 		switch (old_entry_inheritance) {
13746 		case VM_INHERIT_NONE:
13747 			break;
13748 
13749 		case VM_INHERIT_SHARE:
13750 			vm_map_fork_share(old_map, old_entry, new_map);
13751 			new_size += entry_size;
13752 			break;
13753 
13754 		case VM_INHERIT_COPY:
13755 
13756 			/*
13757 			 *	Inline the copy_quickly case;
13758 			 *	upon failure, fall back on call
13759 			 *	to vm_map_fork_copy.
13760 			 */
13761 
13762 			if (old_entry->is_sub_map) {
13763 				break;
13764 			}
13765 			if ((old_entry->wired_count != 0) ||
13766 			    ((VME_OBJECT(old_entry) != NULL) &&
13767 			    (VME_OBJECT(old_entry)->true_share))) {
13768 				goto slow_vm_map_fork_copy;
13769 			}
13770 
13771 			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13772 			vm_map_entry_copy(old_map, new_entry, old_entry);
13773 			if (old_entry->vme_permanent) {
13774 				/* inherit "permanent" on fork() */
13775 				new_entry->vme_permanent = TRUE;
13776 			}
13777 
13778 			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13779 				new_map->jit_entry_exists = TRUE;
13780 			}
13781 
13782 			if (new_entry->is_sub_map) {
13783 				/* clear address space specifics */
13784 				new_entry->use_pmap = FALSE;
13785 			} else {
13786 				/*
13787 				 * We're dealing with a copy-on-write operation,
13788 				 * so the resulting mapping should not inherit
13789 				 * the original mapping's accounting settings.
13790 				 * "iokit_acct" should have been cleared in
13791 				 * vm_map_entry_copy().
13792 				 * "use_pmap" should be reset to its default
13793 				 * (TRUE) so that the new mapping gets
13794 				 * accounted for in the task's memory footprint.
13795 				 */
13796 				assert(!new_entry->iokit_acct);
13797 				new_entry->use_pmap = TRUE;
13798 			}
13799 
13800 			if (!vm_object_copy_quickly(
13801 				    VME_OBJECT(new_entry),
13802 				    VME_OFFSET(old_entry),
13803 				    (old_entry->vme_end -
13804 				    old_entry->vme_start),
13805 				    &src_needs_copy,
13806 				    &new_entry_needs_copy)) {
13807 				vm_map_entry_dispose(new_entry);
13808 				goto slow_vm_map_fork_copy;
13809 			}
13810 
13811 			/*
13812 			 *	Handle copy-on-write obligations
13813 			 */
13814 
13815 			if (src_needs_copy && !old_entry->needs_copy) {
13816 				vm_prot_t prot;
13817 
13818 				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13819 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13820 					    __FUNCTION__,
13821 					    old_map, old_map->pmap, old_entry,
13822 					    (uint64_t)old_entry->vme_start,
13823 					    (uint64_t)old_entry->vme_end,
13824 					    old_entry->protection);
13825 				}
13826 
13827 				prot = old_entry->protection & ~VM_PROT_WRITE;
13828 
13829 				if (override_nx(old_map, VME_ALIAS(old_entry))
13830 				    && prot) {
13831 					prot |= VM_PROT_EXECUTE;
13832 				}
13833 
13834 				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13835 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13836 					    __FUNCTION__,
13837 					    old_map, old_map->pmap, old_entry,
13838 					    (uint64_t)old_entry->vme_start,
13839 					    (uint64_t)old_entry->vme_end,
13840 					    prot);
13841 				}
13842 
13843 				vm_object_pmap_protect(
13844 					VME_OBJECT(old_entry),
13845 					VME_OFFSET(old_entry),
13846 					(old_entry->vme_end -
13847 					old_entry->vme_start),
13848 					((old_entry->is_shared
13849 					|| old_map->mapped_in_other_pmaps)
13850 					? PMAP_NULL :
13851 					old_map->pmap),
13852 					VM_MAP_PAGE_SIZE(old_map),
13853 					old_entry->vme_start,
13854 					prot);
13855 
13856 				assert(old_entry->wired_count == 0);
13857 				old_entry->needs_copy = TRUE;
13858 			}
13859 			new_entry->needs_copy = new_entry_needs_copy;
13860 
13861 			/*
13862 			 *	Insert the entry at the end
13863 			 *	of the map.
13864 			 */
13865 
13866 			vm_map_store_entry_link(new_map,
13867 			    vm_map_last_entry(new_map),
13868 			    new_entry,
13869 			    VM_MAP_KERNEL_FLAGS_NONE);
13870 			new_size += entry_size;
13871 			break;
13872 
13873 slow_vm_map_fork_copy:
13874 			vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
13875 			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13876 				vm_map_copyin_flags |=
13877 				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13878 			}
13879 			if (vm_map_fork_copy(old_map,
13880 			    &old_entry,
13881 			    new_map,
13882 			    vm_map_copyin_flags)) {
13883 				new_size += entry_size;
13884 			}
13885 			continue;
13886 		}
13887 		old_entry = old_entry->vme_next;
13888 	}
13889 
13890 #if PMAP_FORK_NEST
13891 	new_entry = vm_map_last_entry(new_map);
13892 	if (new_entry == vm_map_to_entry(new_map)) {
13893 		/* unnest all that was pre-nested */
13894 		vm_map_fork_unnest(new_pmap,
13895 		    pre_nested_start, pre_nested_end,
13896 		    vm_map_min(new_map), vm_map_max(new_map));
13897 	} else if (new_entry->vme_end < vm_map_max(new_map)) {
13898 		/* unnest hole at the end, if pre-nested */
13899 		vm_map_fork_unnest(new_pmap,
13900 		    pre_nested_start, pre_nested_end,
13901 		    new_entry->vme_end, vm_map_max(new_map));
13902 	}
13903 #endif /* PMAP_FORK_NEST */
13904 
13905 #if defined(__arm64__)
13906 	pmap_insert_commpage(new_map->pmap);
13907 #endif /* __arm64__ */
13908 
13909 	new_map->size = new_size;
13910 
13911 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13912 		vm_map_corpse_footprint_collect_done(new_map);
13913 	}
13914 
13915 	/* Propagate JIT entitlement for the pmap layer. */
13916 	if (pmap_get_jit_entitled(old_map->pmap)) {
13917 		/* Tell the pmap that it supports JIT. */
13918 		pmap_set_jit_entitled(new_map->pmap);
13919 	}
13920 
13921 	/* Propagate TPRO settings for the pmap layer */
13922 	if (pmap_get_tpro(old_map->pmap)) {
13923 		/* Tell the pmap that it supports TPRO */
13924 		pmap_set_tpro(new_map->pmap);
13925 	}
13926 
13927 
13928 	vm_map_unlock(new_map);
13929 	vm_map_unlock(old_map);
13930 	vm_map_deallocate(old_map);
13931 
13932 	return new_map;
13933 }
13934 
13935 /*
13936  * vm_map_exec:
13937  *
13938  *      Setup the "new_map" with the proper execution environment according
13939  *	to the type of executable (platform, 64bit, chroot environment).
13940  *	Map the comm page and shared region, etc...
13941  */
13942 kern_return_t
vm_map_exec(vm_map_t new_map,task_t task,boolean_t is64bit,void * fsroot,cpu_type_t cpu,cpu_subtype_t cpu_subtype,boolean_t reslide,boolean_t is_driverkit,uint32_t rsr_version)13943 vm_map_exec(
13944 	vm_map_t        new_map,
13945 	task_t          task,
13946 	boolean_t       is64bit,
13947 	void            *fsroot,
13948 	cpu_type_t      cpu,
13949 	cpu_subtype_t   cpu_subtype,
13950 	boolean_t       reslide,
13951 	boolean_t       is_driverkit,
13952 	uint32_t        rsr_version)
13953 {
13954 	SHARED_REGION_TRACE_DEBUG(
13955 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13956 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13957 		(void *)VM_KERNEL_ADDRPERM(new_map),
13958 		(void *)VM_KERNEL_ADDRPERM(task),
13959 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13960 		cpu,
13961 		cpu_subtype));
13962 	(void) vm_commpage_enter(new_map, task, is64bit);
13963 
13964 	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13965 
13966 	SHARED_REGION_TRACE_DEBUG(
13967 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13968 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13969 		(void *)VM_KERNEL_ADDRPERM(new_map),
13970 		(void *)VM_KERNEL_ADDRPERM(task),
13971 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13972 		cpu,
13973 		cpu_subtype));
13974 
13975 	/*
13976 	 * Some devices have region(s) of memory that shouldn't get allocated by
13977 	 * user processes. The following code creates dummy vm_map_entry_t's for each
13978 	 * of the regions that needs to be reserved to prevent any allocations in
13979 	 * those regions.
13980 	 */
13981 	kern_return_t kr = KERN_FAILURE;
13982 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13983 	vmk_flags.vmkf_beyond_max = true;
13984 
13985 	const struct vm_reserved_region *regions = NULL;
13986 	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13987 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13988 
13989 	for (size_t i = 0; i < num_regions; ++i) {
13990 		vm_map_offset_t address = regions[i].vmrr_addr;
13991 
13992 		kr = vm_map_enter(
13993 			new_map,
13994 			&address,
13995 			regions[i].vmrr_size,
13996 			(vm_map_offset_t)0,
13997 			vmk_flags,
13998 			VM_OBJECT_NULL,
13999 			(vm_object_offset_t)0,
14000 			FALSE,
14001 			VM_PROT_NONE,
14002 			VM_PROT_NONE,
14003 			VM_INHERIT_COPY);
14004 
14005 		if (kr != KERN_SUCCESS) {
14006 			os_log_error(OS_LOG_DEFAULT, "Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
14007 			return KERN_FAILURE;
14008 		}
14009 	}
14010 
14011 	new_map->reserved_regions = (num_regions ? TRUE : FALSE);
14012 
14013 	return KERN_SUCCESS;
14014 }
14015 
14016 uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
14017 uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
14018 uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
14019 uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
14020 uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
14021 uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
14022 uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
14023 uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
14024 uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
14025 uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
14026 uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
14027 uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
14028 uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
14029 /*
14030  *	vm_map_lookup_and_lock_object:
14031  *
14032  *	Finds the VM object, offset, and
14033  *	protection for a given virtual address in the
14034  *	specified map, assuming a page fault of the
14035  *	type specified.
14036  *
14037  *	Returns the (object, offset, protection) for
14038  *	this address, whether it is wired down, and whether
14039  *	this map has the only reference to the data in question.
14040  *	In order to later verify this lookup, a "version"
14041  *	is returned.
14042  *	If contended != NULL, *contended will be set to
14043  *	true iff the thread had to spin or block to acquire
14044  *	an exclusive lock.
14045  *
14046  *	The map MUST be locked by the caller and WILL be
14047  *	locked on exit.  In order to guarantee the
14048  *	existence of the returned object, it is returned
14049  *	locked.
14050  *
14051  *	If a lookup is requested with "write protection"
14052  *	specified, the map may be changed to perform virtual
14053  *	copying operations, although the data referenced will
14054  *	remain the same.
14055  *
14056  *  If fault_info is provided, then the information is
14057  *  initialized according to the properties of the map entry
14058  *  NB: only properties of the entry are initialized,
14059  *  namely:
14060  *    - user_tag
14061  *    - pmap_options
14062  *    - iokit_acct
14063  *    - behavior
14064  *    - lo_offset
14065  *    - hi_offset
14066  *    - no_cache
14067  *    - cs_bypass
14068  *    - csm_associated
14069  *    - resilient_media
14070  *    - vme_xnu_user_debug
14071  *    - vme_no_copy_on_read
14072  *    - used_for_tpro
14073  */
14074 kern_return_t
vm_map_lookup_and_lock_object(vm_map_t * var_map,vm_map_offset_t vaddr,vm_prot_t fault_type,int object_lock_type,vm_map_version_t * out_version,vm_object_t * object,vm_object_offset_t * offset,vm_prot_t * out_prot,boolean_t * wired,vm_object_fault_info_t fault_info,vm_map_t * real_map,bool * contended)14075 vm_map_lookup_and_lock_object(
14076 	vm_map_t                *var_map,       /* IN/OUT */
14077 	vm_map_offset_t         vaddr,
14078 	vm_prot_t               fault_type,
14079 	int                     object_lock_type,
14080 	vm_map_version_t        *out_version,   /* OUT */
14081 	vm_object_t             *object,        /* OUT */
14082 	vm_object_offset_t      *offset,        /* OUT */
14083 	vm_prot_t               *out_prot,      /* OUT */
14084 	boolean_t               *wired,         /* OUT */
14085 	vm_object_fault_info_t  fault_info,     /* OUT */
14086 	vm_map_t                *real_map,      /* OUT */
14087 	bool                    *contended)     /* OUT */
14088 {
14089 	vm_map_entry_t                  entry;
14090 	vm_map_t                        map = *var_map;
14091 	vm_map_t                        old_map = *var_map;
14092 	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
14093 	vm_map_offset_t                 cow_parent_vaddr = 0;
14094 	vm_map_offset_t                 old_start = 0;
14095 	vm_map_offset_t                 old_end = 0;
14096 	vm_prot_t                       prot;
14097 	boolean_t                       mask_protections;
14098 	boolean_t                       force_copy;
14099 	boolean_t                       no_force_copy_if_executable;
14100 	boolean_t                       submap_needed_copy;
14101 	vm_prot_t                       original_fault_type;
14102 	vm_map_size_t                   fault_page_mask;
14103 
14104 	/*
14105 	 * VM_PROT_MASK means that the caller wants us to use "fault_type"
14106 	 * as a mask against the mapping's actual protections, not as an
14107 	 * absolute value.
14108 	 */
14109 	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
14110 	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
14111 	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
14112 	fault_type &= VM_PROT_ALL;
14113 	original_fault_type = fault_type;
14114 	if (contended) {
14115 		*contended = false;
14116 	}
14117 
14118 	*real_map = map;
14119 
14120 	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
14121 	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
14122 
14123 RetryLookup:
14124 	fault_type = original_fault_type;
14125 
14126 	/*
14127 	 *	If the map has an interesting hint, try it before calling
14128 	 *	full blown lookup routine.
14129 	 */
14130 	entry = map->hint;
14131 
14132 	if ((entry == vm_map_to_entry(map)) ||
14133 	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
14134 		vm_map_entry_t  tmp_entry;
14135 
14136 		/*
14137 		 *	Entry was either not a valid hint, or the vaddr
14138 		 *	was not contained in the entry, so do a full lookup.
14139 		 */
14140 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
14141 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14142 				vm_map_unlock(cow_sub_map_parent);
14143 			}
14144 			if ((*real_map != map)
14145 			    && (*real_map != cow_sub_map_parent)) {
14146 				vm_map_unlock(*real_map);
14147 			}
14148 			return KERN_INVALID_ADDRESS;
14149 		}
14150 
14151 		entry = tmp_entry;
14152 	}
14153 	if (map == old_map) {
14154 		old_start = entry->vme_start;
14155 		old_end = entry->vme_end;
14156 	}
14157 
14158 	/*
14159 	 *	Handle submaps.  Drop lock on upper map, submap is
14160 	 *	returned locked.
14161 	 */
14162 
14163 	submap_needed_copy = FALSE;
14164 submap_recurse:
14165 	if (entry->is_sub_map) {
14166 		vm_map_offset_t         local_vaddr;
14167 		vm_map_offset_t         end_delta;
14168 		vm_map_offset_t         start_delta;
14169 		vm_map_offset_t         top_entry_saved_start;
14170 		vm_object_offset_t      top_entry_saved_offset;
14171 		vm_map_entry_t          submap_entry, saved_submap_entry;
14172 		vm_object_offset_t      submap_entry_offset;
14173 		vm_object_size_t        submap_entry_size;
14174 		vm_prot_t               subentry_protection;
14175 		vm_prot_t               subentry_max_protection;
14176 		boolean_t               subentry_no_copy_on_read;
14177 		boolean_t               subentry_permanent;
14178 		boolean_t               subentry_csm_associated;
14179 #if __arm64e__
14180 		boolean_t               subentry_used_for_tpro;
14181 #endif /* __arm64e__ */
14182 		boolean_t               mapped_needs_copy = FALSE;
14183 		vm_map_version_t        version;
14184 
14185 		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
14186 		    "map %p (%d) entry %p submap %p (%d)\n",
14187 		    map, VM_MAP_PAGE_SHIFT(map), entry,
14188 		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
14189 
14190 		local_vaddr = vaddr;
14191 		top_entry_saved_start = entry->vme_start;
14192 		top_entry_saved_offset = VME_OFFSET(entry);
14193 
14194 		if ((entry->use_pmap &&
14195 		    !((fault_type & VM_PROT_WRITE) ||
14196 		    force_copy))) {
14197 			/* if real_map equals map we unlock below */
14198 			if ((*real_map != map) &&
14199 			    (*real_map != cow_sub_map_parent)) {
14200 				vm_map_unlock(*real_map);
14201 			}
14202 			*real_map = VME_SUBMAP(entry);
14203 		}
14204 
14205 		if (entry->needs_copy &&
14206 		    ((fault_type & VM_PROT_WRITE) ||
14207 		    force_copy)) {
14208 			if (!mapped_needs_copy) {
14209 				if (vm_map_lock_read_to_write(map)) {
14210 					vm_map_lock_read(map);
14211 					*real_map = map;
14212 					goto RetryLookup;
14213 				}
14214 				vm_map_lock_read(VME_SUBMAP(entry));
14215 				*var_map = VME_SUBMAP(entry);
14216 				cow_sub_map_parent = map;
14217 				/* reset base to map before cow object */
14218 				/* this is the map which will accept   */
14219 				/* the new cow object */
14220 				old_start = entry->vme_start;
14221 				old_end = entry->vme_end;
14222 				cow_parent_vaddr = vaddr;
14223 				mapped_needs_copy = TRUE;
14224 			} else {
14225 				vm_map_lock_read(VME_SUBMAP(entry));
14226 				*var_map = VME_SUBMAP(entry);
14227 				if ((cow_sub_map_parent != map) &&
14228 				    (*real_map != map)) {
14229 					vm_map_unlock(map);
14230 				}
14231 			}
14232 		} else {
14233 			if (entry->needs_copy) {
14234 				submap_needed_copy = TRUE;
14235 			}
14236 			vm_map_lock_read(VME_SUBMAP(entry));
14237 			*var_map = VME_SUBMAP(entry);
14238 			/* leave map locked if it is a target */
14239 			/* cow sub_map above otherwise, just  */
14240 			/* follow the maps down to the object */
14241 			/* here we unlock knowing we are not  */
14242 			/* revisiting the map.  */
14243 			if ((*real_map != map) && (map != cow_sub_map_parent)) {
14244 				vm_map_unlock_read(map);
14245 			}
14246 		}
14247 
14248 		entry = NULL;
14249 		map = *var_map;
14250 
14251 		/* calculate the offset in the submap for vaddr */
14252 		local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
14253 		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
14254 		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
14255 		    (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
14256 
14257 RetrySubMap:
14258 		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
14259 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14260 				vm_map_unlock(cow_sub_map_parent);
14261 			}
14262 			if ((*real_map != map)
14263 			    && (*real_map != cow_sub_map_parent)) {
14264 				vm_map_unlock(*real_map);
14265 			}
14266 			*real_map = map;
14267 			return KERN_INVALID_ADDRESS;
14268 		}
14269 
14270 		/* find the attenuated shadow of the underlying object */
14271 		/* on our target map */
14272 
14273 		/* in english the submap object may extend beyond the     */
14274 		/* region mapped by the entry or, may only fill a portion */
14275 		/* of it.  For our purposes, we only care if the object   */
14276 		/* doesn't fill.  In this case the area which will        */
14277 		/* ultimately be clipped in the top map will only need    */
14278 		/* to be as big as the portion of the underlying entry    */
14279 		/* which is mapped */
14280 		start_delta = submap_entry->vme_start > top_entry_saved_offset ?
14281 		    submap_entry->vme_start - top_entry_saved_offset : 0;
14282 
14283 		end_delta =
14284 		    (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
14285 		    submap_entry->vme_end ?
14286 		    0 : (top_entry_saved_offset +
14287 		    (old_end - old_start))
14288 		    - submap_entry->vme_end;
14289 
14290 		old_start += start_delta;
14291 		old_end -= end_delta;
14292 
14293 		if (submap_entry->is_sub_map) {
14294 			entry = submap_entry;
14295 			vaddr = local_vaddr;
14296 			goto submap_recurse;
14297 		}
14298 
14299 		if (((fault_type & VM_PROT_WRITE) ||
14300 		    force_copy)
14301 		    && cow_sub_map_parent) {
14302 			vm_object_t     sub_object, copy_object;
14303 			vm_object_offset_t copy_offset;
14304 			vm_map_offset_t local_start;
14305 			vm_map_offset_t local_end;
14306 			boolean_t       object_copied = FALSE;
14307 			vm_object_offset_t object_copied_offset = 0;
14308 			boolean_t       object_copied_needs_copy = FALSE;
14309 			kern_return_t   kr = KERN_SUCCESS;
14310 
14311 			if (vm_map_lock_read_to_write(map)) {
14312 				vm_map_lock_read(map);
14313 				old_start -= start_delta;
14314 				old_end += end_delta;
14315 				goto RetrySubMap;
14316 			}
14317 
14318 
14319 			sub_object = VME_OBJECT(submap_entry);
14320 			if (sub_object == VM_OBJECT_NULL) {
14321 				sub_object =
14322 				    vm_object_allocate(
14323 					(vm_map_size_t)
14324 					(submap_entry->vme_end -
14325 					submap_entry->vme_start), map->serial_id);
14326 				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
14327 				VME_OFFSET_SET(submap_entry, 0);
14328 				assert(!submap_entry->is_sub_map);
14329 				assert(submap_entry->use_pmap);
14330 			}
14331 			local_start =  local_vaddr -
14332 			    (cow_parent_vaddr - old_start);
14333 			local_end = local_vaddr +
14334 			    (old_end - cow_parent_vaddr);
14335 			vm_map_clip_start(map, submap_entry, local_start);
14336 			vm_map_clip_end(map, submap_entry, local_end);
14337 			if (submap_entry->is_sub_map) {
14338 				/* unnesting was done when clipping */
14339 				assert(!submap_entry->use_pmap);
14340 			}
14341 
14342 			/* This is the COW case, lets connect */
14343 			/* an entry in our space to the underlying */
14344 			/* object in the submap, bypassing the  */
14345 			/* submap. */
14346 			submap_entry_offset = VME_OFFSET(submap_entry);
14347 			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
14348 
14349 			if ((submap_entry->wired_count != 0 ||
14350 			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
14351 			    (submap_entry->protection & VM_PROT_EXECUTE) &&
14352 			    no_force_copy_if_executable) {
14353 //				printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
14354 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14355 					vm_map_unlock(cow_sub_map_parent);
14356 				}
14357 				if ((*real_map != map)
14358 				    && (*real_map != cow_sub_map_parent)) {
14359 					vm_map_unlock(*real_map);
14360 				}
14361 				*real_map = map;
14362 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
14363 				vm_map_lock_write_to_read(map);
14364 				kr = KERN_PROTECTION_FAILURE;
14365 				DTRACE_VM4(submap_no_copy_executable,
14366 				    vm_map_t, map,
14367 				    vm_object_offset_t, submap_entry_offset,
14368 				    vm_object_size_t, submap_entry_size,
14369 				    int, kr);
14370 				return kr;
14371 			}
14372 
14373 			if (submap_entry->wired_count != 0) {
14374 				vm_object_reference(sub_object);
14375 
14376 				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
14377 				    "submap_entry %p offset 0x%llx\n",
14378 				    submap_entry, VME_OFFSET(submap_entry));
14379 
14380 				DTRACE_VM6(submap_copy_slowly,
14381 				    vm_map_t, cow_sub_map_parent,
14382 				    vm_map_offset_t, vaddr,
14383 				    vm_map_t, map,
14384 				    vm_object_size_t, submap_entry_size,
14385 				    int, submap_entry->wired_count,
14386 				    int, sub_object->copy_strategy);
14387 
14388 				saved_submap_entry = submap_entry;
14389 				version.main_timestamp = map->timestamp;
14390 				vm_map_unlock(map); /* Increments timestamp by 1 */
14391 				submap_entry = VM_MAP_ENTRY_NULL;
14392 
14393 				vm_object_lock(sub_object);
14394 				kr = vm_object_copy_slowly(sub_object,
14395 				    submap_entry_offset,
14396 				    submap_entry_size,
14397 				    FALSE, /* interruptible */
14398 				    &copy_object);
14399 				object_copied = TRUE;
14400 				object_copied_offset = 0;
14401 				/* 4k: account for extra offset in physical page */
14402 				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
14403 				object_copied_needs_copy = FALSE;
14404 				vm_object_deallocate(sub_object);
14405 
14406 				vm_map_lock(map);
14407 
14408 				if (kr != KERN_SUCCESS &&
14409 				    kr != KERN_MEMORY_RESTART_COPY) {
14410 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14411 						vm_map_unlock(cow_sub_map_parent);
14412 					}
14413 					if ((*real_map != map)
14414 					    && (*real_map != cow_sub_map_parent)) {
14415 						vm_map_unlock(*real_map);
14416 					}
14417 					*real_map = map;
14418 					vm_object_deallocate(copy_object);
14419 					copy_object = VM_OBJECT_NULL;
14420 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
14421 					vm_map_lock_write_to_read(map);
14422 					DTRACE_VM4(submap_copy_error_slowly,
14423 					    vm_object_t, sub_object,
14424 					    vm_object_offset_t, submap_entry_offset,
14425 					    vm_object_size_t, submap_entry_size,
14426 					    int, kr);
14427 					vm_map_lookup_and_lock_object_copy_slowly_error++;
14428 					return kr;
14429 				}
14430 
14431 				if ((kr == KERN_SUCCESS) &&
14432 				    (version.main_timestamp + 1) == map->timestamp) {
14433 					submap_entry = saved_submap_entry;
14434 				} else {
14435 					saved_submap_entry = NULL;
14436 					old_start -= start_delta;
14437 					old_end += end_delta;
14438 					vm_object_deallocate(copy_object);
14439 					copy_object = VM_OBJECT_NULL;
14440 					vm_map_lock_write_to_read(map);
14441 					vm_map_lookup_and_lock_object_copy_slowly_restart++;
14442 					goto RetrySubMap;
14443 				}
14444 				vm_map_lookup_and_lock_object_copy_slowly_count++;
14445 				vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
14446 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
14447 					vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
14448 				}
14449 			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
14450 				submap_entry_offset = VME_OFFSET(submap_entry);
14451 				copy_object = VM_OBJECT_NULL;
14452 				object_copied_offset = submap_entry_offset;
14453 				object_copied_needs_copy = FALSE;
14454 				DTRACE_VM6(submap_copy_strategically,
14455 				    vm_map_t, cow_sub_map_parent,
14456 				    vm_map_offset_t, vaddr,
14457 				    vm_map_t, map,
14458 				    vm_object_size_t, submap_entry_size,
14459 				    int, submap_entry->wired_count,
14460 				    int, sub_object->copy_strategy);
14461 				kr = vm_object_copy_strategically(
14462 					sub_object,
14463 					submap_entry_offset,
14464 					submap_entry->vme_end - submap_entry->vme_start,
14465 					false, /* forking */
14466 					&copy_object,
14467 					&object_copied_offset,
14468 					&object_copied_needs_copy);
14469 				if (kr == KERN_MEMORY_RESTART_COPY) {
14470 					old_start -= start_delta;
14471 					old_end += end_delta;
14472 					vm_object_deallocate(copy_object);
14473 					copy_object = VM_OBJECT_NULL;
14474 					vm_map_lock_write_to_read(map);
14475 					vm_map_lookup_and_lock_object_copy_strategically_restart++;
14476 					goto RetrySubMap;
14477 				}
14478 				if (kr != KERN_SUCCESS) {
14479 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14480 						vm_map_unlock(cow_sub_map_parent);
14481 					}
14482 					if ((*real_map != map)
14483 					    && (*real_map != cow_sub_map_parent)) {
14484 						vm_map_unlock(*real_map);
14485 					}
14486 					*real_map = map;
14487 					vm_object_deallocate(copy_object);
14488 					copy_object = VM_OBJECT_NULL;
14489 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14490 					vm_map_lock_write_to_read(map);
14491 					DTRACE_VM4(submap_copy_error_strategically,
14492 					    vm_object_t, sub_object,
14493 					    vm_object_offset_t, submap_entry_offset,
14494 					    vm_object_size_t, submap_entry_size,
14495 					    int, kr);
14496 					vm_map_lookup_and_lock_object_copy_strategically_error++;
14497 					return kr;
14498 				}
14499 				assert(copy_object != VM_OBJECT_NULL);
14500 				assert(copy_object != sub_object);
14501 				object_copied = TRUE;
14502 				vm_map_lookup_and_lock_object_copy_strategically_count++;
14503 				vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14504 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14505 					vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14506 				}
14507 			} else {
14508 				/* set up shadow object */
14509 				object_copied = FALSE;
14510 				copy_object = sub_object;
14511 				vm_object_lock(sub_object);
14512 				vm_object_reference_locked(sub_object);
14513 				VM_OBJECT_SET_SHADOWED(sub_object, TRUE);
14514 				vm_object_unlock(sub_object);
14515 
14516 				assert(submap_entry->wired_count == 0);
14517 				submap_entry->needs_copy = TRUE;
14518 
14519 				prot = submap_entry->protection;
14520 				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14521 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14522 					    __FUNCTION__,
14523 					    map, map->pmap, submap_entry,
14524 					    (uint64_t)submap_entry->vme_start,
14525 					    (uint64_t)submap_entry->vme_end,
14526 					    prot);
14527 				}
14528 				prot = prot & ~VM_PROT_WRITE;
14529 				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14530 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14531 					    __FUNCTION__,
14532 					    map, map->pmap, submap_entry,
14533 					    (uint64_t)submap_entry->vme_start,
14534 					    (uint64_t)submap_entry->vme_end,
14535 					    prot);
14536 				}
14537 
14538 				if (override_nx(old_map,
14539 				    VME_ALIAS(submap_entry))
14540 				    && prot) {
14541 					prot |= VM_PROT_EXECUTE;
14542 				}
14543 
14544 				vm_object_pmap_protect(
14545 					sub_object,
14546 					VME_OFFSET(submap_entry),
14547 					submap_entry->vme_end -
14548 					submap_entry->vme_start,
14549 					(submap_entry->is_shared
14550 					|| map->mapped_in_other_pmaps) ?
14551 					PMAP_NULL : map->pmap,
14552 					VM_MAP_PAGE_SIZE(map),
14553 					submap_entry->vme_start,
14554 					prot);
14555 				vm_map_lookup_and_lock_object_copy_shadow_count++;
14556 				vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14557 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14558 					vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14559 				}
14560 			}
14561 
14562 			/*
14563 			 * Adjust the fault offset to the submap entry.
14564 			 */
14565 			copy_offset = (local_vaddr -
14566 			    submap_entry->vme_start +
14567 			    VME_OFFSET(submap_entry));
14568 
14569 			/* This works diffently than the   */
14570 			/* normal submap case. We go back  */
14571 			/* to the parent of the cow map and*/
14572 			/* clip out the target portion of  */
14573 			/* the sub_map, substituting the   */
14574 			/* new copy object,                */
14575 
14576 			subentry_protection = submap_entry->protection;
14577 			subentry_max_protection = submap_entry->max_protection;
14578 			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14579 			subentry_permanent = submap_entry->vme_permanent;
14580 			subentry_csm_associated = submap_entry->csm_associated;
14581 #if __arm64e__
14582 			subentry_used_for_tpro = submap_entry->used_for_tpro;
14583 #endif // __arm64e__
14584 			vm_map_unlock(map);
14585 			submap_entry = NULL; /* not valid after map unlock */
14586 
14587 			local_start = old_start;
14588 			local_end = old_end;
14589 			map = cow_sub_map_parent;
14590 			*var_map = cow_sub_map_parent;
14591 			vaddr = cow_parent_vaddr;
14592 			cow_sub_map_parent = NULL;
14593 
14594 			if (!vm_map_lookup_entry(map,
14595 			    vaddr, &entry)) {
14596 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14597 					vm_map_unlock(cow_sub_map_parent);
14598 				}
14599 				if ((*real_map != map)
14600 				    && (*real_map != cow_sub_map_parent)) {
14601 					vm_map_unlock(*real_map);
14602 				}
14603 				*real_map = map;
14604 				vm_object_deallocate(
14605 					copy_object);
14606 				copy_object = VM_OBJECT_NULL;
14607 				vm_map_lock_write_to_read(map);
14608 				DTRACE_VM4(submap_lookup_post_unlock,
14609 				    uint64_t, (uint64_t)entry->vme_start,
14610 				    uint64_t, (uint64_t)entry->vme_end,
14611 				    vm_map_offset_t, vaddr,
14612 				    int, object_copied);
14613 				return KERN_INVALID_ADDRESS;
14614 			}
14615 
14616 			/* clip out the portion of space */
14617 			/* mapped by the sub map which   */
14618 			/* corresponds to the underlying */
14619 			/* object */
14620 
14621 			/*
14622 			 * Clip (and unnest) the smallest nested chunk
14623 			 * possible around the faulting address...
14624 			 */
14625 			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14626 			local_end = local_start + pmap_shared_region_size_min(map->pmap);
14627 			/*
14628 			 * ... but don't go beyond the "old_start" to "old_end"
14629 			 * range, to avoid spanning over another VM region
14630 			 * with a possibly different VM object and/or offset.
14631 			 */
14632 			if (local_start < old_start) {
14633 				local_start = old_start;
14634 			}
14635 			if (local_end > old_end) {
14636 				local_end = old_end;
14637 			}
14638 			/*
14639 			 * Adjust copy_offset to the start of the range.
14640 			 */
14641 			copy_offset -= (vaddr - local_start);
14642 
14643 			vm_map_clip_start(map, entry, local_start);
14644 			vm_map_clip_end(map, entry, local_end);
14645 			if (entry->is_sub_map) {
14646 				/* unnesting was done when clipping */
14647 				assert(!entry->use_pmap);
14648 			}
14649 
14650 			/* substitute copy object for */
14651 			/* shared map entry           */
14652 			vm_map_deallocate(VME_SUBMAP(entry));
14653 			assert(!entry->iokit_acct);
14654 			entry->use_pmap = TRUE;
14655 			VME_OBJECT_SET(entry, copy_object, false, 0);
14656 
14657 			/* propagate the submap entry's protections */
14658 			if (entry->protection != VM_PROT_READ) {
14659 				/*
14660 				 * Someone has already altered the top entry's
14661 				 * protections via vm_protect(VM_PROT_COPY).
14662 				 * Respect these new values and ignore the
14663 				 * submap entry's protections.
14664 				 */
14665 			} else {
14666 				/*
14667 				 * Regular copy-on-write: propagate the submap
14668 				 * entry's protections to the top map entry.
14669 				 */
14670 				entry->protection |= subentry_protection;
14671 			}
14672 			entry->max_protection |= subentry_max_protection;
14673 			/* propagate some attributes from subentry */
14674 			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14675 			entry->vme_permanent = subentry_permanent;
14676 			entry->csm_associated = subentry_csm_associated;
14677 #if __arm64e__
14678 			/* propagate TPRO iff the destination map has TPRO enabled */
14679 			if (subentry_used_for_tpro) {
14680 				if (vm_map_tpro(map)) {
14681 					entry->used_for_tpro = subentry_used_for_tpro;
14682 				} else {
14683 					/* "permanent" came from being TPRO */
14684 					entry->vme_permanent = FALSE;
14685 				}
14686 			}
14687 #endif /* __arm64e */
14688 			if ((entry->protection & VM_PROT_WRITE) &&
14689 			    (entry->protection & VM_PROT_EXECUTE) &&
14690 #if XNU_TARGET_OS_OSX
14691 			    map->pmap != kernel_pmap &&
14692 			    (vm_map_cs_enforcement(map)
14693 #if __arm64__
14694 			    || !VM_MAP_IS_EXOTIC(map)
14695 #endif /* __arm64__ */
14696 			    ) &&
14697 #endif /* XNU_TARGET_OS_OSX */
14698 #if CODE_SIGNING_MONITOR
14699 			    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14700 #endif
14701 			    !(entry->used_for_jit) &&
14702 			    VM_MAP_POLICY_WX_STRIP_X(map)) {
14703 				DTRACE_VM3(cs_wx,
14704 				    uint64_t, (uint64_t)entry->vme_start,
14705 				    uint64_t, (uint64_t)entry->vme_end,
14706 				    vm_prot_t, entry->protection);
14707 				printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14708 				    proc_selfpid(),
14709 				    (get_bsdtask_info(current_task())
14710 				    ? proc_name_address(get_bsdtask_info(current_task()))
14711 				    : "?"),
14712 				    __FUNCTION__, __LINE__,
14713 #if DEVELOPMENT || DEBUG
14714 				    (uint64_t)entry->vme_start,
14715 				    (uint64_t)entry->vme_end,
14716 #else /* DEVELOPMENT || DEBUG */
14717 				    (uint64_t)0,
14718 				    (uint64_t)0,
14719 #endif /* DEVELOPMENT || DEBUG */
14720 				    entry->protection);
14721 				entry->protection &= ~VM_PROT_EXECUTE;
14722 			}
14723 
14724 			if (object_copied) {
14725 				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14726 				entry->needs_copy = object_copied_needs_copy;
14727 				entry->is_shared = FALSE;
14728 			} else {
14729 				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14730 				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14731 				assert(entry->wired_count == 0);
14732 				VME_OFFSET_SET(entry, copy_offset);
14733 				entry->needs_copy = TRUE;
14734 				if (map != old_map) {
14735 					entry->is_shared = TRUE;
14736 				}
14737 			}
14738 			if (entry->inheritance == VM_INHERIT_SHARE) {
14739 				entry->inheritance = VM_INHERIT_COPY;
14740 			}
14741 
14742 			vm_map_lock_write_to_read(map);
14743 		} else {
14744 			if ((cow_sub_map_parent)
14745 			    && (cow_sub_map_parent != *real_map)
14746 			    && (cow_sub_map_parent != map)) {
14747 				vm_map_unlock(cow_sub_map_parent);
14748 			}
14749 			entry = submap_entry;
14750 			vaddr = local_vaddr;
14751 		}
14752 	}
14753 
14754 	/*
14755 	 *	Check whether this task is allowed to have
14756 	 *	this page.
14757 	 */
14758 
14759 	prot = entry->protection;
14760 
14761 	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14762 		/*
14763 		 * HACK -- if not a stack, then allow execution
14764 		 */
14765 		prot |= VM_PROT_EXECUTE;
14766 	}
14767 
14768 #if __arm64e__
14769 	/*
14770 	 * If the entry we're dealing with is TPRO and we have a write
14771 	 * fault, inject VM_PROT_WRITE into protections. This allows us
14772 	 * to maintain RO permissions when not marked as TPRO.
14773 	 */
14774 	if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14775 		prot |= VM_PROT_WRITE;
14776 	}
14777 #endif /* __arm64e__ */
14778 	if (mask_protections) {
14779 		fault_type &= prot;
14780 		if (fault_type == VM_PROT_NONE) {
14781 			goto protection_failure;
14782 		}
14783 	}
14784 	if (((fault_type & prot) != fault_type)
14785 #if __arm64__
14786 	    /* prefetch abort in execute-only page */
14787 	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14788 #elif defined(__x86_64__)
14789 	    /* Consider the UEXEC bit when handling an EXECUTE fault */
14790 	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14791 #endif
14792 	    ) {
14793 protection_failure:
14794 		if (*real_map != map) {
14795 			vm_map_unlock(*real_map);
14796 		}
14797 		*real_map = map;
14798 
14799 		if ((fault_type & VM_PROT_EXECUTE) && prot) {
14800 			log_stack_execution_failure((addr64_t)vaddr, prot);
14801 		}
14802 
14803 		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14804 		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14805 		/*
14806 		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14807 		 *
14808 		 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14809 		 */
14810 		return KERN_PROTECTION_FAILURE;
14811 	}
14812 
14813 	/*
14814 	 *	If this page is not pageable, we have to get
14815 	 *	it for all possible accesses.
14816 	 */
14817 
14818 	*wired = (entry->wired_count != 0);
14819 	if (*wired) {
14820 		fault_type = prot;
14821 	}
14822 
14823 	/*
14824 	 *	If the entry was copy-on-write, we either ...
14825 	 */
14826 
14827 	if (entry->needs_copy) {
14828 		/*
14829 		 *	If we want to write the page, we may as well
14830 		 *	handle that now since we've got the map locked.
14831 		 *
14832 		 *	If we don't need to write the page, we just
14833 		 *	demote the permissions allowed.
14834 		 */
14835 
14836 		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14837 			/*
14838 			 *	Make a new object, and place it in the
14839 			 *	object chain.  Note that no new references
14840 			 *	have appeared -- one just moved from the
14841 			 *	map to the new object.
14842 			 */
14843 
14844 			if (vm_map_lock_read_to_write(map)) {
14845 				vm_map_lock_read(map);
14846 				goto RetryLookup;
14847 			}
14848 
14849 			if (VME_OBJECT(entry)->shadowed == FALSE) {
14850 				vm_object_lock(VME_OBJECT(entry));
14851 				VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE);
14852 				vm_object_unlock(VME_OBJECT(entry));
14853 			}
14854 			VME_OBJECT_SHADOW(entry,
14855 			    (vm_map_size_t) (entry->vme_end -
14856 			    entry->vme_start),
14857 			    vm_map_always_shadow(map));
14858 			entry->needs_copy = FALSE;
14859 
14860 			vm_map_lock_write_to_read(map);
14861 		}
14862 		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14863 			/*
14864 			 *	We're attempting to read a copy-on-write
14865 			 *	page -- don't allow writes.
14866 			 */
14867 
14868 			prot &= (~VM_PROT_WRITE);
14869 		}
14870 	}
14871 
14872 	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14873 		/*
14874 		 * We went through a "needs_copy" submap without triggering
14875 		 * a copy, so granting write access to the page would bypass
14876 		 * that submap's "needs_copy".
14877 		 */
14878 		assert(!(fault_type & VM_PROT_WRITE));
14879 		assert(!*wired);
14880 		assert(!force_copy);
14881 		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14882 		prot &= ~VM_PROT_WRITE;
14883 	}
14884 
14885 	/*
14886 	 *	Create an object if necessary.
14887 	 */
14888 	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14889 		if (vm_map_lock_read_to_write(map)) {
14890 			vm_map_lock_read(map);
14891 			goto RetryLookup;
14892 		}
14893 
14894 		VME_OBJECT_SET(entry,
14895 		    vm_object_allocate(
14896 			    (vm_map_size_t)(entry->vme_end -
14897 			    entry->vme_start),
14898 			    map->serial_id
14899 			    ), false, 0);
14900 		VME_OFFSET_SET(entry, 0);
14901 		assert(entry->use_pmap);
14902 		vm_map_lock_write_to_read(map);
14903 	}
14904 
14905 	/*
14906 	 *	Return the object/offset from this entry.  If the entry
14907 	 *	was copy-on-write or empty, it has been fixed up.  Also
14908 	 *	return the protection.
14909 	 */
14910 
14911 	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14912 	*object = VME_OBJECT(entry);
14913 	*out_prot = prot;
14914 	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14915 
14916 	if (fault_info) {
14917 		/*
14918 		 * Initialize fault information according to the entry being faulted
14919 		 * from.
14920 		 */
14921 		fault_info->user_tag = VME_ALIAS(entry);
14922 		fault_info->pmap_options = 0;
14923 		if (entry->iokit_acct ||
14924 		    (!entry->is_sub_map && !entry->use_pmap)) {
14925 			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14926 		}
14927 		if (fault_info->behavior == VM_BEHAVIOR_DEFAULT) {
14928 			fault_info->behavior = entry->behavior;
14929 		}
14930 		fault_info->lo_offset = VME_OFFSET(entry);
14931 		fault_info->hi_offset =
14932 		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14933 		fault_info->no_cache  = entry->no_cache;
14934 		fault_info->io_sync = FALSE;
14935 		fault_info->cs_bypass = (entry->used_for_jit ||
14936 #if CODE_SIGNING_MONITOR
14937 		    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14938 #endif
14939 		    entry->vme_resilient_codesign);
14940 		fault_info->mark_zf_absent = FALSE;
14941 		fault_info->batch_pmap_op = FALSE;
14942 		/*
14943 		 * The pmap layer will validate this page
14944 		 * before allowing it to be executed from.
14945 		 */
14946 #if CODE_SIGNING_MONITOR
14947 		fault_info->csm_associated = entry->csm_associated;
14948 #else
14949 		fault_info->csm_associated = FALSE;
14950 #endif
14951 
14952 		fault_info->resilient_media = entry->vme_resilient_media;
14953 		fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14954 		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14955 #if __arm64e__
14956 		fault_info->fi_used_for_tpro = entry->used_for_tpro;
14957 #else /* __arm64e__ */
14958 		fault_info->fi_used_for_tpro = FALSE;
14959 #endif
14960 		if (entry->translated_allow_execute) {
14961 			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14962 		}
14963 	}
14964 
14965 	/*
14966 	 *	Lock the object to prevent it from disappearing
14967 	 */
14968 	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14969 		if (contended == NULL) {
14970 			vm_object_lock(*object);
14971 		} else {
14972 			*contended = vm_object_lock_check_contended(*object);
14973 		}
14974 	} else {
14975 		vm_object_lock_shared(*object);
14976 	}
14977 
14978 	/*
14979 	 *	Save the version number
14980 	 */
14981 
14982 	out_version->main_timestamp = map->timestamp;
14983 
14984 	return KERN_SUCCESS;
14985 }
14986 
14987 
14988 /*
14989  *	vm_map_verify:
14990  *
14991  *	Verifies that the map in question has not changed
14992  *	since the given version. The map has to be locked
14993  *	("shared" mode is fine) before calling this function
14994  *	and it will be returned locked too.
14995  */
14996 boolean_t
vm_map_verify(vm_map_t map,vm_map_version_t * version)14997 vm_map_verify(
14998 	vm_map_t                map,
14999 	vm_map_version_t        *version)       /* REF */
15000 {
15001 	boolean_t       result;
15002 
15003 	vm_map_lock_assert_held(map);
15004 	result = (map->timestamp == version->main_timestamp);
15005 
15006 	return result;
15007 }
15008 
15009 
15010 /*
15011  *	TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
15012  *	Goes away after regular vm_region_recurse function migrates to
15013  *	64 bits
15014  *	vm_region_recurse: A form of vm_region which follows the
15015  *	submaps in a target map
15016  *
15017  */
15018 
15019 kern_return_t
vm_map_region_recurse_64(vm_map_t map,vm_map_offset_ut * address_u,vm_map_size_ut * size_u,natural_t * nesting_depth,vm_region_submap_info_64_t submap_info,mach_msg_type_number_t * count)15020 vm_map_region_recurse_64(
15021 	vm_map_t                map,
15022 	vm_map_offset_ut       *address_u,      /* IN/OUT */
15023 	vm_map_size_ut         *size_u,         /* OUT */
15024 	natural_t              *nesting_depth,  /* IN/OUT */
15025 	vm_region_submap_info_64_t submap_info, /* IN/OUT */
15026 	mach_msg_type_number_t *count)          /* IN/OUT */
15027 {
15028 	mach_msg_type_number_t  original_count;
15029 	vm_region_extended_info_data_t  extended;
15030 	vm_map_entry_t                  tmp_entry;
15031 	vm_map_offset_t                 user_address;
15032 	unsigned int                    user_max_depth;
15033 
15034 	/*
15035 	 * "curr_entry" is the VM map entry preceding or including the
15036 	 * address we're looking for.
15037 	 * "curr_map" is the map or sub-map containing "curr_entry".
15038 	 * "curr_address" is the equivalent of the top map's "user_address"
15039 	 * in the current map.
15040 	 * "curr_offset" is the cumulated offset of "curr_map" in the
15041 	 * target task's address space.
15042 	 * "curr_depth" is the depth of "curr_map" in the chain of
15043 	 * sub-maps.
15044 	 *
15045 	 * "curr_max_below" and "curr_max_above" limit the range (around
15046 	 * "curr_address") we should take into account in the current (sub)map.
15047 	 * They limit the range to what's visible through the map entries
15048 	 * we've traversed from the top map to the current map.
15049 	 *
15050 	 */
15051 	vm_map_entry_t                  curr_entry;
15052 	vm_map_t                        curr_entry_submap;
15053 	vm_map_address_t                curr_entry_start;
15054 	vm_object_offset_t              curr_entry_offset;
15055 	vm_map_address_t                curr_address;
15056 	vm_map_offset_t                 curr_offset;
15057 	vm_map_t                        curr_map;
15058 	unsigned int                    curr_depth;
15059 	vm_map_offset_t                 curr_max_below, curr_max_above;
15060 	vm_map_offset_t                 curr_skip;
15061 
15062 	/*
15063 	 * "next_" is the same as "curr_" but for the VM region immediately
15064 	 * after the address we're looking for.  We need to keep track of this
15065 	 * too because we want to return info about that region if the
15066 	 * address we're looking for is not mapped.
15067 	 */
15068 	vm_map_entry_t                  next_entry;
15069 	vm_map_offset_t                 next_offset;
15070 	vm_map_offset_t                 next_address;
15071 	vm_map_t                        next_map;
15072 	unsigned int                    next_depth;
15073 	vm_map_offset_t                 next_max_below, next_max_above;
15074 	vm_map_offset_t                 next_skip;
15075 
15076 	boolean_t                       look_for_pages;
15077 	vm_region_submap_short_info_64_t short_info;
15078 	boolean_t                       do_region_footprint;
15079 	int                             effective_page_size, effective_page_shift;
15080 	boolean_t                       submap_needed_copy;
15081 
15082 	if (map == VM_MAP_NULL) {
15083 		/* no address space to work on */
15084 		return KERN_INVALID_ARGUMENT;
15085 	}
15086 
15087 	user_address = vm_sanitize_addr(map, *address_u);
15088 
15089 
15090 	effective_page_shift = vm_self_region_page_shift(map);
15091 	effective_page_size = (1 << effective_page_shift);
15092 
15093 	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
15094 		/*
15095 		 * "info" structure is not big enough and
15096 		 * would overflow
15097 		 */
15098 		return KERN_INVALID_ARGUMENT;
15099 	}
15100 
15101 	do_region_footprint = task_self_region_footprint();
15102 	original_count = *count;
15103 
15104 	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
15105 		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
15106 		look_for_pages = FALSE;
15107 		short_info = (vm_region_submap_short_info_64_t) submap_info;
15108 		submap_info = NULL;
15109 	} else {
15110 		look_for_pages = TRUE;
15111 		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
15112 		short_info = NULL;
15113 
15114 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15115 			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
15116 		}
15117 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15118 			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
15119 		}
15120 	}
15121 
15122 	user_max_depth = *nesting_depth;
15123 	submap_needed_copy = FALSE;
15124 
15125 	if (not_in_kdp) {
15126 		vm_map_lock_read(map);
15127 	}
15128 
15129 recurse_again:
15130 	curr_entry = NULL;
15131 	curr_map = map;
15132 	curr_address = user_address;
15133 	curr_offset = 0;
15134 	curr_skip = 0;
15135 	curr_depth = 0;
15136 	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
15137 	curr_max_below = curr_address;
15138 
15139 	next_entry = NULL;
15140 	next_map = NULL;
15141 	next_address = 0;
15142 	next_offset = 0;
15143 	next_skip = 0;
15144 	next_depth = 0;
15145 	next_max_above = (vm_map_offset_t) -1;
15146 	next_max_below = (vm_map_offset_t) -1;
15147 
15148 	for (;;) {
15149 		if (vm_map_lookup_entry(curr_map,
15150 		    curr_address,
15151 		    &tmp_entry)) {
15152 			/* tmp_entry contains the address we're looking for */
15153 			curr_entry = tmp_entry;
15154 		} else {
15155 			vm_map_offset_t skip;
15156 			/*
15157 			 * The address is not mapped.  "tmp_entry" is the
15158 			 * map entry preceding the address.  We want the next
15159 			 * one, if it exists.
15160 			 */
15161 			curr_entry = tmp_entry->vme_next;
15162 
15163 			if (curr_entry == vm_map_to_entry(curr_map) ||
15164 			    (curr_entry->vme_start >=
15165 			    curr_address + curr_max_above)) {
15166 				/* no next entry at this level: stop looking */
15167 				if (not_in_kdp) {
15168 					vm_map_unlock_read(curr_map);
15169 				}
15170 				curr_entry = NULL;
15171 				curr_map = NULL;
15172 				curr_skip = 0;
15173 				curr_offset = 0;
15174 				curr_depth = 0;
15175 				curr_max_above = 0;
15176 				curr_max_below = 0;
15177 				break;
15178 			}
15179 
15180 			/* adjust current address and offset */
15181 			skip = curr_entry->vme_start - curr_address;
15182 			curr_address = curr_entry->vme_start;
15183 			curr_skip += skip;
15184 			curr_offset += skip;
15185 			curr_max_above -= skip;
15186 			curr_max_below = 0;
15187 		}
15188 
15189 		/*
15190 		 * Is the next entry at this level closer to the address (or
15191 		 * deeper in the submap chain) than the one we had
15192 		 * so far ?
15193 		 */
15194 		tmp_entry = curr_entry->vme_next;
15195 		if (tmp_entry == vm_map_to_entry(curr_map)) {
15196 			/* no next entry at this level */
15197 		} else if (tmp_entry->vme_start >=
15198 		    curr_address + curr_max_above) {
15199 			/*
15200 			 * tmp_entry is beyond the scope of what we mapped of
15201 			 * this submap in the upper level: ignore it.
15202 			 */
15203 		} else if ((next_entry == NULL) ||
15204 		    (tmp_entry->vme_start + curr_offset <=
15205 		    next_entry->vme_start + next_offset)) {
15206 			/*
15207 			 * We didn't have a "next_entry" or this one is
15208 			 * closer to the address we're looking for:
15209 			 * use this "tmp_entry" as the new "next_entry".
15210 			 */
15211 			if (next_entry != NULL) {
15212 				/* unlock the last "next_map" */
15213 				if (next_map != curr_map && not_in_kdp) {
15214 					vm_map_unlock_read(next_map);
15215 				}
15216 			}
15217 			next_entry = tmp_entry;
15218 			next_map = curr_map;
15219 			next_depth = curr_depth;
15220 			next_address = next_entry->vme_start;
15221 			next_skip = curr_skip;
15222 			next_skip += (next_address - curr_address);
15223 			next_offset = curr_offset;
15224 			next_offset += (next_address - curr_address);
15225 			next_max_above = MIN(next_max_above, curr_max_above);
15226 			next_max_above = MIN(next_max_above,
15227 			    next_entry->vme_end - next_address);
15228 			next_max_below = MIN(next_max_below, curr_max_below);
15229 			next_max_below = MIN(next_max_below,
15230 			    next_address - next_entry->vme_start);
15231 		}
15232 
15233 		/*
15234 		 * "curr_max_{above,below}" allow us to keep track of the
15235 		 * portion of the submap that is actually mapped at this level:
15236 		 * the rest of that submap is irrelevant to us, since it's not
15237 		 * mapped here.
15238 		 * The relevant portion of the map starts at
15239 		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
15240 		 */
15241 		curr_max_above = MIN(curr_max_above,
15242 		    curr_entry->vme_end - curr_address);
15243 		curr_max_below = MIN(curr_max_below,
15244 		    curr_address - curr_entry->vme_start);
15245 
15246 		if (!curr_entry->is_sub_map ||
15247 		    curr_depth >= user_max_depth) {
15248 			/*
15249 			 * We hit a leaf map or we reached the maximum depth
15250 			 * we could, so stop looking.  Keep the current map
15251 			 * locked.
15252 			 */
15253 			break;
15254 		}
15255 
15256 		/*
15257 		 * Get down to the next submap level.
15258 		 */
15259 
15260 		if (curr_entry->needs_copy) {
15261 			/* everything below this is effectively copy-on-write */
15262 			submap_needed_copy = TRUE;
15263 		}
15264 
15265 		/*
15266 		 * Lock the next level and unlock the current level,
15267 		 * unless we need to keep it locked to access the "next_entry"
15268 		 * later.
15269 		 */
15270 		curr_entry_submap = VME_SUBMAP(curr_entry);
15271 		curr_entry_start = curr_entry->vme_start;
15272 		curr_entry_offset = VME_OFFSET(curr_entry);
15273 		curr_entry = VM_MAP_ENTRY_NULL; /* no longer valid after unlocking the map */
15274 		if (not_in_kdp) {
15275 			vm_map_lock_read(curr_entry_submap);
15276 		}
15277 		if (curr_map == next_map) {
15278 			/* keep "next_map" locked in case we need it */
15279 		} else {
15280 			/* release this map */
15281 			if (not_in_kdp) {
15282 				vm_map_unlock_read(curr_map);
15283 			}
15284 		}
15285 
15286 		/*
15287 		 * Adjust the offset.  "curr_entry" mapped the submap
15288 		 * at relative address "curr_entry_start" in the
15289 		 * curr_map but skips the first "curr_entry_offset"
15290 		 * bytes of the submap.
15291 		 * "curr_offset" always represents the offset of a virtual
15292 		 * address in the curr_map relative to the absolute address
15293 		 * space (i.e. the top-level VM map).
15294 		 */
15295 		curr_offset += curr_entry_offset - curr_entry_start;
15296 		curr_address = user_address + curr_offset;
15297 		/* switch to the submap */
15298 		curr_map = curr_entry_submap;
15299 		curr_depth++;
15300 	}
15301 
15302 // LP64todo: all the current tools are 32bit, obviously never worked for 64b
15303 // so probably should be a real 32b ID vs. ptr.
15304 // Current users just check for equality
15305 
15306 	if (curr_entry == NULL) {
15307 		/* no VM region contains the address... */
15308 
15309 		if (do_region_footprint && /* we want footprint numbers */
15310 		    next_entry == NULL && /* & there are no more regions */
15311 		    /* & we haven't already provided our fake region: */
15312 		    user_address <= vm_map_last_entry(map)->vme_end) {
15313 			ledger_amount_t ledger_resident, ledger_compressed;
15314 
15315 			/*
15316 			 * Add a fake memory region to account for
15317 			 * purgeable and/or ledger-tagged memory that
15318 			 * counts towards this task's memory footprint,
15319 			 * i.e. the resident/compressed pages of non-volatile
15320 			 * objects owned by that task.
15321 			 */
15322 			task_ledgers_footprint(map->pmap->ledger,
15323 			    &ledger_resident,
15324 			    &ledger_compressed);
15325 			if (ledger_resident + ledger_compressed == 0) {
15326 				/* no purgeable memory usage to report */
15327 				return KERN_INVALID_ADDRESS;
15328 			}
15329 			/* fake region to show nonvolatile footprint */
15330 			if (look_for_pages) {
15331 				submap_info->protection = VM_PROT_DEFAULT;
15332 				submap_info->max_protection = VM_PROT_DEFAULT;
15333 				submap_info->inheritance = VM_INHERIT_DEFAULT;
15334 				submap_info->offset = 0;
15335 				submap_info->user_tag = -1;
15336 				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
15337 				submap_info->pages_shared_now_private = 0;
15338 				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
15339 				submap_info->pages_dirtied = submap_info->pages_resident;
15340 				submap_info->ref_count = 1;
15341 				submap_info->shadow_depth = 0;
15342 				submap_info->external_pager = 0;
15343 				submap_info->share_mode = SM_PRIVATE;
15344 				if (submap_needed_copy) {
15345 					submap_info->share_mode = SM_COW;
15346 				}
15347 				submap_info->is_submap = 0;
15348 				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
15349 				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15350 				submap_info->user_wired_count = 0;
15351 				submap_info->pages_reusable = 0;
15352 			} else {
15353 				short_info->user_tag = -1;
15354 				short_info->offset = 0;
15355 				short_info->protection = VM_PROT_DEFAULT;
15356 				short_info->inheritance = VM_INHERIT_DEFAULT;
15357 				short_info->max_protection = VM_PROT_DEFAULT;
15358 				short_info->behavior = VM_BEHAVIOR_DEFAULT;
15359 				short_info->user_wired_count = 0;
15360 				short_info->is_submap = 0;
15361 				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15362 				short_info->external_pager = 0;
15363 				short_info->shadow_depth = 0;
15364 				short_info->share_mode = SM_PRIVATE;
15365 				if (submap_needed_copy) {
15366 					short_info->share_mode = SM_COW;
15367 				}
15368 				short_info->ref_count = 1;
15369 			}
15370 			*nesting_depth = 0;
15371 			*address_u = vm_sanitize_wrap_addr(vm_map_last_entry(map)->vme_end);
15372 			*size_u    = vm_sanitize_wrap_size(ledger_resident + ledger_compressed);
15373 			return KERN_SUCCESS;
15374 		}
15375 
15376 		if (next_entry == NULL) {
15377 			/* ... and no VM region follows it either */
15378 			return KERN_INVALID_ADDRESS;
15379 		}
15380 		/* ... gather info about the next VM region */
15381 		curr_entry = next_entry;
15382 		curr_map = next_map;    /* still locked ... */
15383 		curr_address = next_address;
15384 		curr_skip = next_skip;
15385 		curr_offset = next_offset;
15386 		curr_depth = next_depth;
15387 		curr_max_above = next_max_above;
15388 		curr_max_below = next_max_below;
15389 	} else {
15390 		/* we won't need "next_entry" after all */
15391 		if (next_entry != NULL) {
15392 			/* release "next_map" */
15393 			if (next_map != curr_map && not_in_kdp) {
15394 				vm_map_unlock_read(next_map);
15395 			}
15396 		}
15397 	}
15398 	next_entry = NULL;
15399 	next_map = NULL;
15400 	next_offset = 0;
15401 	next_skip = 0;
15402 	next_depth = 0;
15403 	next_max_below = -1;
15404 	next_max_above = -1;
15405 
15406 	if (curr_entry->is_sub_map &&
15407 	    curr_depth < user_max_depth) {
15408 		/*
15409 		 * We're not as deep as we could be:  we must have
15410 		 * gone back up after not finding anything mapped
15411 		 * below the original top-level map entry's.
15412 		 * Let's move "curr_address" forward and recurse again.
15413 		 */
15414 		user_address = curr_address;
15415 		goto recurse_again;
15416 	}
15417 
15418 	*nesting_depth = curr_depth;
15419 	*address_u = vm_sanitize_wrap_addr(
15420 		user_address + curr_skip - curr_max_below);
15421 	*size_u    = vm_sanitize_wrap_size(curr_max_above + curr_max_below);
15422 
15423 	if (look_for_pages) {
15424 		submap_info->user_tag = VME_ALIAS(curr_entry);
15425 		submap_info->offset = VME_OFFSET(curr_entry);
15426 		submap_info->protection = curr_entry->protection;
15427 		submap_info->inheritance = curr_entry->inheritance;
15428 		submap_info->max_protection = curr_entry->max_protection;
15429 		submap_info->behavior = curr_entry->behavior;
15430 		submap_info->user_wired_count = curr_entry->user_wired_count;
15431 		submap_info->is_submap = curr_entry->is_sub_map;
15432 		if (curr_entry->is_sub_map) {
15433 			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15434 		} else {
15435 			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15436 		}
15437 	} else {
15438 		short_info->user_tag = VME_ALIAS(curr_entry);
15439 		short_info->offset = VME_OFFSET(curr_entry);
15440 		short_info->protection = curr_entry->protection;
15441 		short_info->inheritance = curr_entry->inheritance;
15442 		short_info->max_protection = curr_entry->max_protection;
15443 		short_info->behavior = curr_entry->behavior;
15444 		short_info->user_wired_count = curr_entry->user_wired_count;
15445 		short_info->is_submap = curr_entry->is_sub_map;
15446 		if (curr_entry->is_sub_map) {
15447 			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15448 		} else {
15449 			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15450 		}
15451 	}
15452 
15453 	extended.pages_resident = 0;
15454 	extended.pages_swapped_out = 0;
15455 	extended.pages_shared_now_private = 0;
15456 	extended.pages_dirtied = 0;
15457 	extended.pages_reusable = 0;
15458 	extended.external_pager = 0;
15459 	extended.shadow_depth = 0;
15460 	extended.share_mode = SM_EMPTY;
15461 	extended.ref_count = 0;
15462 
15463 	if (not_in_kdp) {
15464 		if (!curr_entry->is_sub_map) {
15465 			vm_map_offset_t range_start, range_end;
15466 			range_start = MAX((curr_address - curr_max_below),
15467 			    curr_entry->vme_start);
15468 			range_end = MIN((curr_address + curr_max_above),
15469 			    curr_entry->vme_end);
15470 			vm_map_region_walk(curr_map,
15471 			    range_start,
15472 			    curr_entry,
15473 			    (VME_OFFSET(curr_entry) +
15474 			    (range_start -
15475 			    curr_entry->vme_start)),
15476 			    range_end - range_start,
15477 			    &extended,
15478 			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
15479 			if (submap_needed_copy) {
15480 				extended.share_mode = SM_COW;
15481 			}
15482 		} else {
15483 			if (curr_entry->use_pmap) {
15484 				extended.share_mode = SM_TRUESHARED;
15485 			} else {
15486 				extended.share_mode = SM_PRIVATE;
15487 			}
15488 			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
15489 		}
15490 	}
15491 
15492 	if (look_for_pages) {
15493 		submap_info->pages_resident = extended.pages_resident;
15494 		submap_info->pages_swapped_out = extended.pages_swapped_out;
15495 		submap_info->pages_shared_now_private =
15496 		    extended.pages_shared_now_private;
15497 		submap_info->pages_dirtied = extended.pages_dirtied;
15498 		submap_info->external_pager = extended.external_pager;
15499 		submap_info->shadow_depth = extended.shadow_depth;
15500 		submap_info->share_mode = extended.share_mode;
15501 		submap_info->ref_count = extended.ref_count;
15502 
15503 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15504 			submap_info->pages_reusable = extended.pages_reusable;
15505 		}
15506 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15507 			if (curr_entry->is_sub_map) {
15508 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry));
15509 			} else if (VME_OBJECT(curr_entry)) {
15510 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry));
15511 			} else {
15512 				submap_info->object_id_full = 0ull;
15513 			}
15514 		}
15515 	} else {
15516 		short_info->external_pager = extended.external_pager;
15517 		short_info->shadow_depth = extended.shadow_depth;
15518 		short_info->share_mode = extended.share_mode;
15519 		short_info->ref_count = extended.ref_count;
15520 	}
15521 
15522 	if (not_in_kdp) {
15523 		vm_map_unlock_read(curr_map);
15524 	}
15525 
15526 	return KERN_SUCCESS;
15527 }
15528 
15529 /*
15530  *	vm_region:
15531  *
15532  *	User call to obtain information about a region in
15533  *	a task's address map. Currently, only one flavor is
15534  *	supported.
15535  *
15536  *	XXX The reserved and behavior fields cannot be filled
15537  *	    in until the vm merge from the IK is completed, and
15538  *	    vm_reserve is implemented.
15539  */
15540 
15541 kern_return_t
vm_map_region(vm_map_t map,vm_map_offset_ut * address_u,vm_map_size_ut * size_u,vm_region_flavor_t flavor,vm_region_info_t info,mach_msg_type_number_t * count,mach_port_t * object_name)15542 vm_map_region(
15543 	vm_map_t                map,
15544 	vm_map_offset_ut       *address_u,      /* IN/OUT */
15545 	vm_map_size_ut         *size_u,         /* OUT */
15546 	vm_region_flavor_t      flavor,         /* IN */
15547 	vm_region_info_t        info,           /* OUT */
15548 	mach_msg_type_number_t *count,          /* IN/OUT */
15549 	mach_port_t            *object_name)    /* OUT */
15550 {
15551 	vm_map_entry_t          tmp_entry;
15552 	vm_map_entry_t          entry;
15553 	vm_map_offset_t         start;
15554 
15555 	if (map == VM_MAP_NULL) {
15556 		return KERN_INVALID_ARGUMENT;
15557 	}
15558 
15559 	start = vm_sanitize_addr(map, *address_u);
15560 
15561 
15562 	switch (flavor) {
15563 	case VM_REGION_BASIC_INFO:
15564 		/* legacy for old 32-bit objects info */
15565 	{
15566 		vm_region_basic_info_t  basic;
15567 
15568 		if (*count < VM_REGION_BASIC_INFO_COUNT) {
15569 			return KERN_INVALID_ARGUMENT;
15570 		}
15571 
15572 		basic = (vm_region_basic_info_t) info;
15573 		*count = VM_REGION_BASIC_INFO_COUNT;
15574 
15575 		vm_map_lock_read(map);
15576 
15577 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15578 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15579 				vm_map_unlock_read(map);
15580 				return KERN_INVALID_ADDRESS;
15581 			}
15582 		} else {
15583 			entry = tmp_entry;
15584 		}
15585 
15586 		start = entry->vme_start;
15587 
15588 		basic->offset = (uint32_t)VME_OFFSET(entry);
15589 		basic->protection = entry->protection;
15590 		basic->inheritance = entry->inheritance;
15591 		basic->max_protection = entry->max_protection;
15592 		basic->behavior = entry->behavior;
15593 		basic->user_wired_count = entry->user_wired_count;
15594 		basic->reserved = entry->is_sub_map;
15595 
15596 		*address_u = vm_sanitize_wrap_addr(start);
15597 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15598 
15599 		if (object_name) {
15600 			*object_name = IP_NULL;
15601 		}
15602 		if (entry->is_sub_map) {
15603 			basic->shared = FALSE;
15604 		} else {
15605 			basic->shared = entry->is_shared;
15606 		}
15607 
15608 		vm_map_unlock_read(map);
15609 		return KERN_SUCCESS;
15610 	}
15611 
15612 	case VM_REGION_BASIC_INFO_64:
15613 	{
15614 		vm_region_basic_info_64_t       basic;
15615 
15616 		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15617 			return KERN_INVALID_ARGUMENT;
15618 		}
15619 
15620 		basic = (vm_region_basic_info_64_t) info;
15621 		*count = VM_REGION_BASIC_INFO_COUNT_64;
15622 
15623 		vm_map_lock_read(map);
15624 
15625 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15626 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15627 				vm_map_unlock_read(map);
15628 				return KERN_INVALID_ADDRESS;
15629 			}
15630 		} else {
15631 			entry = tmp_entry;
15632 		}
15633 
15634 		start = entry->vme_start;
15635 
15636 		basic->offset = VME_OFFSET(entry);
15637 		basic->protection = entry->protection;
15638 		basic->inheritance = entry->inheritance;
15639 		basic->max_protection = entry->max_protection;
15640 		basic->behavior = entry->behavior;
15641 		basic->user_wired_count = entry->user_wired_count;
15642 		basic->reserved = entry->is_sub_map;
15643 
15644 		*address_u = vm_sanitize_wrap_addr(start);
15645 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15646 
15647 		if (object_name) {
15648 			*object_name = IP_NULL;
15649 		}
15650 		if (entry->is_sub_map) {
15651 			basic->shared = FALSE;
15652 		} else {
15653 			basic->shared = entry->is_shared;
15654 		}
15655 
15656 		vm_map_unlock_read(map);
15657 		return KERN_SUCCESS;
15658 	}
15659 	case VM_REGION_EXTENDED_INFO:
15660 		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15661 			return KERN_INVALID_ARGUMENT;
15662 		}
15663 		OS_FALLTHROUGH;
15664 	case VM_REGION_EXTENDED_INFO__legacy:
15665 	{
15666 		vm_region_extended_info_t       extended;
15667 		mach_msg_type_number_t original_count;
15668 		int effective_page_size, effective_page_shift;
15669 
15670 		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15671 			return KERN_INVALID_ARGUMENT;
15672 		}
15673 
15674 		extended = (vm_region_extended_info_t) info;
15675 
15676 		effective_page_shift = vm_self_region_page_shift(map);
15677 		effective_page_size = (1 << effective_page_shift);
15678 
15679 		vm_map_lock_read(map);
15680 
15681 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15682 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15683 				vm_map_unlock_read(map);
15684 				return KERN_INVALID_ADDRESS;
15685 			}
15686 		} else {
15687 			entry = tmp_entry;
15688 		}
15689 		start = entry->vme_start;
15690 
15691 		extended->protection = entry->protection;
15692 		extended->user_tag = VME_ALIAS(entry);
15693 		extended->pages_resident = 0;
15694 		extended->pages_swapped_out = 0;
15695 		extended->pages_shared_now_private = 0;
15696 		extended->pages_dirtied = 0;
15697 		extended->external_pager = 0;
15698 		extended->shadow_depth = 0;
15699 
15700 		original_count = *count;
15701 		if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15702 			*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15703 		} else {
15704 			extended->pages_reusable = 0;
15705 			*count = VM_REGION_EXTENDED_INFO_COUNT;
15706 		}
15707 
15708 		vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15709 
15710 		if (object_name) {
15711 			*object_name = IP_NULL;
15712 		}
15713 
15714 		*address_u = vm_sanitize_wrap_addr(start);
15715 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15716 
15717 		vm_map_unlock_read(map);
15718 		return KERN_SUCCESS;
15719 	}
15720 	case VM_REGION_TOP_INFO:
15721 	{
15722 		vm_region_top_info_t    top;
15723 
15724 		if (*count < VM_REGION_TOP_INFO_COUNT) {
15725 			return KERN_INVALID_ARGUMENT;
15726 		}
15727 
15728 		top = (vm_region_top_info_t) info;
15729 		*count = VM_REGION_TOP_INFO_COUNT;
15730 
15731 		vm_map_lock_read(map);
15732 
15733 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15734 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15735 				vm_map_unlock_read(map);
15736 				return KERN_INVALID_ADDRESS;
15737 			}
15738 		} else {
15739 			entry = tmp_entry;
15740 		}
15741 		start = entry->vme_start;
15742 
15743 		top->private_pages_resident = 0;
15744 		top->shared_pages_resident = 0;
15745 
15746 		vm_map_region_top_walk(entry, top);
15747 
15748 		if (object_name) {
15749 			*object_name = IP_NULL;
15750 		}
15751 
15752 		*address_u = vm_sanitize_wrap_addr(start);
15753 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15754 
15755 		vm_map_unlock_read(map);
15756 		return KERN_SUCCESS;
15757 	}
15758 	default:
15759 		return KERN_INVALID_ARGUMENT;
15760 	}
15761 }
15762 
15763 #define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
15764 	MIN((entry_size),                                               \
15765 	    ((obj)->all_reusable ?                                      \
15766 	     (obj)->wired_page_count :                                  \
15767 	     (obj)->resident_page_count - (obj)->reusable_page_count))
15768 
15769 void
vm_map_region_top_walk(vm_map_entry_t entry,vm_region_top_info_t top)15770 vm_map_region_top_walk(
15771 	vm_map_entry_t             entry,
15772 	vm_region_top_info_t       top)
15773 {
15774 	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
15775 		top->share_mode = SM_EMPTY;
15776 		top->ref_count = 0;
15777 		top->obj_id = 0;
15778 		return;
15779 	}
15780 
15781 	{
15782 		struct  vm_object *obj, *tmp_obj;
15783 		int             ref_count;
15784 		uint32_t        entry_size;
15785 
15786 		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
15787 
15788 		obj = VME_OBJECT(entry);
15789 
15790 		vm_object_lock(obj);
15791 
15792 		if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15793 		    obj->paging_in_progress) {
15794 			ref_count--;
15795 		}
15796 
15797 		assert(obj->reusable_page_count <= obj->resident_page_count);
15798 		if (obj->shadow) {
15799 			if (ref_count == 1) {
15800 				top->private_pages_resident =
15801 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15802 			} else {
15803 				top->shared_pages_resident =
15804 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15805 			}
15806 			top->ref_count  = ref_count;
15807 			top->share_mode = SM_COW;
15808 
15809 			while ((tmp_obj = obj->shadow)) {
15810 				vm_object_lock(tmp_obj);
15811 				vm_object_unlock(obj);
15812 				obj = tmp_obj;
15813 
15814 				if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15815 				    obj->paging_in_progress) {
15816 					ref_count--;
15817 				}
15818 
15819 				assert(obj->reusable_page_count <= obj->resident_page_count);
15820 				top->shared_pages_resident +=
15821 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15822 				top->ref_count += ref_count - 1;
15823 			}
15824 		} else {
15825 			if (entry->superpage_size) {
15826 				top->share_mode = SM_LARGE_PAGE;
15827 				top->shared_pages_resident = 0;
15828 				top->private_pages_resident = entry_size;
15829 			} else if (entry->needs_copy) {
15830 				top->share_mode = SM_COW;
15831 				top->shared_pages_resident =
15832 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15833 			} else {
15834 				if (ref_count == 1 ||
15835 				    (ref_count == 2 && obj->named)) {
15836 					top->share_mode = SM_PRIVATE;
15837 					top->private_pages_resident =
15838 					    OBJ_RESIDENT_COUNT(obj,
15839 					    entry_size);
15840 				} else {
15841 					top->share_mode = SM_SHARED;
15842 					top->shared_pages_resident =
15843 					    OBJ_RESIDENT_COUNT(obj,
15844 					    entry_size);
15845 				}
15846 			}
15847 			top->ref_count = ref_count;
15848 		}
15849 
15850 		vm_object_unlock(obj);
15851 
15852 		/* XXX K64: obj_id will be truncated */
15853 		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj);
15854 	}
15855 }
15856 
15857 void
vm_map_region_walk(vm_map_t map,vm_map_offset_t va,vm_map_entry_t entry,vm_object_offset_t offset,vm_object_size_t range,vm_region_extended_info_t extended,boolean_t look_for_pages,mach_msg_type_number_t count)15858 vm_map_region_walk(
15859 	vm_map_t                        map,
15860 	vm_map_offset_t                 va,
15861 	vm_map_entry_t                  entry,
15862 	vm_object_offset_t              offset,
15863 	vm_object_size_t                range,
15864 	vm_region_extended_info_t       extended,
15865 	boolean_t                       look_for_pages,
15866 	mach_msg_type_number_t count)
15867 {
15868 	struct vm_object *obj, *tmp_obj;
15869 	vm_map_offset_t       last_offset;
15870 	int               i;
15871 	int               ref_count;
15872 	struct vm_object        *shadow_object;
15873 	unsigned short          shadow_depth;
15874 	boolean_t         do_region_footprint;
15875 	int                     effective_page_size, effective_page_shift;
15876 	vm_map_offset_t         effective_page_mask;
15877 
15878 	do_region_footprint = task_self_region_footprint();
15879 
15880 	if ((entry->is_sub_map) ||
15881 	    (VME_OBJECT(entry) == 0) ||
15882 	    (VME_OBJECT(entry)->phys_contiguous &&
15883 	    !entry->superpage_size)) {
15884 		extended->share_mode = SM_EMPTY;
15885 		extended->ref_count = 0;
15886 		return;
15887 	}
15888 
15889 	if (entry->superpage_size) {
15890 		extended->shadow_depth = 0;
15891 		extended->share_mode = SM_LARGE_PAGE;
15892 		extended->ref_count = 1;
15893 		extended->external_pager = 0;
15894 
15895 		/* TODO4K: Superpage in 4k mode? */
15896 		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15897 		extended->shadow_depth = 0;
15898 		return;
15899 	}
15900 
15901 	effective_page_shift = vm_self_region_page_shift(map);
15902 	effective_page_size = (1 << effective_page_shift);
15903 	effective_page_mask = effective_page_size - 1;
15904 
15905 	offset = vm_map_trunc_page(offset, effective_page_mask);
15906 
15907 	obj = VME_OBJECT(entry);
15908 
15909 	vm_object_lock(obj);
15910 
15911 	if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15912 	    obj->paging_in_progress) {
15913 		ref_count--;
15914 	}
15915 
15916 	if (look_for_pages) {
15917 		for (last_offset = offset + range;
15918 		    offset < last_offset;
15919 		    offset += effective_page_size, va += effective_page_size) {
15920 			if (do_region_footprint) {
15921 				int disp;
15922 
15923 				disp = 0;
15924 				if (map->has_corpse_footprint) {
15925 					/*
15926 					 * Query the page info data we saved
15927 					 * while forking the corpse.
15928 					 */
15929 					vm_map_corpse_footprint_query_page_info(
15930 						map,
15931 						va,
15932 						&disp);
15933 				} else {
15934 					/*
15935 					 * Query the pmap.
15936 					 */
15937 					vm_map_footprint_query_page_info(
15938 						map,
15939 						entry,
15940 						va,
15941 						&disp);
15942 				}
15943 				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15944 					extended->pages_resident++;
15945 				}
15946 				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15947 					extended->pages_reusable++;
15948 				}
15949 				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15950 					extended->pages_dirtied++;
15951 				}
15952 				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15953 					extended->pages_swapped_out++;
15954 				}
15955 				continue;
15956 			}
15957 
15958 			vm_map_region_look_for_page(map, va, obj,
15959 			    vm_object_trunc_page(offset), ref_count,
15960 			    0, extended, count);
15961 		}
15962 
15963 		if (do_region_footprint) {
15964 			goto collect_object_info;
15965 		}
15966 	} else {
15967 collect_object_info:
15968 		shadow_object = obj->shadow;
15969 		shadow_depth = 0;
15970 
15971 		if (!(obj->internal)) {
15972 			extended->external_pager = 1;
15973 		}
15974 
15975 		if (shadow_object != VM_OBJECT_NULL) {
15976 			vm_object_lock(shadow_object);
15977 			for (;
15978 			    shadow_object != VM_OBJECT_NULL;
15979 			    shadow_depth++) {
15980 				vm_object_t     next_shadow;
15981 
15982 				if (!(shadow_object->internal)) {
15983 					extended->external_pager = 1;
15984 				}
15985 
15986 				next_shadow = shadow_object->shadow;
15987 				if (next_shadow) {
15988 					vm_object_lock(next_shadow);
15989 				}
15990 				vm_object_unlock(shadow_object);
15991 				shadow_object = next_shadow;
15992 			}
15993 		}
15994 		extended->shadow_depth = shadow_depth;
15995 	}
15996 
15997 	if (extended->shadow_depth || entry->needs_copy) {
15998 		extended->share_mode = SM_COW;
15999 	} else {
16000 		if (ref_count == 1) {
16001 			extended->share_mode = SM_PRIVATE;
16002 		} else {
16003 			if (obj->true_share) {
16004 				extended->share_mode = SM_TRUESHARED;
16005 			} else {
16006 				extended->share_mode = SM_SHARED;
16007 			}
16008 		}
16009 	}
16010 	extended->ref_count = ref_count - extended->shadow_depth;
16011 
16012 	for (i = 0; i < extended->shadow_depth; i++) {
16013 		if ((tmp_obj = obj->shadow) == 0) {
16014 			break;
16015 		}
16016 		vm_object_lock(tmp_obj);
16017 		vm_object_unlock(obj);
16018 
16019 		if ((ref_count = os_ref_get_count_raw(&tmp_obj->ref_count)) > 1 &&
16020 		    tmp_obj->paging_in_progress) {
16021 			ref_count--;
16022 		}
16023 
16024 		extended->ref_count += ref_count;
16025 		obj = tmp_obj;
16026 	}
16027 	vm_object_unlock(obj);
16028 
16029 	if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
16030 		extended->share_mode = SM_PRIVATE;
16031 	} else if (extended->share_mode == SM_SHARED && !(task_self_region_info_flags() & VM_REGION_INFO_FLAGS_NO_ALIASED)) {
16032 		vm_map_entry_t       cur;
16033 		vm_map_entry_t       last;
16034 		int      my_refs;
16035 
16036 		obj = VME_OBJECT(entry);
16037 		last = vm_map_to_entry(map);
16038 		my_refs = 0;
16039 
16040 		if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
16041 		    obj->paging_in_progress) {
16042 			ref_count--;
16043 		}
16044 		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
16045 			if (vm_map_region_has_obj_ref(cur, obj)) {
16046 				my_refs++;
16047 			}
16048 		}
16049 
16050 		if (my_refs == ref_count) {
16051 			extended->share_mode = SM_PRIVATE_ALIASED;
16052 		} else if (my_refs > 1) {
16053 			extended->share_mode = SM_SHARED_ALIASED;
16054 		}
16055 	}
16056 }
16057 
16058 
16059 /* object is locked on entry and locked on return */
16060 
16061 
16062 static void
vm_map_region_look_for_page(__unused vm_map_t map,__unused vm_map_offset_t va,vm_object_t object,vm_object_offset_t offset,int max_refcnt,unsigned short depth,vm_region_extended_info_t extended,mach_msg_type_number_t count)16063 vm_map_region_look_for_page(
16064 	__unused vm_map_t               map,
16065 	__unused vm_map_offset_t        va,
16066 	vm_object_t                     object,
16067 	vm_object_offset_t              offset,
16068 	int                             max_refcnt,
16069 	unsigned short                  depth,
16070 	vm_region_extended_info_t       extended,
16071 	mach_msg_type_number_t count)
16072 {
16073 	vm_page_t       p;
16074 	vm_object_t     shadow;
16075 	int             ref_count;
16076 	vm_object_t     caller_object;
16077 
16078 	shadow = object->shadow;
16079 	caller_object = object;
16080 
16081 
16082 	while (TRUE) {
16083 		if (!(object->internal)) {
16084 			extended->external_pager = 1;
16085 		}
16086 
16087 		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
16088 			if (shadow && (max_refcnt == 1)) {
16089 				extended->pages_shared_now_private++;
16090 			}
16091 
16092 			if (!vm_page_is_fictitious(p) &&
16093 			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
16094 				extended->pages_dirtied++;
16095 			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
16096 				if (p->vmp_reusable || object->all_reusable) {
16097 					extended->pages_reusable++;
16098 				}
16099 			}
16100 
16101 			extended->pages_resident++;
16102 
16103 			if (object != caller_object) {
16104 				vm_object_unlock(object);
16105 			}
16106 
16107 			return;
16108 		}
16109 		if (object->internal &&
16110 		    object->alive &&
16111 		    !object->terminating &&
16112 		    object->pager_ready) {
16113 			if (vm_object_compressor_pager_state_get(object, offset)
16114 			    == VM_EXTERNAL_STATE_EXISTS) {
16115 				/* the pager has that page */
16116 				extended->pages_swapped_out++;
16117 				if (object != caller_object) {
16118 					vm_object_unlock(object);
16119 				}
16120 				return;
16121 			}
16122 		}
16123 
16124 		if (shadow) {
16125 			vm_object_lock(shadow);
16126 			if ((ref_count = os_ref_get_count_raw(&shadow->ref_count)) > 1 &&
16127 			    shadow->paging_in_progress) {
16128 				ref_count--;
16129 			}
16130 
16131 			if (++depth > extended->shadow_depth) {
16132 				extended->shadow_depth = depth;
16133 			}
16134 
16135 			if (ref_count > max_refcnt) {
16136 				max_refcnt = ref_count;
16137 			}
16138 
16139 			if (object != caller_object) {
16140 				vm_object_unlock(object);
16141 			}
16142 
16143 			offset = offset + object->vo_shadow_offset;
16144 			object = shadow;
16145 			shadow = object->shadow;
16146 			continue;
16147 		}
16148 		if (object != caller_object) {
16149 			vm_object_unlock(object);
16150 		}
16151 		break;
16152 	}
16153 }
16154 
16155 static inline boolean_t
vm_map_region_has_obj_ref(vm_map_entry_t entry,vm_object_t object)16156 vm_map_region_has_obj_ref(
16157 	vm_map_entry_t    entry,
16158 	vm_object_t       object)
16159 {
16160 	vm_object_t cur_obj;
16161 	vm_object_t shadow_obj;
16162 
16163 	if (entry->is_sub_map) {
16164 		return FALSE;
16165 	}
16166 
16167 	cur_obj = VME_OBJECT(entry);
16168 	if (cur_obj == VM_OBJECT_NULL) {
16169 		return FALSE;
16170 	} else if (cur_obj == object) {
16171 		return TRUE;
16172 	}
16173 
16174 	/*
16175 	 * Avoid locks for first shadow check, otherwise diagnostic tools will
16176 	 * spend most of their time obtaining locks in this function when analyzing
16177 	 * processes with many VM entries which may commonly have no shadow chain.
16178 	 *
16179 	 * This is acceptable because:
16180 	 *  - Shadow's fields are not accessed outside of its lock
16181 	 *  - Objects are unlikely to be modified due to:
16182 	 *	  - Many diagnostic tools suspend the task
16183 	 *	  - VM map is locked
16184 	 *	- The rare incorrect return from this function turns a guess into a
16185 	 *	  slightly worse guess
16186 	 *	- Entire shadow chain is not locked as a whole, so can still change
16187 	 *	  while traversing, resulting in incorrect guess even with locking
16188 	 */
16189 	shadow_obj = cur_obj->shadow;
16190 	if (shadow_obj == VM_OBJECT_NULL) {
16191 		return FALSE;
16192 	} else if (shadow_obj == object) {
16193 		return TRUE;
16194 	}
16195 
16196 	vm_object_lock(cur_obj);
16197 
16198 	while ((shadow_obj = cur_obj->shadow)) {
16199 		/* check if object was found before grabbing a lock */
16200 		if (shadow_obj == object) {
16201 			vm_object_unlock(cur_obj);
16202 			return TRUE;
16203 		}
16204 
16205 		vm_object_lock(shadow_obj);
16206 		vm_object_unlock(cur_obj);
16207 		cur_obj = shadow_obj;
16208 	}
16209 
16210 	/* exhausted the shadow chain */
16211 	vm_object_unlock(cur_obj);
16212 	return FALSE;
16213 }
16214 
16215 
16216 /*
16217  *	Routine:	vm_map_simplify
16218  *
16219  *	Description:
16220  *		Attempt to simplify the map representation in
16221  *		the vicinity of the given starting address.
16222  *	Note:
16223  *		This routine is intended primarily to keep the
16224  *		kernel maps more compact -- they generally don't
16225  *		benefit from the "expand a map entry" technology
16226  *		at allocation time because the adjacent entry
16227  *		is often wired down.
16228  */
16229 void
vm_map_simplify_entry(vm_map_t map,vm_map_entry_t this_entry)16230 vm_map_simplify_entry(
16231 	vm_map_t        map,
16232 	vm_map_entry_t  this_entry)
16233 {
16234 	vm_map_entry_t  prev_entry;
16235 
16236 	prev_entry = this_entry->vme_prev;
16237 
16238 	if ((this_entry != vm_map_to_entry(map)) &&
16239 	    (prev_entry != vm_map_to_entry(map)) &&
16240 
16241 	    (prev_entry->vme_end == this_entry->vme_start) &&
16242 
16243 	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
16244 	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
16245 	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
16246 	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
16247 	    prev_entry->vme_start))
16248 	    == VME_OFFSET(this_entry)) &&
16249 
16250 	    (prev_entry->behavior == this_entry->behavior) &&
16251 	    (prev_entry->needs_copy == this_entry->needs_copy) &&
16252 	    (prev_entry->protection == this_entry->protection) &&
16253 	    (prev_entry->max_protection == this_entry->max_protection) &&
16254 	    (prev_entry->inheritance == this_entry->inheritance) &&
16255 	    (prev_entry->use_pmap == this_entry->use_pmap) &&
16256 	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
16257 	    (prev_entry->no_cache == this_entry->no_cache) &&
16258 	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
16259 	    (prev_entry->map_aligned == this_entry->map_aligned) &&
16260 	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
16261 	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
16262 #if __arm64e__
16263 	    (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
16264 #endif
16265 	    (prev_entry->csm_associated == this_entry->csm_associated) &&
16266 	    (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
16267 	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
16268 	    (prev_entry->vme_resilient_codesign ==
16269 	    this_entry->vme_resilient_codesign) &&
16270 	    (prev_entry->vme_resilient_media ==
16271 	    this_entry->vme_resilient_media) &&
16272 	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
16273 	    (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&
16274 
16275 	    (prev_entry->wired_count == this_entry->wired_count) &&
16276 	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&
16277 
16278 	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
16279 	    (prev_entry->in_transition == FALSE) &&
16280 	    (this_entry->in_transition == FALSE) &&
16281 	    (prev_entry->needs_wakeup == FALSE) &&
16282 	    (this_entry->needs_wakeup == FALSE) &&
16283 	    (prev_entry->is_shared == this_entry->is_shared) &&
16284 	    (prev_entry->superpage_size == FALSE) &&
16285 	    (this_entry->superpage_size == FALSE)
16286 	    ) {
16287 		if (prev_entry->vme_permanent) {
16288 			assert(this_entry->vme_permanent);
16289 			prev_entry->vme_permanent = false;
16290 		}
16291 		vm_map_store_entry_unlink(map, prev_entry, true);
16292 		assert(prev_entry->vme_start < this_entry->vme_end);
16293 		if (prev_entry->map_aligned) {
16294 			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
16295 			    VM_MAP_PAGE_MASK(map)));
16296 		}
16297 		this_entry->vme_start = prev_entry->vme_start;
16298 		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));
16299 
16300 		if (map->holelistenabled) {
16301 			vm_map_store_update_first_free(map, this_entry, TRUE);
16302 		}
16303 
16304 		if (prev_entry->is_sub_map) {
16305 			vm_map_deallocate(VME_SUBMAP(prev_entry));
16306 		} else {
16307 			vm_object_deallocate(VME_OBJECT(prev_entry));
16308 		}
16309 		vm_map_entry_dispose(prev_entry);
16310 		SAVE_HINT_MAP_WRITE(map, this_entry);
16311 	}
16312 }
16313 
16314 void
vm_map_simplify(vm_map_t map,vm_map_offset_t start)16315 vm_map_simplify(
16316 	vm_map_t        map,
16317 	vm_map_offset_t start)
16318 {
16319 	vm_map_entry_t  this_entry;
16320 
16321 	vm_map_lock(map);
16322 	if (vm_map_lookup_entry(map, start, &this_entry)) {
16323 		vm_map_simplify_entry(map, this_entry);
16324 		vm_map_simplify_entry(map, this_entry->vme_next);
16325 	}
16326 	vm_map_unlock(map);
16327 }
16328 
16329 static void
vm_map_simplify_range(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16330 vm_map_simplify_range(
16331 	vm_map_t        map,
16332 	vm_map_offset_t start,
16333 	vm_map_offset_t end)
16334 {
16335 	vm_map_entry_t  entry;
16336 
16337 	/*
16338 	 * The map should be locked (for "write") by the caller.
16339 	 */
16340 
16341 	if (start >= end) {
16342 		/* invalid address range */
16343 		return;
16344 	}
16345 
16346 	start = vm_map_trunc_page(start,
16347 	    VM_MAP_PAGE_MASK(map));
16348 	end = vm_map_round_page(end,
16349 	    VM_MAP_PAGE_MASK(map));
16350 
16351 	if (!vm_map_lookup_entry(map, start, &entry)) {
16352 		/* "start" is not mapped and "entry" ends before "start" */
16353 		if (entry == vm_map_to_entry(map)) {
16354 			/* start with first entry in the map */
16355 			entry = vm_map_first_entry(map);
16356 		} else {
16357 			/* start with next entry */
16358 			entry = entry->vme_next;
16359 		}
16360 	}
16361 
16362 	while (entry != vm_map_to_entry(map) &&
16363 	    entry->vme_start <= end) {
16364 		/* try and coalesce "entry" with its previous entry */
16365 		vm_map_simplify_entry(map, entry);
16366 		entry = entry->vme_next;
16367 	}
16368 }
16369 
16370 static __attribute__((always_inline, warn_unused_result))
16371 kern_return_t
vm_map_machine_attribute_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,mach_vm_offset_t * start,mach_vm_offset_t * end,vm_map_size_t * size)16372 vm_map_machine_attribute_sanitize(
16373 	vm_map_t                map,
16374 	vm_map_offset_ut        start_u,
16375 	vm_map_offset_ut        end_u,
16376 	mach_vm_offset_t       *start,
16377 	mach_vm_offset_t       *end,
16378 	vm_map_size_t          *size)
16379 {
16380 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS;
16381 
16382 
16383 	return vm_sanitize_addr_end(start_u, end_u,
16384 	           VM_SANITIZE_CALLER_VM_MAP_MACHINE_ATTRIBUTE, map,
16385 	           flags, start, end, size);
16386 }
16387 
16388 
16389 /*
16390  *	Routine:	vm_map_machine_attribute
16391  *	Purpose:
16392  *		Provide machine-specific attributes to mappings,
16393  *		such as cachability etc. for machines that provide
16394  *		them.  NUMA architectures and machines with big/strange
16395  *		caches will use this.
16396  *	Note:
16397  *		Responsibilities for locking and checking are handled here,
16398  *		everything else in the pmap module. If any non-volatile
16399  *		information must be kept, the pmap module should handle
16400  *		it itself. [This assumes that attributes do not
16401  *		need to be inherited, which seems ok to me]
16402  */
16403 kern_return_t
vm_map_machine_attribute(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_machine_attribute_t attribute,vm_machine_attribute_val_t * value)16404 vm_map_machine_attribute(
16405 	vm_map_t                map,
16406 	vm_map_offset_ut        start_u,
16407 	vm_map_offset_ut        end_u,
16408 	vm_machine_attribute_t  attribute,
16409 	vm_machine_attribute_val_t *value) /* IN/OUT */
16410 {
16411 	mach_vm_offset_t start, end;
16412 	vm_map_size_t    sync_size;
16413 	kern_return_t    ret;
16414 	vm_map_entry_t   entry;
16415 
16416 	ret = vm_map_machine_attribute_sanitize(map,
16417 	    start_u,
16418 	    end_u,
16419 	    &start,
16420 	    &end,
16421 	    &sync_size);
16422 	if (__improbable(ret != KERN_SUCCESS)) {
16423 		return vm_sanitize_get_kr(ret);
16424 	}
16425 
16426 	if (start < vm_map_min(map) || end > vm_map_max(map)) {
16427 		return KERN_INVALID_ADDRESS;
16428 	}
16429 
16430 	vm_map_lock(map);
16431 
16432 	if (attribute != MATTR_CACHE) {
16433 		/* If we don't have to find physical addresses, we */
16434 		/* don't have to do an explicit traversal here.    */
16435 		ret = pmap_attribute(map->pmap, start, end - start,
16436 		    attribute, value);
16437 		vm_map_unlock(map);
16438 		return ret;
16439 	}
16440 
16441 	ret = KERN_SUCCESS;                                                                             /* Assume it all worked */
16442 
16443 	while (sync_size) {
16444 		if (vm_map_lookup_entry(map, start, &entry)) {
16445 			vm_map_size_t   sub_size;
16446 			if ((entry->vme_end - start) > sync_size) {
16447 				sub_size = sync_size;
16448 				sync_size = 0;
16449 			} else {
16450 				sub_size = entry->vme_end - start;
16451 				sync_size -= sub_size;
16452 			}
16453 			if (entry->is_sub_map) {
16454 				vm_map_offset_t sub_start;
16455 				vm_map_offset_t sub_end;
16456 
16457 				sub_start = (start - entry->vme_start)
16458 				    + VME_OFFSET(entry);
16459 				sub_end = sub_start + sub_size;
16460 				vm_map_machine_attribute(
16461 					VME_SUBMAP(entry),
16462 					sub_start,
16463 					sub_end,
16464 					attribute, value);
16465 			} else if (VME_OBJECT(entry)) {
16466 				vm_page_t               m;
16467 				vm_object_t             object;
16468 				vm_object_t             base_object;
16469 				vm_object_t             last_object;
16470 				vm_object_offset_t      offset;
16471 				vm_object_offset_t      base_offset;
16472 				vm_map_size_t           range;
16473 				range = sub_size;
16474 				offset = (start - entry->vme_start)
16475 				    + VME_OFFSET(entry);
16476 				offset = vm_object_trunc_page(offset);
16477 				base_offset = offset;
16478 				object = VME_OBJECT(entry);
16479 				base_object = object;
16480 				last_object = NULL;
16481 
16482 				vm_object_lock(object);
16483 
16484 				while (range) {
16485 					m = vm_page_lookup(
16486 						object, offset);
16487 
16488 					if (m && !vm_page_is_fictitious(m)) {
16489 						ret =
16490 						    pmap_attribute_cache_sync(
16491 							VM_PAGE_GET_PHYS_PAGE(m),
16492 							PAGE_SIZE,
16493 							attribute, value);
16494 					} else if (object->shadow) {
16495 						offset = offset + object->vo_shadow_offset;
16496 						last_object = object;
16497 						object = object->shadow;
16498 						vm_object_lock(last_object->shadow);
16499 						vm_object_unlock(last_object);
16500 						continue;
16501 					}
16502 					if (range < PAGE_SIZE) {
16503 						range = 0;
16504 					} else {
16505 						range -= PAGE_SIZE;
16506 					}
16507 
16508 					if (base_object != object) {
16509 						vm_object_unlock(object);
16510 						vm_object_lock(base_object);
16511 						object = base_object;
16512 					}
16513 					/* Bump to the next page */
16514 					base_offset += PAGE_SIZE;
16515 					offset = base_offset;
16516 				}
16517 				vm_object_unlock(object);
16518 			}
16519 			start += sub_size;
16520 		} else {
16521 			vm_map_unlock(map);
16522 			return KERN_FAILURE;
16523 		}
16524 	}
16525 
16526 	vm_map_unlock(map);
16527 
16528 	return ret;
16529 }
16530 
16531 /*
16532  *	vm_map_behavior_set:
16533  *
16534  *	Sets the paging reference behavior of the specified address
16535  *	range in the target map.  Paging reference behavior affects
16536  *	how pagein operations resulting from faults on the map will be
16537  *	clustered.
16538  */
16539 kern_return_t
vm_map_behavior_set(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_behavior_t new_behavior)16540 vm_map_behavior_set(
16541 	vm_map_t        map,
16542 	vm_map_offset_t start,
16543 	vm_map_offset_t end,
16544 	vm_behavior_t   new_behavior)
16545 {
16546 	vm_map_entry_t  entry;
16547 	vm_map_entry_t  temp_entry;
16548 
16549 	if (start > end ||
16550 	    start < vm_map_min(map) ||
16551 	    end > vm_map_max(map)) {
16552 		return KERN_NO_SPACE;
16553 	}
16554 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16555 		return KERN_INVALID_ADDRESS;
16556 	}
16557 
16558 	switch (new_behavior) {
16559 	/*
16560 	 * This first block of behaviors all set a persistent state on the specified
16561 	 * memory range.  All we have to do here is to record the desired behavior
16562 	 * in the vm_map_entry_t's.
16563 	 */
16564 
16565 	case VM_BEHAVIOR_DEFAULT:
16566 	case VM_BEHAVIOR_RANDOM:
16567 	case VM_BEHAVIOR_SEQUENTIAL:
16568 	case VM_BEHAVIOR_RSEQNTL:
16569 	case VM_BEHAVIOR_ZERO_WIRED_PAGES:
16570 		vm_map_lock(map);
16571 
16572 		/*
16573 		 *	The entire address range must be valid for the map.
16574 		 *      Note that vm_map_range_check() does a
16575 		 *	vm_map_lookup_entry() internally and returns the
16576 		 *	entry containing the start of the address range if
16577 		 *	the entire range is valid.
16578 		 */
16579 		if (vm_map_range_check(map, start, end, &temp_entry)) {
16580 			entry = temp_entry;
16581 			vm_map_clip_start(map, entry, start);
16582 		} else {
16583 			vm_map_unlock(map);
16584 			return KERN_INVALID_ADDRESS;
16585 		}
16586 
16587 		if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16588 			/* zeroing requires write access */
16589 			temp_entry = entry;
16590 			for (;
16591 			    entry != vm_map_to_entry(map) && (entry->vme_start < end);
16592 			    entry = entry->vme_next) {
16593 				if (!(entry->protection & VM_PROT_WRITE) ||
16594 #if __arm64e__
16595 				    entry->used_for_tpro ||
16596 #endif /* __arm64e__ */
16597 				    entry->used_for_jit) {
16598 					vm_map_unlock(map);
16599 					return KERN_PROTECTION_FAILURE;
16600 				}
16601 			}
16602 			entry = temp_entry;
16603 		}
16604 
16605 		while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
16606 			vm_map_clip_end(map, entry, end);
16607 			if (entry->is_sub_map) {
16608 				assert(!entry->use_pmap);
16609 			}
16610 
16611 			if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16612 				assert(entry->protection & VM_PROT_WRITE);
16613 #if __arm64e__
16614 				assert(!entry->used_for_tpro);
16615 #endif /* __arm64e__ */
16616 				assert(!entry->used_for_jit);
16617 				entry->zero_wired_pages = TRUE;
16618 			} else {
16619 				entry->behavior = new_behavior;
16620 			}
16621 			entry = entry->vme_next;
16622 		}
16623 
16624 		vm_map_unlock(map);
16625 		break;
16626 
16627 	/*
16628 	 * The rest of these are different from the above in that they cause
16629 	 * an immediate action to take place as opposed to setting a behavior that
16630 	 * affects future actions.
16631 	 */
16632 
16633 	case VM_BEHAVIOR_WILLNEED:
16634 		return vm_map_willneed(map, start, end);
16635 
16636 	case VM_BEHAVIOR_DONTNEED:
16637 		return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16638 
16639 	case VM_BEHAVIOR_FREE:
16640 		return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16641 
16642 	case VM_BEHAVIOR_REUSABLE:
16643 		return vm_map_reusable_pages(map, start, end);
16644 
16645 	case VM_BEHAVIOR_REUSE:
16646 		return vm_map_reuse_pages(map, start, end);
16647 
16648 	case VM_BEHAVIOR_CAN_REUSE:
16649 		return vm_map_can_reuse(map, start, end);
16650 
16651 #if MACH_ASSERT
16652 	case VM_BEHAVIOR_PAGEOUT:
16653 		return vm_map_pageout(map, start, end);
16654 #endif /* MACH_ASSERT */
16655 
16656 	case VM_BEHAVIOR_ZERO:
16657 		return vm_map_zero(map, start, end);
16658 
16659 	default:
16660 		return KERN_INVALID_ARGUMENT;
16661 	}
16662 
16663 	return KERN_SUCCESS;
16664 }
16665 
16666 
16667 /*
16668  * Internals for madvise(MADV_WILLNEED) system call.
16669  *
16670  * The implementation is to do:-
16671  * a) read-ahead if the mapping corresponds to a mapped regular file
16672  * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping
16673  */
16674 static kern_return_t
vm_map_willneed(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16675 vm_map_willneed(
16676 	vm_map_t        map,
16677 	vm_map_offset_t start,
16678 	vm_map_offset_t end
16679 	)
16680 {
16681 	vm_map_entry_t entry;
16682 	kern_return_t kr;
16683 	vm_object_size_t len;
16684 	vm_size_t region_size;
16685 
16686 	KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_START,
16687 	    start, end);
16688 	struct vm_object_fault_info fault_info = {
16689 		.interruptible = THREAD_UNINT,
16690 		.behavior = VM_BEHAVIOR_SEQUENTIAL,
16691 		/* Do not activate pages after faulting */
16692 		.stealth = true,
16693 		/* Don't wait for busy pages */
16694 		.fi_no_sleep = true,
16695 	};
16696 
16697 	/*
16698 	 * The MADV_WILLNEED operation doesn't require any changes to the
16699 	 * vm_map_entry_t's, so the read lock is sufficient.
16700 	 */
16701 
16702 	vm_map_lock_read(map);
16703 
16704 	/*
16705 	 * The madvise semantics require that the address range be fully
16706 	 * allocated with no holes.  Otherwise, we're required to return
16707 	 * an error.
16708 	 */
16709 
16710 	if (!vm_map_range_check(map, start, end, &entry)) {
16711 		vm_map_unlock_read(map);
16712 		kr = KERN_INVALID_ADDRESS;
16713 		goto done;
16714 	}
16715 
16716 	/*
16717 	 * Examine each vm_map_entry_t in the range.
16718 	 */
16719 	while (start < end) {
16720 		/*
16721 		 * Set the length so we don't go beyond the end of the
16722 		 * map_entry or beyond the end of the range we were given.
16723 		 * This range could span also multiple map entries all of which
16724 		 * map different files, so make sure we only do the right amount
16725 		 * of I/O for each object.  Note that it's possible for there
16726 		 * to be multiple map entries all referring to the same object
16727 		 * but with different page permissions, but it's not worth
16728 		 * trying to optimize that case.
16729 		 */
16730 		len = MIN(entry->vme_end - start, end - start);
16731 
16732 		vm_map_offset_t addr = start;
16733 
16734 		vm_size_t effective_page_mask = MIN(vm_map_page_mask(map), PAGE_MASK);
16735 		vm_map_offset_t effective_page_size = effective_page_mask + 1;
16736 
16737 		/*
16738 		 * Write-fault if the entry supports it to preclude subsequent soft-faults
16739 		 */
16740 		vm_prot_t fault_prot = entry->protection & VM_PROT_WRITE ?
16741 		    VM_PROT_WRITE : VM_PROT_READ;
16742 
16743 		vm_map_unlock_read(map);
16744 
16745 		region_size = len;
16746 		while (region_size) {
16747 			/*
16748 			 * Provide a hint for how much clustering we would like. Note that
16749 			 * each individual fault will limit the size of each request to
16750 			 * MAX_UPL_TRANSFER_BYTES.
16751 			 */
16752 			fault_info.cluster_size = region_size;
16753 			kr = vm_pre_fault_with_info(
16754 				map,
16755 				vm_map_trunc_page(addr, effective_page_mask),
16756 				fault_prot,
16757 				&fault_info);
16758 			if (kr == KERN_ALREADY_WAITING) {
16759 				/*
16760 				 * The page is busy being faulted/paged by another thread.
16761 				 */
16762 				KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_NONE,
16763 				    task_pid(current_task()), addr, kr);
16764 				kr = KERN_SUCCESS;
16765 			} else if (kr != KERN_SUCCESS) {
16766 				goto done;
16767 			}
16768 			region_size -= effective_page_size;
16769 			addr += effective_page_size;
16770 		}
16771 
16772 		start += len;
16773 		if (start >= end) {
16774 			kr = KERN_SUCCESS;
16775 			goto done;
16776 		}
16777 
16778 		if (thread_should_abort(current_thread())) {
16779 			kr = KERN_ABORTED;
16780 			goto done;
16781 		}
16782 
16783 		/* look up next entry */
16784 		vm_map_lock_read(map);
16785 		if (!vm_map_lookup_entry(map, start, &entry)) {
16786 			/*
16787 			 * There's a new hole in the address range.
16788 			 */
16789 			vm_map_unlock_read(map);
16790 			kr = KERN_INVALID_ADDRESS;
16791 			goto done;
16792 		}
16793 	}
16794 
16795 	vm_map_unlock_read(map);
16796 done:
16797 	KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16798 	    start, kr);
16799 	return kr;
16800 }
16801 
16802 static boolean_t
vm_map_entry_is_reusable(vm_map_entry_t entry)16803 vm_map_entry_is_reusable(
16804 	vm_map_entry_t entry)
16805 {
16806 	/* Only user map entries */
16807 
16808 	vm_object_t object;
16809 
16810 	if (entry->is_sub_map) {
16811 		return FALSE;
16812 	}
16813 
16814 	switch (VME_ALIAS(entry)) {
16815 	case VM_MEMORY_MALLOC:
16816 	case VM_MEMORY_MALLOC_SMALL:
16817 	case VM_MEMORY_MALLOC_LARGE:
16818 	case VM_MEMORY_REALLOC:
16819 	case VM_MEMORY_MALLOC_TINY:
16820 	case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16821 	case VM_MEMORY_MALLOC_LARGE_REUSED:
16822 		/*
16823 		 * This is a malloc() memory region: check if it's still
16824 		 * in its original state and can be re-used for more
16825 		 * malloc() allocations.
16826 		 */
16827 		break;
16828 	default:
16829 		/*
16830 		 * Not a malloc() memory region: let the caller decide if
16831 		 * it's re-usable.
16832 		 */
16833 		return TRUE;
16834 	}
16835 
16836 	if (/*entry->is_shared ||*/
16837 		entry->is_sub_map ||
16838 		entry->in_transition ||
16839 		entry->protection != VM_PROT_DEFAULT ||
16840 		entry->max_protection != VM_PROT_ALL ||
16841 		entry->inheritance != VM_INHERIT_DEFAULT ||
16842 		entry->no_cache ||
16843 		entry->vme_permanent ||
16844 		entry->superpage_size != FALSE ||
16845 		entry->zero_wired_pages ||
16846 		entry->wired_count != 0 ||
16847 		entry->user_wired_count != 0) {
16848 		return FALSE;
16849 	}
16850 
16851 	object = VME_OBJECT(entry);
16852 	if (object == VM_OBJECT_NULL) {
16853 		return TRUE;
16854 	}
16855 	if (
16856 #if 0
16857 		/*
16858 		 * Let's proceed even if the VM object is potentially
16859 		 * shared.
16860 		 * We check for this later when processing the actual
16861 		 * VM pages, so the contents will be safe if shared.
16862 		 *
16863 		 * But we can still mark this memory region as "reusable" to
16864 		 * acknowledge that the caller did let us know that the memory
16865 		 * could be re-used and should not be penalized for holding
16866 		 * on to it.  This allows its "resident size" to not include
16867 		 * the reusable range.
16868 		 */
16869 		object->ref_count == 1 &&
16870 #endif
16871 		object->vo_copy == VM_OBJECT_NULL &&
16872 		object->shadow == VM_OBJECT_NULL &&
16873 		object->internal &&
16874 		object->purgable == VM_PURGABLE_DENY &&
16875 		HAS_DEFAULT_CACHEABILITY(object->wimg_bits & VM_WIMG_MASK) &&
16876 		!object->code_signed) {
16877 		return TRUE;
16878 	}
16879 	return FALSE;
16880 }
16881 
16882 static kern_return_t
vm_map_reuse_pages(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16883 vm_map_reuse_pages(
16884 	vm_map_t        map,
16885 	vm_map_offset_t start,
16886 	vm_map_offset_t end)
16887 {
16888 	vm_map_entry_t                  entry;
16889 	vm_object_t                     object;
16890 	vm_object_offset_t              start_offset, end_offset;
16891 
16892 	/*
16893 	 * The MADV_REUSE operation doesn't require any changes to the
16894 	 * vm_map_entry_t's, so the read lock is sufficient.
16895 	 */
16896 
16897 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16898 		/*
16899 		 * XXX TODO4K
16900 		 * need to figure out what reusable means for a
16901 		 * portion of a native page.
16902 		 */
16903 		return KERN_SUCCESS;
16904 	}
16905 
16906 	vm_map_lock_read(map);
16907 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16908 
16909 	/*
16910 	 * The madvise semantics require that the address range be fully
16911 	 * allocated with no holes.  Otherwise, we're required to return
16912 	 * an error.
16913 	 */
16914 
16915 	if (!vm_map_range_check(map, start, end, &entry)) {
16916 		vm_map_unlock_read(map);
16917 		vm_page_stats_reusable.reuse_pages_failure++;
16918 		return KERN_INVALID_ADDRESS;
16919 	}
16920 
16921 	/*
16922 	 * Examine each vm_map_entry_t in the range.
16923 	 */
16924 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16925 	    entry = entry->vme_next) {
16926 		/*
16927 		 * Sanity check on the VM map entry.
16928 		 */
16929 		if (!vm_map_entry_is_reusable(entry)) {
16930 			vm_map_unlock_read(map);
16931 			vm_page_stats_reusable.reuse_pages_failure++;
16932 			return KERN_INVALID_ADDRESS;
16933 		}
16934 
16935 		/*
16936 		 * The first time through, the start address could be anywhere
16937 		 * within the vm_map_entry we found.  So adjust the offset to
16938 		 * correspond.
16939 		 */
16940 		if (entry->vme_start < start) {
16941 			start_offset = start - entry->vme_start;
16942 		} else {
16943 			start_offset = 0;
16944 		}
16945 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16946 		start_offset += VME_OFFSET(entry);
16947 		end_offset += VME_OFFSET(entry);
16948 
16949 		object = VME_OBJECT(entry);
16950 		if (object != VM_OBJECT_NULL) {
16951 			vm_object_lock(object);
16952 			vm_object_reuse_pages(object, start_offset, end_offset,
16953 			    TRUE);
16954 			vm_object_unlock(object);
16955 		}
16956 
16957 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
16958 			/*
16959 			 * XXX
16960 			 * We do not hold the VM map exclusively here.
16961 			 * The "alias" field is not that critical, so it's
16962 			 * safe to update it here, as long as it is the only
16963 			 * one that can be modified while holding the VM map
16964 			 * "shared".
16965 			 */
16966 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
16967 		}
16968 	}
16969 
16970 	vm_map_unlock_read(map);
16971 	vm_page_stats_reusable.reuse_pages_success++;
16972 	return KERN_SUCCESS;
16973 }
16974 
16975 
16976 static kern_return_t
vm_map_reusable_pages(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16977 vm_map_reusable_pages(
16978 	vm_map_t        map,
16979 	vm_map_offset_t start,
16980 	vm_map_offset_t end)
16981 {
16982 	vm_map_entry_t                  entry;
16983 	vm_object_t                     object;
16984 	vm_object_offset_t              start_offset, end_offset;
16985 	vm_map_offset_t                 pmap_offset;
16986 
16987 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16988 		/*
16989 		 * XXX TODO4K
16990 		 * need to figure out what reusable means for a portion
16991 		 * of a native page.
16992 		 */
16993 		return KERN_SUCCESS;
16994 	}
16995 
16996 	/*
16997 	 * The MADV_REUSABLE operation doesn't require any changes to the
16998 	 * vm_map_entry_t's, so the read lock is sufficient.
16999 	 */
17000 
17001 	vm_map_lock_read(map);
17002 	assert(map->pmap != kernel_pmap);       /* protect alias access */
17003 
17004 	/*
17005 	 * The madvise semantics require that the address range be fully
17006 	 * allocated with no holes.  Otherwise, we're required to return
17007 	 * an error.
17008 	 */
17009 
17010 	if (!vm_map_range_check(map, start, end, &entry)) {
17011 		vm_map_unlock_read(map);
17012 		vm_page_stats_reusable.reusable_pages_failure++;
17013 		return KERN_INVALID_ADDRESS;
17014 	}
17015 
17016 	/*
17017 	 * Examine each vm_map_entry_t in the range.
17018 	 */
17019 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17020 	    entry = entry->vme_next) {
17021 		int kill_pages = 0;
17022 		boolean_t kill_no_write = FALSE;
17023 
17024 		/*
17025 		 * Sanity check on the VM map entry.
17026 		 */
17027 		if (!vm_map_entry_is_reusable(entry)) {
17028 			vm_map_unlock_read(map);
17029 			vm_page_stats_reusable.reusable_pages_failure++;
17030 			return KERN_INVALID_ADDRESS;
17031 		}
17032 
17033 		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
17034 #if __arm64e__
17035 		    && !entry->used_for_tpro
17036 #endif
17037 		    ) {
17038 			/* not writable: can't discard contents */
17039 			vm_map_unlock_read(map);
17040 			vm_page_stats_reusable.reusable_nonwritable++;
17041 			vm_page_stats_reusable.reusable_pages_failure++;
17042 			return KERN_PROTECTION_FAILURE;
17043 		}
17044 
17045 		/*
17046 		 * The first time through, the start address could be anywhere
17047 		 * within the vm_map_entry we found.  So adjust the offset to
17048 		 * correspond.
17049 		 */
17050 		if (entry->vme_start < start) {
17051 			start_offset = start - entry->vme_start;
17052 			pmap_offset = start;
17053 		} else {
17054 			start_offset = 0;
17055 			pmap_offset = entry->vme_start;
17056 		}
17057 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17058 		start_offset += VME_OFFSET(entry);
17059 		end_offset += VME_OFFSET(entry);
17060 
17061 		object = VME_OBJECT(entry);
17062 		if (object == VM_OBJECT_NULL) {
17063 			continue;
17064 		}
17065 
17066 		if ((entry->protection & VM_PROT_EXECUTE) ||
17067 		    entry->vme_xnu_user_debug) {
17068 			/*
17069 			 * Executable or user debug pages might be write-protected by
17070 			 * hardware, so do not attempt to write to these pages.
17071 			 */
17072 			kill_no_write = TRUE;
17073 		}
17074 
17075 		vm_object_lock(object);
17076 		if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
17077 		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
17078 		    object->vo_copy == VM_OBJECT_NULL)) &&
17079 		    object->shadow == VM_OBJECT_NULL &&
17080 		    /*
17081 		     * "iokit_acct" entries are billed for their virtual size
17082 		     * (rather than for their resident pages only), so they
17083 		     * wouldn't benefit from making pages reusable, and it
17084 		     * would be hard to keep track of pages that are both
17085 		     * "iokit_acct" and "reusable" in the pmap stats and
17086 		     * ledgers.
17087 		     */
17088 		    !(entry->iokit_acct ||
17089 		    (!entry->is_sub_map && !entry->use_pmap))) {
17090 			if (os_ref_get_count_raw(&object->ref_count) != 1) {
17091 				vm_page_stats_reusable.reusable_shared++;
17092 			}
17093 			kill_pages = 1;
17094 		} else {
17095 			kill_pages = -1;
17096 		}
17097 		if (kill_pages != -1) {
17098 			vm_object_deactivate_pages(object,
17099 			    start_offset,
17100 			    end_offset - start_offset,
17101 			    kill_pages,
17102 			    TRUE /*reusable_pages*/,
17103 			    kill_no_write,
17104 			    map->pmap,
17105 			    pmap_offset);
17106 		} else {
17107 			vm_page_stats_reusable.reusable_pages_shared++;
17108 			DTRACE_VM4(vm_map_reusable_pages_shared,
17109 			    unsigned int, VME_ALIAS(entry),
17110 			    vm_map_t, map,
17111 			    vm_map_entry_t, entry,
17112 			    vm_object_t, object);
17113 		}
17114 		vm_object_unlock(object);
17115 
17116 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
17117 		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
17118 			/*
17119 			 * XXX
17120 			 * We do not hold the VM map exclusively here.
17121 			 * The "alias" field is not that critical, so it's
17122 			 * safe to update it here, as long as it is the only
17123 			 * one that can be modified while holding the VM map
17124 			 * "shared".
17125 			 */
17126 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
17127 		}
17128 	}
17129 
17130 	vm_map_unlock_read(map);
17131 	vm_page_stats_reusable.reusable_pages_success++;
17132 	return KERN_SUCCESS;
17133 }
17134 
17135 
17136 static kern_return_t
vm_map_can_reuse(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)17137 vm_map_can_reuse(
17138 	vm_map_t        map,
17139 	vm_map_offset_t start,
17140 	vm_map_offset_t end)
17141 {
17142 	vm_map_entry_t                  entry;
17143 
17144 	/*
17145 	 * The MADV_REUSABLE operation doesn't require any changes to the
17146 	 * vm_map_entry_t's, so the read lock is sufficient.
17147 	 */
17148 
17149 	vm_map_lock_read(map);
17150 	assert(map->pmap != kernel_pmap);       /* protect alias access */
17151 
17152 	/*
17153 	 * The madvise semantics require that the address range be fully
17154 	 * allocated with no holes.  Otherwise, we're required to return
17155 	 * an error.
17156 	 */
17157 
17158 	if (!vm_map_range_check(map, start, end, &entry)) {
17159 		vm_map_unlock_read(map);
17160 		vm_page_stats_reusable.can_reuse_failure++;
17161 		return KERN_INVALID_ADDRESS;
17162 	}
17163 
17164 	/*
17165 	 * Examine each vm_map_entry_t in the range.
17166 	 */
17167 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17168 	    entry = entry->vme_next) {
17169 		/*
17170 		 * Sanity check on the VM map entry.
17171 		 */
17172 		if (!vm_map_entry_is_reusable(entry)) {
17173 			vm_map_unlock_read(map);
17174 			vm_page_stats_reusable.can_reuse_failure++;
17175 			return KERN_INVALID_ADDRESS;
17176 		}
17177 	}
17178 
17179 	vm_map_unlock_read(map);
17180 	vm_page_stats_reusable.can_reuse_success++;
17181 	return KERN_SUCCESS;
17182 }
17183 
17184 
17185 #if MACH_ASSERT
17186 static kern_return_t
vm_map_pageout(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)17187 vm_map_pageout(
17188 	vm_map_t        map,
17189 	vm_map_offset_t start,
17190 	vm_map_offset_t end)
17191 {
17192 	vm_map_entry_t                  entry;
17193 
17194 	/*
17195 	 * The MADV_PAGEOUT operation doesn't require any changes to the
17196 	 * vm_map_entry_t's, so the read lock is sufficient.
17197 	 */
17198 
17199 	vm_map_lock_read(map);
17200 
17201 	/*
17202 	 * The madvise semantics require that the address range be fully
17203 	 * allocated with no holes.  Otherwise, we're required to return
17204 	 * an error.
17205 	 */
17206 
17207 	if (!vm_map_range_check(map, start, end, &entry)) {
17208 		vm_map_unlock_read(map);
17209 		return KERN_INVALID_ADDRESS;
17210 	}
17211 
17212 	/*
17213 	 * Examine each vm_map_entry_t in the range.
17214 	 */
17215 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17216 	    entry = entry->vme_next) {
17217 		vm_object_t     object;
17218 
17219 		/*
17220 		 * Sanity check on the VM map entry.
17221 		 */
17222 		if (entry->is_sub_map) {
17223 			vm_map_t submap;
17224 			vm_map_offset_t submap_start;
17225 			vm_map_offset_t submap_end;
17226 			vm_map_entry_t submap_entry;
17227 
17228 			submap = VME_SUBMAP(entry);
17229 			submap_start = VME_OFFSET(entry);
17230 			submap_end = submap_start + (entry->vme_end -
17231 			    entry->vme_start);
17232 
17233 			vm_map_lock_read(submap);
17234 
17235 			if (!vm_map_range_check(submap,
17236 			    submap_start,
17237 			    submap_end,
17238 			    &submap_entry)) {
17239 				vm_map_unlock_read(submap);
17240 				vm_map_unlock_read(map);
17241 				return KERN_INVALID_ADDRESS;
17242 			}
17243 
17244 			if (submap_entry->is_sub_map) {
17245 				vm_map_unlock_read(submap);
17246 				continue;
17247 			}
17248 
17249 			object = VME_OBJECT(submap_entry);
17250 			if (object == VM_OBJECT_NULL || !object->internal) {
17251 				vm_map_unlock_read(submap);
17252 				continue;
17253 			}
17254 
17255 			vm_object_pageout(object);
17256 
17257 			vm_map_unlock_read(submap);
17258 			submap = VM_MAP_NULL;
17259 			submap_entry = VM_MAP_ENTRY_NULL;
17260 			continue;
17261 		}
17262 
17263 		object = VME_OBJECT(entry);
17264 		if (object == VM_OBJECT_NULL || !object->internal) {
17265 			continue;
17266 		}
17267 
17268 		vm_object_pageout(object);
17269 	}
17270 
17271 	vm_map_unlock_read(map);
17272 	return KERN_SUCCESS;
17273 }
17274 #endif /* MACH_ASSERT */
17275 
17276 /*
17277  * This function determines if the zero operation can be run on the
17278  * respective entry. Additional checks on the object are in
17279  * vm_object_zero_preflight.
17280  */
17281 static kern_return_t
vm_map_zero_entry_preflight(vm_map_entry_t entry)17282 vm_map_zero_entry_preflight(vm_map_entry_t entry)
17283 {
17284 	/*
17285 	 * Zeroing is restricted to writable non-executable entries and non-JIT
17286 	 * regions.
17287 	 */
17288 	if (!(entry->protection & VM_PROT_WRITE) ||
17289 	    (entry->protection & VM_PROT_EXECUTE) ||
17290 	    entry->used_for_jit ||
17291 	    entry->vme_xnu_user_debug) {
17292 		return KERN_PROTECTION_FAILURE;
17293 	}
17294 
17295 	/*
17296 	 * Zeroing for copy on write isn't yet supported. Zeroing is also not
17297 	 * allowed for submaps.
17298 	 */
17299 	if (entry->needs_copy || entry->is_sub_map) {
17300 		return KERN_NO_ACCESS;
17301 	}
17302 
17303 	return KERN_SUCCESS;
17304 }
17305 
17306 /*
17307  * This function translates entry's start and end to offsets in the object
17308  */
17309 static void
vm_map_get_bounds_in_object(vm_map_entry_t entry,vm_map_offset_t start,vm_map_offset_t end,vm_map_offset_t * start_offset,vm_map_offset_t * end_offset)17310 vm_map_get_bounds_in_object(
17311 	vm_map_entry_t      entry,
17312 	vm_map_offset_t     start,
17313 	vm_map_offset_t     end,
17314 	vm_map_offset_t    *start_offset,
17315 	vm_map_offset_t    *end_offset)
17316 {
17317 	if (entry->vme_start < start) {
17318 		*start_offset = start - entry->vme_start;
17319 	} else {
17320 		*start_offset = 0;
17321 	}
17322 	*end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17323 	*start_offset += VME_OFFSET(entry);
17324 	*end_offset += VME_OFFSET(entry);
17325 }
17326 
17327 /*
17328  * This function iterates through the entries in the requested range
17329  * and zeroes any resident pages in the corresponding objects. Compressed
17330  * pages are dropped instead of being faulted in and zeroed.
17331  */
17332 static kern_return_t
vm_map_zero(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)17333 vm_map_zero(
17334 	vm_map_t        map,
17335 	vm_map_offset_t start,
17336 	vm_map_offset_t end)
17337 {
17338 	vm_map_entry_t                  entry;
17339 	vm_map_offset_t                 cur = start;
17340 	kern_return_t                   ret;
17341 
17342 	/*
17343 	 * This operation isn't supported where the map page size is less than
17344 	 * the hardware page size. Caller will need to handle error and
17345 	 * explicitly zero memory if needed.
17346 	 */
17347 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17348 		return KERN_NO_ACCESS;
17349 	}
17350 
17351 	/*
17352 	 * The MADV_ZERO operation doesn't require any changes to the
17353 	 * vm_map_entry_t's, so the read lock is sufficient.
17354 	 */
17355 	vm_map_lock_read(map);
17356 	assert(map->pmap != kernel_pmap);       /* protect alias access */
17357 
17358 	/*
17359 	 * The madvise semantics require that the address range be fully
17360 	 * allocated with no holes. Otherwise, we're required to return
17361 	 * an error. This check needs to be redone if the map has changed.
17362 	 */
17363 	if (!vm_map_range_check(map, cur, end, &entry)) {
17364 		vm_map_unlock_read(map);
17365 		return KERN_INVALID_ADDRESS;
17366 	}
17367 
17368 	/*
17369 	 * Examine each vm_map_entry_t in the range.
17370 	 */
17371 	while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
17372 		vm_map_offset_t cur_offset;
17373 		vm_map_offset_t end_offset;
17374 		unsigned int last_timestamp = map->timestamp;
17375 		vm_object_t object = VME_OBJECT(entry);
17376 
17377 		ret = vm_map_zero_entry_preflight(entry);
17378 		if (ret != KERN_SUCCESS) {
17379 			vm_map_unlock_read(map);
17380 			return ret;
17381 		}
17382 
17383 		if (object == VM_OBJECT_NULL) {
17384 			entry = entry->vme_next;
17385 			continue;
17386 		}
17387 
17388 		vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset);
17389 		vm_object_lock(object);
17390 		/*
17391 		 * Take a reference on the object as vm_object_zero will drop the object
17392 		 * lock when it encounters a busy page.
17393 		 */
17394 		vm_object_reference_locked(object);
17395 		vm_map_unlock_read(map);
17396 
17397 		ret = vm_object_zero(object, cur_offset, end_offset);
17398 		vm_object_unlock(object);
17399 		vm_object_deallocate(object);
17400 		if (ret != KERN_SUCCESS) {
17401 			return ret;
17402 		}
17403 		/*
17404 		 * Update cur as vm_object_zero has succeeded.
17405 		 */
17406 		cur += (end_offset - cur_offset);
17407 		if (cur == end) {
17408 			return KERN_SUCCESS;
17409 		}
17410 
17411 		/*
17412 		 * If the map timestamp has changed, restart by relooking up cur in the
17413 		 * map
17414 		 */
17415 		vm_map_lock_read(map);
17416 		if (last_timestamp != map->timestamp) {
17417 			/*
17418 			 * Relookup cur in the map
17419 			 */
17420 			if (!vm_map_range_check(map, cur, end, &entry)) {
17421 				vm_map_unlock_read(map);
17422 				return KERN_INVALID_ADDRESS;
17423 			}
17424 			continue;
17425 		}
17426 		/*
17427 		 * If the map hasn't changed proceed with the next entry
17428 		 */
17429 		entry = entry->vme_next;
17430 	}
17431 
17432 	vm_map_unlock_read(map);
17433 	return KERN_SUCCESS;
17434 }
17435 
17436 
17437 /*
17438  *	Routine:	vm_map_entry_insert
17439  *
17440  *	Description:	This routine inserts a new vm_entry in a locked map.
17441  */
17442 static vm_map_entry_t
vm_map_entry_insert(vm_map_t map,vm_map_entry_t insp_entry,vm_map_offset_t start,vm_map_offset_t end,vm_object_t object,vm_object_offset_t offset,vm_map_kernel_flags_t vmk_flags,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,boolean_t clear_map_aligned)17443 vm_map_entry_insert(
17444 	vm_map_t                map,
17445 	vm_map_entry_t          insp_entry,
17446 	vm_map_offset_t         start,
17447 	vm_map_offset_t         end,
17448 	vm_object_t             object,
17449 	vm_object_offset_t      offset,
17450 	vm_map_kernel_flags_t   vmk_flags,
17451 	boolean_t               needs_copy,
17452 	vm_prot_t               cur_protection,
17453 	vm_prot_t               max_protection,
17454 	vm_inherit_t            inheritance,
17455 	boolean_t               clear_map_aligned)
17456 {
17457 	vm_map_entry_t  new_entry;
17458 	boolean_t map_aligned = FALSE;
17459 
17460 	assert(insp_entry != (vm_map_entry_t)0);
17461 	vm_map_lock_assert_exclusive(map);
17462 
17463 	__assert_only vm_object_offset_t      end_offset = 0;
17464 	assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
17465 
17466 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
17467 		map_aligned = TRUE;
17468 	}
17469 	if (clear_map_aligned &&
17470 	    (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
17471 	    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
17472 		map_aligned = FALSE;
17473 	}
17474 	if (map_aligned) {
17475 		assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
17476 		assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
17477 	} else {
17478 		assert(page_aligned(start));
17479 		assert(page_aligned(end));
17480 	}
17481 	assert(start < end);
17482 
17483 	new_entry = vm_map_entry_create(map);
17484 
17485 	new_entry->vme_start = start;
17486 	new_entry->vme_end = end;
17487 
17488 	if (vmk_flags.vmkf_submap) {
17489 		new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
17490 		VME_SUBMAP_SET(new_entry, (vm_map_t)object);
17491 	} else {
17492 		VME_OBJECT_SET(new_entry, object, false, 0);
17493 	}
17494 	VME_OFFSET_SET(new_entry, offset);
17495 	VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
17496 
17497 	new_entry->map_aligned = map_aligned;
17498 	new_entry->needs_copy = needs_copy;
17499 	new_entry->inheritance = inheritance;
17500 	new_entry->protection = cur_protection;
17501 	new_entry->max_protection = max_protection;
17502 	/*
17503 	 * submap: "use_pmap" means "nested".
17504 	 * default: false.
17505 	 *
17506 	 * object: "use_pmap" means "use pmap accounting" for footprint.
17507 	 * default: true.
17508 	 */
17509 	new_entry->use_pmap = !vmk_flags.vmkf_submap;
17510 	new_entry->no_cache = vmk_flags.vmf_no_cache;
17511 	new_entry->vme_permanent = vmk_flags.vmf_permanent;
17512 	new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
17513 	new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
17514 	new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
17515 
17516 	if (vmk_flags.vmkf_map_jit) {
17517 		if (!(map->jit_entry_exists) ||
17518 		    VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
17519 			new_entry->used_for_jit = TRUE;
17520 			map->jit_entry_exists = TRUE;
17521 		}
17522 	}
17523 
17524 	/*
17525 	 *	Insert the new entry into the list.
17526 	 */
17527 
17528 	vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
17529 	map->size += end - start;
17530 
17531 	/*
17532 	 *	Update the free space hint and the lookup hint.
17533 	 */
17534 
17535 	SAVE_HINT_MAP_WRITE(map, new_entry);
17536 	return new_entry;
17537 }
17538 
17539 /*
17540  *	Routine:	vm_map_remap_extract
17541  *
17542  *	Description:	This routine returns a vm_entry list from a map.
17543  */
17544 static kern_return_t
vm_map_remap_extract(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size,boolean_t copy,vm_map_copy_t map_copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)17545 vm_map_remap_extract(
17546 	vm_map_t                map,
17547 	vm_map_offset_t         addr,
17548 	vm_map_size_t           size,
17549 	boolean_t               copy,
17550 	vm_map_copy_t           map_copy,
17551 	vm_prot_t               *cur_protection,   /* IN/OUT */
17552 	vm_prot_t               *max_protection,   /* IN/OUT */
17553 	/* What, no behavior? */
17554 	vm_inherit_t            inheritance,
17555 	vm_map_kernel_flags_t   vmk_flags)
17556 {
17557 	struct vm_map_header   *map_header = &map_copy->cpy_hdr;
17558 	kern_return_t           result;
17559 	vm_map_size_t           mapped_size;
17560 	vm_map_size_t           tmp_size;
17561 	vm_map_entry_t          src_entry;     /* result of last map lookup */
17562 	vm_map_entry_t          new_entry;
17563 	vm_object_offset_t      offset;
17564 	vm_map_offset_t         map_address;
17565 	vm_map_offset_t         src_start;     /* start of entry to map */
17566 	vm_map_offset_t         src_end;       /* end of region to be mapped */
17567 	vm_object_t             object;
17568 	vm_map_version_t        version;
17569 	boolean_t               src_needs_copy;
17570 	boolean_t               new_entry_needs_copy;
17571 	vm_map_entry_t          saved_src_entry;
17572 	boolean_t               src_entry_was_wired;
17573 	vm_prot_t               max_prot_for_prot_copy;
17574 	vm_map_offset_t         effective_page_mask;
17575 	bool                    pageable, same_map;
17576 	boolean_t               vm_remap_legacy;
17577 	vm_prot_t               required_cur_prot, required_max_prot;
17578 	vm_object_t             new_copy_object;     /* vm_object_copy_* result */
17579 	boolean_t               saved_used_for_jit;  /* Saved used_for_jit. */
17580 
17581 	pageable = vmk_flags.vmkf_copy_pageable;
17582 	same_map = vmk_flags.vmkf_copy_same_map;
17583 
17584 	effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17585 
17586 	assert(map != VM_MAP_NULL);
17587 	assert(size != 0);
17588 	assert(size == vm_map_round_page(size, effective_page_mask));
17589 	assert(inheritance == VM_INHERIT_NONE ||
17590 	    inheritance == VM_INHERIT_COPY ||
17591 	    inheritance == VM_INHERIT_SHARE);
17592 	assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17593 	assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17594 	assert((*cur_protection & *max_protection) == *cur_protection);
17595 
17596 	/*
17597 	 *	Compute start and end of region.
17598 	 */
17599 	src_start = vm_map_trunc_page(addr, effective_page_mask);
17600 	src_end = vm_map_round_page(src_start + size, effective_page_mask);
17601 
17602 	/*
17603 	 *	Initialize map_header.
17604 	 */
17605 	map_header->nentries = 0;
17606 	map_header->entries_pageable = pageable;
17607 //	map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17608 	map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17609 	map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17610 	vm_map_store_init(map_header);
17611 
17612 	if (copy && vmk_flags.vmkf_remap_prot_copy) {
17613 		/*
17614 		 * Special case for vm_map_protect(VM_PROT_COPY):
17615 		 * we want to set the new mappings' max protection to the
17616 		 * specified *max_protection...
17617 		 */
17618 		max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17619 		/* ... but we want to use the vm_remap() legacy mode */
17620 		vmk_flags.vmkf_remap_legacy_mode = true;
17621 		*max_protection = VM_PROT_NONE;
17622 		*cur_protection = VM_PROT_NONE;
17623 	} else {
17624 		max_prot_for_prot_copy = VM_PROT_NONE;
17625 	}
17626 
17627 	if (vmk_flags.vmkf_remap_legacy_mode) {
17628 		/*
17629 		 * vm_remap() legacy mode:
17630 		 * Extract all memory regions in the specified range and
17631 		 * collect the strictest set of protections allowed on the
17632 		 * entire range, so the caller knows what they can do with
17633 		 * the remapped range.
17634 		 * We start with VM_PROT_ALL and we'll remove the protections
17635 		 * missing from each memory region.
17636 		 */
17637 		vm_remap_legacy = TRUE;
17638 		*cur_protection = VM_PROT_ALL;
17639 		*max_protection = VM_PROT_ALL;
17640 		required_cur_prot = VM_PROT_NONE;
17641 		required_max_prot = VM_PROT_NONE;
17642 	} else {
17643 		/*
17644 		 * vm_remap_new() mode:
17645 		 * Extract all memory regions in the specified range and
17646 		 * ensure that they have at least the protections specified
17647 		 * by the caller via *cur_protection and *max_protection.
17648 		 * The resulting mapping should have these protections.
17649 		 */
17650 		vm_remap_legacy = FALSE;
17651 		if (copy) {
17652 			required_cur_prot = VM_PROT_NONE;
17653 			required_max_prot = VM_PROT_READ;
17654 		} else {
17655 			required_cur_prot = *cur_protection;
17656 			required_max_prot = *max_protection;
17657 		}
17658 	}
17659 
17660 	map_address = 0;
17661 	mapped_size = 0;
17662 	result = KERN_SUCCESS;
17663 
17664 	/*
17665 	 *	The specified source virtual space might correspond to
17666 	 *	multiple map entries, need to loop on them.
17667 	 */
17668 	vm_map_lock(map);
17669 
17670 	if (map->pmap == kernel_pmap) {
17671 		map_copy->is_kernel_range = true;
17672 		map_copy->orig_range = kmem_addr_get_range(addr, size);
17673 #if CONFIG_MAP_RANGES
17674 	} else if (map->uses_user_ranges) {
17675 		map_copy->is_user_range = true;
17676 		map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17677 #endif /* CONFIG_MAP_RANGES */
17678 	}
17679 
17680 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17681 		/*
17682 		 * This address space uses sub-pages so the range might
17683 		 * not be re-mappable in an address space with larger
17684 		 * pages. Re-assemble any broken-up VM map entries to
17685 		 * improve our chances of making it work.
17686 		 */
17687 		vm_map_simplify_range(map, src_start, src_end);
17688 	}
17689 	while (mapped_size != size) {
17690 		vm_map_size_t   entry_size;
17691 
17692 		/*
17693 		 *	Find the beginning of the region.
17694 		 */
17695 		if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17696 			result = KERN_INVALID_ADDRESS;
17697 			break;
17698 		}
17699 
17700 		if (src_start < src_entry->vme_start ||
17701 		    (mapped_size && src_start != src_entry->vme_start)) {
17702 			result = KERN_INVALID_ADDRESS;
17703 			break;
17704 		}
17705 
17706 		tmp_size = size - mapped_size;
17707 		if (src_end > src_entry->vme_end) {
17708 			tmp_size -= (src_end - src_entry->vme_end);
17709 		}
17710 
17711 		entry_size = (vm_map_size_t)(src_entry->vme_end -
17712 		    src_entry->vme_start);
17713 
17714 		if (src_entry->is_sub_map &&
17715 		    vmk_flags.vmkf_copy_single_object) {
17716 			vm_map_t submap;
17717 			vm_map_offset_t submap_start;
17718 			vm_map_size_t submap_size;
17719 			boolean_t submap_needs_copy;
17720 
17721 			/*
17722 			 * No check for "required protection" on "src_entry"
17723 			 * because the protections that matter are the ones
17724 			 * on the submap's VM map entry, which will be checked
17725 			 * during the call to vm_map_remap_extract() below.
17726 			 */
17727 			object = VM_OBJECT_NULL;
17728 
17729 			submap_size = src_entry->vme_end - src_start;
17730 			if (submap_size > size) {
17731 				submap_size = size;
17732 			}
17733 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17734 			submap = VME_SUBMAP(src_entry);
17735 			if (copy) {
17736 				/*
17737 				 * The caller wants a copy-on-write re-mapping,
17738 				 * so let's extract from the submap accordingly.
17739 				 */
17740 				submap_needs_copy = TRUE;
17741 			} else if (src_entry->needs_copy) {
17742 				/*
17743 				 * The caller wants a shared re-mapping but the
17744 				 * submap is mapped with "needs_copy", so its
17745 				 * contents can't be shared as is. Extract the
17746 				 * contents of the submap as "copy-on-write".
17747 				 * The re-mapping won't be shared with the
17748 				 * original mapping but this is equivalent to
17749 				 * what happened with the original "remap from
17750 				 * submap" code.
17751 				 * The shared region is mapped "needs_copy", for
17752 				 * example.
17753 				 */
17754 				submap_needs_copy = TRUE;
17755 			} else {
17756 				/*
17757 				 * The caller wants a shared re-mapping and
17758 				 * this mapping can be shared (no "needs_copy"),
17759 				 * so let's extract from the submap accordingly.
17760 				 * Kernel submaps are mapped without
17761 				 * "needs_copy", for example.
17762 				 */
17763 				submap_needs_copy = FALSE;
17764 			}
17765 			vm_map_reference(submap);
17766 			vm_map_unlock(map);
17767 			src_entry = NULL;
17768 			if (vm_remap_legacy) {
17769 				*cur_protection = VM_PROT_NONE;
17770 				*max_protection = VM_PROT_NONE;
17771 			}
17772 
17773 			DTRACE_VM7(remap_submap_recurse,
17774 			    vm_map_t, map,
17775 			    vm_map_offset_t, addr,
17776 			    vm_map_size_t, size,
17777 			    boolean_t, copy,
17778 			    vm_map_offset_t, submap_start,
17779 			    vm_map_size_t, submap_size,
17780 			    boolean_t, submap_needs_copy);
17781 
17782 			result = vm_map_remap_extract(submap,
17783 			    submap_start,
17784 			    submap_size,
17785 			    submap_needs_copy,
17786 			    map_copy,
17787 			    cur_protection,
17788 			    max_protection,
17789 			    inheritance,
17790 			    vmk_flags);
17791 			vm_map_deallocate(submap);
17792 
17793 			if (result == KERN_SUCCESS &&
17794 			    submap_needs_copy &&
17795 			    !copy) {
17796 				/*
17797 				 * We were asked for a "shared"
17798 				 * re-mapping but had to ask for a
17799 				 * "copy-on-write" remapping of the
17800 				 * submap's mapping to honor the
17801 				 * submap's "needs_copy".
17802 				 * We now need to resolve that
17803 				 * pending "copy-on-write" to
17804 				 * get something we can share.
17805 				 */
17806 				vm_map_entry_t copy_entry;
17807 				vm_object_offset_t copy_offset;
17808 				vm_map_size_t copy_size;
17809 				vm_object_t copy_object;
17810 				copy_entry = vm_map_copy_first_entry(map_copy);
17811 				copy_size = copy_entry->vme_end - copy_entry->vme_start;
17812 				copy_object = VME_OBJECT(copy_entry);
17813 				copy_offset = VME_OFFSET(copy_entry);
17814 				if (copy_object == VM_OBJECT_NULL) {
17815 					assert(copy_offset == 0);
17816 					assert(!copy_entry->needs_copy);
17817 					if (copy_entry->max_protection == VM_PROT_NONE) {
17818 						assert(copy_entry->protection == VM_PROT_NONE);
17819 						/* nothing to share */
17820 					} else {
17821 						assert(copy_offset == 0);
17822 						copy_object = vm_object_allocate(copy_size, submap->serial_id);
17823 						VME_OFFSET_SET(copy_entry, 0);
17824 						VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17825 						assert(copy_entry->use_pmap);
17826 					}
17827 				} else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17828 					/* already shareable */
17829 					assert(!copy_entry->needs_copy);
17830 				} else if (copy_entry->needs_copy ||
17831 				    copy_object->shadowed ||
17832 				    (copy_object->internal &&
17833 				    !copy_object->true_share &&
17834 				    !copy_entry->is_shared &&
17835 				    copy_object->vo_size > copy_size)) {
17836 					VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17837 					assert(copy_entry->use_pmap);
17838 					if (copy_entry->needs_copy) {
17839 						/* already write-protected */
17840 					} else {
17841 						vm_prot_t prot;
17842 						prot = copy_entry->protection & ~VM_PROT_WRITE;
17843 						vm_object_pmap_protect(copy_object,
17844 						    copy_offset,
17845 						    copy_size,
17846 						    PMAP_NULL,
17847 						    PAGE_SIZE,
17848 						    0,
17849 						    prot);
17850 					}
17851 					copy_entry->needs_copy = FALSE;
17852 				}
17853 				copy_object = VME_OBJECT(copy_entry);
17854 				copy_offset = VME_OFFSET(copy_entry);
17855 				if (copy_object &&
17856 				    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17857 					copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17858 					copy_object->true_share = TRUE;
17859 				}
17860 			}
17861 
17862 			return result;
17863 		}
17864 
17865 		if (src_entry->is_sub_map) {
17866 			/* protections for submap mapping are irrelevant here */
17867 		} else if (((src_entry->protection & required_cur_prot) !=
17868 		    required_cur_prot) ||
17869 		    ((src_entry->max_protection & required_max_prot) !=
17870 		    required_max_prot)) {
17871 			if (vmk_flags.vmkf_copy_single_object &&
17872 			    mapped_size != 0) {
17873 				/*
17874 				 * Single object extraction.
17875 				 * We can't extract more with the required
17876 				 * protection but we've extracted some, so
17877 				 * stop there and declare success.
17878 				 * The caller should check the size of
17879 				 * the copy entry we've extracted.
17880 				 */
17881 				result = KERN_SUCCESS;
17882 			} else {
17883 				/*
17884 				 * VM range extraction.
17885 				 * Required proctection is not available
17886 				 * for this part of the range: fail.
17887 				 */
17888 				result = KERN_PROTECTION_FAILURE;
17889 			}
17890 			break;
17891 		}
17892 
17893 		if (src_entry->is_sub_map) {
17894 			vm_map_t submap;
17895 			vm_map_offset_t submap_start;
17896 			vm_map_size_t submap_size;
17897 			vm_map_copy_t submap_copy;
17898 			vm_prot_t submap_curprot, submap_maxprot;
17899 			boolean_t submap_needs_copy;
17900 
17901 			/*
17902 			 * No check for "required protection" on "src_entry"
17903 			 * because the protections that matter are the ones
17904 			 * on the submap's VM map entry, which will be checked
17905 			 * during the call to vm_map_copy_extract() below.
17906 			 */
17907 			object = VM_OBJECT_NULL;
17908 			submap_copy = VM_MAP_COPY_NULL;
17909 
17910 			/* find equivalent range in the submap */
17911 			submap = VME_SUBMAP(src_entry);
17912 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17913 			submap_size = tmp_size;
17914 			if (copy) {
17915 				/*
17916 				 * The caller wants a copy-on-write re-mapping,
17917 				 * so let's extract from the submap accordingly.
17918 				 */
17919 				submap_needs_copy = TRUE;
17920 			} else if (src_entry->needs_copy) {
17921 				/*
17922 				 * The caller wants a shared re-mapping but the
17923 				 * submap is mapped with "needs_copy", so its
17924 				 * contents can't be shared as is. Extract the
17925 				 * contents of the submap as "copy-on-write".
17926 				 * The re-mapping won't be shared with the
17927 				 * original mapping but this is equivalent to
17928 				 * what happened with the original "remap from
17929 				 * submap" code.
17930 				 * The shared region is mapped "needs_copy", for
17931 				 * example.
17932 				 */
17933 				submap_needs_copy = TRUE;
17934 			} else {
17935 				/*
17936 				 * The caller wants a shared re-mapping and
17937 				 * this mapping can be shared (no "needs_copy"),
17938 				 * so let's extract from the submap accordingly.
17939 				 * Kernel submaps are mapped without
17940 				 * "needs_copy", for example.
17941 				 */
17942 				submap_needs_copy = FALSE;
17943 			}
17944 			/* extra ref to keep submap alive */
17945 			vm_map_reference(submap);
17946 
17947 			DTRACE_VM7(remap_submap_recurse,
17948 			    vm_map_t, map,
17949 			    vm_map_offset_t, addr,
17950 			    vm_map_size_t, size,
17951 			    boolean_t, copy,
17952 			    vm_map_offset_t, submap_start,
17953 			    vm_map_size_t, submap_size,
17954 			    boolean_t, submap_needs_copy);
17955 
17956 			/*
17957 			 * The map can be safely unlocked since we
17958 			 * already hold a reference on the submap.
17959 			 *
17960 			 * No timestamp since we don't care if the map
17961 			 * gets modified while we're down in the submap.
17962 			 * We'll resume the extraction at src_start + tmp_size
17963 			 * anyway.
17964 			 */
17965 			vm_map_unlock(map);
17966 			src_entry = NULL; /* not valid once map is unlocked */
17967 
17968 			if (vm_remap_legacy) {
17969 				submap_curprot = VM_PROT_NONE;
17970 				submap_maxprot = VM_PROT_NONE;
17971 				if (max_prot_for_prot_copy) {
17972 					submap_maxprot = max_prot_for_prot_copy;
17973 				}
17974 			} else {
17975 				assert(!max_prot_for_prot_copy);
17976 				submap_curprot = *cur_protection;
17977 				submap_maxprot = *max_protection;
17978 			}
17979 			result = vm_map_copy_extract(submap,
17980 			    submap_start,
17981 			    submap_size,
17982 			    submap_needs_copy,
17983 			    &submap_copy,
17984 			    &submap_curprot,
17985 			    &submap_maxprot,
17986 			    inheritance,
17987 			    vmk_flags);
17988 
17989 			/* release extra ref on submap */
17990 			vm_map_deallocate(submap);
17991 			submap = VM_MAP_NULL;
17992 
17993 			if (result != KERN_SUCCESS) {
17994 				vm_map_lock(map);
17995 				break;
17996 			}
17997 
17998 			/* transfer submap_copy entries to map_header */
17999 			while (vm_map_copy_first_entry(submap_copy) !=
18000 			    vm_map_copy_to_entry(submap_copy)) {
18001 				vm_map_entry_t copy_entry;
18002 				vm_map_size_t copy_entry_size;
18003 
18004 				copy_entry = vm_map_copy_first_entry(submap_copy);
18005 
18006 				/*
18007 				 * Prevent kernel_object from being exposed to
18008 				 * user space.
18009 				 */
18010 				if (__improbable(copy_entry->vme_kernel_object)) {
18011 					printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
18012 					    proc_selfpid(),
18013 					    (get_bsdtask_info(current_task())
18014 					    ? proc_name_address(get_bsdtask_info(current_task()))
18015 					    : "?"));
18016 					DTRACE_VM(extract_kernel_only);
18017 					result = KERN_INVALID_RIGHT;
18018 					vm_map_copy_discard(submap_copy);
18019 					submap_copy = VM_MAP_COPY_NULL;
18020 					vm_map_lock(map);
18021 					break;
18022 				}
18023 
18024 				vm_map_copy_entry_unlink(submap_copy, copy_entry);
18025 				copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
18026 				copy_entry->vme_start = map_address;
18027 				copy_entry->vme_end = map_address + copy_entry_size;
18028 				map_address += copy_entry_size;
18029 				mapped_size += copy_entry_size;
18030 				src_start += copy_entry_size;
18031 				assert(src_start <= src_end);
18032 				_vm_map_store_entry_link(map_header,
18033 				    map_header->links.prev,
18034 				    copy_entry);
18035 			}
18036 			/* done with submap_copy */
18037 			vm_map_copy_discard(submap_copy);
18038 
18039 			if (vm_remap_legacy) {
18040 				*cur_protection &= submap_curprot;
18041 				*max_protection &= submap_maxprot;
18042 			}
18043 
18044 			/* re-acquire the map lock and continue to next entry */
18045 			vm_map_lock(map);
18046 			continue;
18047 		} else {
18048 			object = VME_OBJECT(src_entry);
18049 
18050 			/*
18051 			 * Prevent kernel_object from being exposed to
18052 			 * user space.
18053 			 */
18054 			if (__improbable(is_kernel_object(object))) {
18055 				printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
18056 				    proc_selfpid(),
18057 				    (get_bsdtask_info(current_task())
18058 				    ? proc_name_address(get_bsdtask_info(current_task()))
18059 				    : "?"));
18060 				DTRACE_VM(extract_kernel_only);
18061 				result = KERN_INVALID_RIGHT;
18062 				break;
18063 			}
18064 
18065 			if (src_entry->iokit_acct) {
18066 				/*
18067 				 * This entry uses "IOKit accounting".
18068 				 */
18069 			} else if (object != VM_OBJECT_NULL &&
18070 			    object->internal &&
18071 			    (object->purgable != VM_PURGABLE_DENY ||
18072 			    object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
18073 				/*
18074 				 * Purgeable objects have their own accounting:
18075 				 * no pmap accounting for them.
18076 				 */
18077 				assertf(!src_entry->use_pmap,
18078 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
18079 				    map,
18080 				    src_entry,
18081 				    (uint64_t)src_entry->vme_start,
18082 				    (uint64_t)src_entry->vme_end,
18083 				    src_entry->protection,
18084 				    src_entry->max_protection,
18085 				    VME_ALIAS(src_entry));
18086 			} else {
18087 				/*
18088 				 * Not IOKit or purgeable:
18089 				 * must be accounted by pmap stats.
18090 				 */
18091 				assertf(src_entry->use_pmap,
18092 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
18093 				    map,
18094 				    src_entry,
18095 				    (uint64_t)src_entry->vme_start,
18096 				    (uint64_t)src_entry->vme_end,
18097 				    src_entry->protection,
18098 				    src_entry->max_protection,
18099 				    VME_ALIAS(src_entry));
18100 			}
18101 
18102 			if (object == VM_OBJECT_NULL) {
18103 				assert(!src_entry->needs_copy);
18104 				if (src_entry->max_protection == VM_PROT_NONE) {
18105 					assert(src_entry->protection == VM_PROT_NONE);
18106 					/*
18107 					 * No VM object and no permissions:
18108 					 * this must be a reserved range with
18109 					 * nothing to share or copy.
18110 					 * There could also be all sorts of
18111 					 * pmap shenanigans within that reserved
18112 					 * range, so let's just copy the map
18113 					 * entry as is to remap a similar
18114 					 * reserved range.
18115 					 */
18116 					offset = 0; /* no object => no offset */
18117 					goto copy_src_entry;
18118 				}
18119 				object = vm_object_allocate(entry_size, map->serial_id);
18120 				VME_OFFSET_SET(src_entry, 0);
18121 				VME_OBJECT_SET(src_entry, object, false, 0);
18122 				assert(src_entry->use_pmap);
18123 				assert(!map->mapped_in_other_pmaps);
18124 			} else if (src_entry->wired_count ||
18125 			    object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
18126 				/*
18127 				 * A wired memory region should not have
18128 				 * any pending copy-on-write and needs to
18129 				 * keep pointing at the VM object that
18130 				 * contains the wired pages.
18131 				 * If we're sharing this memory (copy=false),
18132 				 * we'll share this VM object.
18133 				 * If we're copying this memory (copy=true),
18134 				 * we'll call vm_object_copy_slowly() below
18135 				 * and use the new VM object for the remapping.
18136 				 *
18137 				 * Or, we are already using an asymmetric
18138 				 * copy, and therefore we already have
18139 				 * the right object.
18140 				 */
18141 				assert(!src_entry->needs_copy);
18142 			} else if (src_entry->needs_copy || object->shadowed ||
18143 			    (object->internal && !object->true_share &&
18144 			    !src_entry->is_shared &&
18145 			    object->vo_size > entry_size)) {
18146 				bool is_writable;
18147 
18148 				VME_OBJECT_SHADOW(src_entry, entry_size,
18149 				    vm_map_always_shadow(map));
18150 				assert(src_entry->use_pmap);
18151 
18152 				is_writable = false;
18153 				if (src_entry->protection & VM_PROT_WRITE) {
18154 					is_writable = true;
18155 #if __arm64e__
18156 				} else if (src_entry->used_for_tpro) {
18157 					is_writable = true;
18158 #endif /* __arm64e__ */
18159 				}
18160 				if (!src_entry->needs_copy && is_writable) {
18161 					vm_prot_t prot;
18162 
18163 					if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18164 						panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18165 						    __FUNCTION__,
18166 						    map, map->pmap,
18167 						    src_entry,
18168 						    (uint64_t)src_entry->vme_start,
18169 						    (uint64_t)src_entry->vme_end,
18170 						    src_entry->protection);
18171 					}
18172 
18173 					prot = src_entry->protection & ~VM_PROT_WRITE;
18174 
18175 					if (override_nx(map,
18176 					    VME_ALIAS(src_entry))
18177 					    && prot) {
18178 						prot |= VM_PROT_EXECUTE;
18179 					}
18180 
18181 					if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18182 						panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18183 						    __FUNCTION__,
18184 						    map, map->pmap,
18185 						    src_entry,
18186 						    (uint64_t)src_entry->vme_start,
18187 						    (uint64_t)src_entry->vme_end,
18188 						    prot);
18189 					}
18190 
18191 					if (map->mapped_in_other_pmaps) {
18192 						vm_object_pmap_protect(
18193 							VME_OBJECT(src_entry),
18194 							VME_OFFSET(src_entry),
18195 							entry_size,
18196 							PMAP_NULL,
18197 							PAGE_SIZE,
18198 							src_entry->vme_start,
18199 							prot);
18200 #if MACH_ASSERT
18201 					} else if (__improbable(map->pmap == PMAP_NULL)) {
18202 						/*
18203 						 * Some VM tests (in vm_tests.c)
18204 						 * sometimes want to use a VM
18205 						 * map without a pmap.
18206 						 * Otherwise, this should never
18207 						 * happen.
18208 						 */
18209 						if (!thread_get_test_option(test_option_vm_map_allow_null_pmap)) {
18210 							panic("null pmap");
18211 						}
18212 #endif /* MACH_ASSERT */
18213 					} else {
18214 						pmap_protect(vm_map_pmap(map),
18215 						    src_entry->vme_start,
18216 						    src_entry->vme_end,
18217 						    prot);
18218 					}
18219 				}
18220 
18221 				object = VME_OBJECT(src_entry);
18222 				src_entry->needs_copy = FALSE;
18223 			}
18224 
18225 
18226 			vm_object_lock(object);
18227 			vm_object_reference_locked(object); /* object ref. for new entry */
18228 			assert(!src_entry->needs_copy);
18229 			if (object->copy_strategy ==
18230 			    MEMORY_OBJECT_COPY_SYMMETRIC) {
18231 				/*
18232 				 * If we want to share this object (copy==0),
18233 				 * it needs to be COPY_DELAY.
18234 				 * If we want to copy this object (copy==1),
18235 				 * we can't just set "needs_copy" on our side
18236 				 * and expect the other side to do the same
18237 				 * (symmetrically), so we can't let the object
18238 				 * stay COPY_SYMMETRIC.
18239 				 * So we always switch from COPY_SYMMETRIC to
18240 				 * COPY_DELAY.
18241 				 */
18242 				object->copy_strategy =
18243 				    MEMORY_OBJECT_COPY_DELAY;
18244 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
18245 			}
18246 			vm_object_unlock(object);
18247 		}
18248 
18249 		offset = (VME_OFFSET(src_entry) +
18250 		    (src_start - src_entry->vme_start));
18251 
18252 copy_src_entry:
18253 
18254 
18255 		new_entry = _vm_map_entry_create(map_header);
18256 		vm_map_entry_copy(map, new_entry, src_entry);
18257 		if (new_entry->is_sub_map) {
18258 			/* clr address space specifics */
18259 			new_entry->use_pmap = FALSE;
18260 		} else if (copy) {
18261 			/*
18262 			 * We're dealing with a copy-on-write operation,
18263 			 * so the resulting mapping should not inherit the
18264 			 * original mapping's accounting settings.
18265 			 * "use_pmap" should be reset to its default (TRUE)
18266 			 * so that the new mapping gets accounted for in
18267 			 * the task's memory footprint.
18268 			 */
18269 			new_entry->use_pmap = TRUE;
18270 		}
18271 		/* "iokit_acct" was cleared in vm_map_entry_copy() */
18272 		assert(!new_entry->iokit_acct);
18273 
18274 		new_entry->map_aligned = FALSE;
18275 
18276 		new_entry->vme_start = map_address;
18277 		new_entry->vme_end = map_address + tmp_size;
18278 		assert(new_entry->vme_start < new_entry->vme_end);
18279 		if (copy && vmk_flags.vmkf_remap_prot_copy) {
18280 			/* security: keep "permanent" and "csm_associated" */
18281 			new_entry->vme_permanent = src_entry->vme_permanent;
18282 			new_entry->csm_associated = src_entry->csm_associated;
18283 			/*
18284 			 * Remapping for vm_map_protect(VM_PROT_COPY)
18285 			 * to convert a read-only mapping into a
18286 			 * copy-on-write version of itself but
18287 			 * with write access:
18288 			 * keep the original inheritance but let's not
18289 			 * add VM_PROT_WRITE to the max protection yet
18290 			 * since we want to do more security checks against
18291 			 * the target map.
18292 			 */
18293 			new_entry->inheritance = src_entry->inheritance;
18294 			new_entry->protection &= max_prot_for_prot_copy;
18295 
18296 #ifdef __arm64e__
18297 			/*
18298 			 * Remapping for vm_map_protect(VM_PROT_COPY) to remap a TPRO
18299 			 * region to be explicitly writable without TPRO is only permitted
18300 			 * if TPRO enforcement has been overridden.
18301 			 *
18302 			 * In this case we ensure any entries reset the TPRO state
18303 			 * and we permit the region to be downgraded from permanent.
18304 			 */
18305 			if (new_entry->used_for_tpro) {
18306 				if (vmk_flags.vmkf_tpro_enforcement_override) {
18307 					new_entry->used_for_tpro = FALSE;
18308 					new_entry->vme_permanent = FALSE;
18309 				} else {
18310 					result = KERN_PROTECTION_FAILURE;
18311 					vm_object_deallocate(object);
18312 					vm_map_entry_dispose(new_entry);
18313 					new_entry = VM_MAP_ENTRY_NULL;
18314 					break;
18315 				}
18316 			}
18317 #endif
18318 		} else {
18319 			new_entry->inheritance = inheritance;
18320 			if (!vm_remap_legacy) {
18321 				new_entry->protection = *cur_protection;
18322 				new_entry->max_protection = *max_protection;
18323 			}
18324 		}
18325 
18326 		VME_OFFSET_SET(new_entry, offset);
18327 
18328 		/*
18329 		 * The new region has to be copied now if required.
18330 		 */
18331 RestartCopy:
18332 		if (!copy) {
18333 			if (src_entry->used_for_jit == TRUE) {
18334 				if (same_map) {
18335 				} else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
18336 					/*
18337 					 * Cannot allow an entry describing a JIT
18338 					 * region to be shared across address spaces.
18339 					 */
18340 					result = KERN_INVALID_ARGUMENT;
18341 					vm_object_deallocate(object);
18342 					vm_map_entry_dispose(new_entry);
18343 					new_entry = VM_MAP_ENTRY_NULL;
18344 					break;
18345 				}
18346 			}
18347 
18348 			if (!src_entry->is_sub_map &&
18349 			    VME_OBJECT(src_entry) == VM_OBJECT_NULL) {
18350 				/* no accessible memory; nothing to share */
18351 				assert(src_entry->protection == VM_PROT_NONE);
18352 				assert(src_entry->max_protection == VM_PROT_NONE);
18353 				src_entry->is_shared = FALSE;
18354 			} else {
18355 				src_entry->is_shared = TRUE;
18356 			}
18357 			if (!new_entry->is_sub_map &&
18358 			    VME_OBJECT(new_entry) == VM_OBJECT_NULL) {
18359 				/* no accessible memory; nothing to share */
18360 				assert(new_entry->protection == VM_PROT_NONE);
18361 				assert(new_entry->max_protection == VM_PROT_NONE);
18362 				new_entry->is_shared = FALSE;
18363 			} else {
18364 				new_entry->is_shared = TRUE;
18365 			}
18366 			if (!(new_entry->is_sub_map)) {
18367 				new_entry->needs_copy = FALSE;
18368 			}
18369 		} else if (src_entry->is_sub_map) {
18370 			/* make this a COW sub_map if not already */
18371 			assert(new_entry->wired_count == 0);
18372 			new_entry->needs_copy = TRUE;
18373 			object = VM_OBJECT_NULL;
18374 		} else if (src_entry->wired_count == 0 &&
18375 		    !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
18376 		    vm_object_copy_quickly(VME_OBJECT(new_entry),
18377 		    VME_OFFSET(new_entry),
18378 		    (new_entry->vme_end -
18379 		    new_entry->vme_start),
18380 		    &src_needs_copy,
18381 		    &new_entry_needs_copy)) {
18382 			new_entry->needs_copy = new_entry_needs_copy;
18383 			new_entry->is_shared = FALSE;
18384 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18385 
18386 			/*
18387 			 * Handle copy_on_write semantics.
18388 			 */
18389 			if (src_needs_copy && !src_entry->needs_copy) {
18390 				vm_prot_t prot;
18391 
18392 				if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18393 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18394 					    __FUNCTION__,
18395 					    map, map->pmap, src_entry,
18396 					    (uint64_t)src_entry->vme_start,
18397 					    (uint64_t)src_entry->vme_end,
18398 					    src_entry->protection);
18399 				}
18400 
18401 				prot = src_entry->protection & ~VM_PROT_WRITE;
18402 
18403 				if (override_nx(map,
18404 				    VME_ALIAS(src_entry))
18405 				    && prot) {
18406 					prot |= VM_PROT_EXECUTE;
18407 				}
18408 
18409 				if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18410 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18411 					    __FUNCTION__,
18412 					    map, map->pmap, src_entry,
18413 					    (uint64_t)src_entry->vme_start,
18414 					    (uint64_t)src_entry->vme_end,
18415 					    prot);
18416 				}
18417 
18418 				vm_object_pmap_protect(object,
18419 				    offset,
18420 				    entry_size,
18421 				    ((src_entry->is_shared
18422 				    || map->mapped_in_other_pmaps) ?
18423 				    PMAP_NULL : map->pmap),
18424 				    VM_MAP_PAGE_SIZE(map),
18425 				    src_entry->vme_start,
18426 				    prot);
18427 
18428 				assert(src_entry->wired_count == 0);
18429 				src_entry->needs_copy = TRUE;
18430 			}
18431 			/*
18432 			 * Throw away the old object reference of the new entry.
18433 			 */
18434 			vm_object_deallocate(object);
18435 		} else {
18436 			new_entry->is_shared = FALSE;
18437 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18438 
18439 			src_entry_was_wired = (src_entry->wired_count > 0);
18440 			saved_src_entry = src_entry;
18441 			src_entry = VM_MAP_ENTRY_NULL;
18442 
18443 			/*
18444 			 * The map can be safely unlocked since we
18445 			 * already hold a reference on the object.
18446 			 *
18447 			 * Record the timestamp of the map for later
18448 			 * verification, and unlock the map.
18449 			 */
18450 			version.main_timestamp = map->timestamp;
18451 			vm_map_unlock(map);     /* Increments timestamp once! */
18452 
18453 			/*
18454 			 * Perform the copy.
18455 			 */
18456 			if (src_entry_was_wired > 0 ||
18457 			    (debug4k_no_cow_copyin &&
18458 			    VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
18459 				vm_object_lock(object);
18460 				result = vm_object_copy_slowly(
18461 					object,
18462 					offset,
18463 					(new_entry->vme_end -
18464 					new_entry->vme_start),
18465 					THREAD_UNINT,
18466 					&new_copy_object);
18467 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18468 				saved_used_for_jit = new_entry->used_for_jit;
18469 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18470 				new_entry->used_for_jit = saved_used_for_jit;
18471 				VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
18472 				new_entry->needs_copy = FALSE;
18473 			} else {
18474 				vm_object_offset_t new_offset;
18475 
18476 				new_offset = VME_OFFSET(new_entry);
18477 				result = vm_object_copy_strategically(
18478 					object,
18479 					offset,
18480 					(new_entry->vme_end -
18481 					new_entry->vme_start),
18482 					false, /* forking */
18483 					&new_copy_object,
18484 					&new_offset,
18485 					&new_entry_needs_copy);
18486 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18487 				saved_used_for_jit = new_entry->used_for_jit;
18488 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18489 				new_entry->used_for_jit = saved_used_for_jit;
18490 				if (new_offset != VME_OFFSET(new_entry)) {
18491 					VME_OFFSET_SET(new_entry, new_offset);
18492 				}
18493 
18494 				new_entry->needs_copy = new_entry_needs_copy;
18495 			}
18496 
18497 			/*
18498 			 * Throw away the old object reference of the new entry.
18499 			 */
18500 			vm_object_deallocate(object);
18501 
18502 			if (result != KERN_SUCCESS &&
18503 			    result != KERN_MEMORY_RESTART_COPY) {
18504 				vm_map_entry_dispose(new_entry);
18505 				vm_map_lock(map);
18506 				break;
18507 			}
18508 
18509 			/*
18510 			 * Verify that the map has not substantially
18511 			 * changed while the copy was being made.
18512 			 */
18513 
18514 			vm_map_lock(map);
18515 			if (version.main_timestamp + 1 != map->timestamp) {
18516 				/*
18517 				 * Simple version comparison failed.
18518 				 *
18519 				 * Retry the lookup and verify that the
18520 				 * same object/offset are still present.
18521 				 */
18522 				saved_src_entry = VM_MAP_ENTRY_NULL;
18523 				vm_object_deallocate(VME_OBJECT(new_entry));
18524 				vm_map_entry_dispose(new_entry);
18525 				if (result == KERN_MEMORY_RESTART_COPY) {
18526 					result = KERN_SUCCESS;
18527 				}
18528 				continue;
18529 			}
18530 			/* map hasn't changed: src_entry is still valid */
18531 			src_entry = saved_src_entry;
18532 			saved_src_entry = VM_MAP_ENTRY_NULL;
18533 
18534 			if (result == KERN_MEMORY_RESTART_COPY) {
18535 				vm_object_reference(object);
18536 				goto RestartCopy;
18537 			}
18538 		}
18539 
18540 		_vm_map_store_entry_link(map_header,
18541 		    map_header->links.prev, new_entry);
18542 
18543 		/* protections for submap mapping are irrelevant here */
18544 		if (vm_remap_legacy && !src_entry->is_sub_map) {
18545 			*cur_protection &= src_entry->protection;
18546 			*max_protection &= src_entry->max_protection;
18547 		}
18548 
18549 		map_address += tmp_size;
18550 		mapped_size += tmp_size;
18551 		src_start += tmp_size;
18552 
18553 		if (vmk_flags.vmkf_copy_single_object) {
18554 			if (mapped_size != size) {
18555 				DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
18556 				    map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
18557 				if (src_entry->vme_next != vm_map_to_entry(map) &&
18558 				    src_entry->vme_next->vme_object_value ==
18559 				    src_entry->vme_object_value) {
18560 					/* XXX TODO4K */
18561 					DEBUG4K_ERROR("could have extended copy to next entry...\n");
18562 				}
18563 			}
18564 			break;
18565 		}
18566 	} /* end while */
18567 
18568 	vm_map_unlock(map);
18569 	if (result != KERN_SUCCESS) {
18570 		/*
18571 		 * Free all allocated elements.
18572 		 */
18573 		for (src_entry = map_header->links.next;
18574 		    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18575 		    src_entry = new_entry) {
18576 			new_entry = src_entry->vme_next;
18577 			_vm_map_store_entry_unlink(map_header, src_entry, false);
18578 			if (src_entry->is_sub_map) {
18579 				vm_map_deallocate(VME_SUBMAP(src_entry));
18580 			} else {
18581 				vm_object_deallocate(VME_OBJECT(src_entry));
18582 			}
18583 			vm_map_entry_dispose(src_entry);
18584 		}
18585 	}
18586 	return result;
18587 }
18588 
18589 bool
vm_map_is_exotic(vm_map_t map)18590 vm_map_is_exotic(
18591 	vm_map_t map)
18592 {
18593 	return VM_MAP_IS_EXOTIC(map);
18594 }
18595 
18596 bool
vm_map_is_alien(vm_map_t map)18597 vm_map_is_alien(
18598 	vm_map_t map)
18599 {
18600 	return VM_MAP_IS_ALIEN(map);
18601 }
18602 
18603 #if XNU_TARGET_OS_OSX
18604 void
vm_map_mark_alien(vm_map_t map)18605 vm_map_mark_alien(
18606 	vm_map_t map)
18607 {
18608 	vm_map_lock(map);
18609 	map->is_alien = true;
18610 	vm_map_unlock(map);
18611 }
18612 
18613 void
vm_map_single_jit(vm_map_t map)18614 vm_map_single_jit(
18615 	vm_map_t map)
18616 {
18617 	vm_map_lock(map);
18618 	map->single_jit = true;
18619 	vm_map_unlock(map);
18620 }
18621 #endif /* XNU_TARGET_OS_OSX */
18622 
18623 
18624 /*
18625  * Callers of this function must call vm_map_copy_require on
18626  * previously created vm_map_copy_t or pass a newly created
18627  * one to ensure that it hasn't been forged.
18628  */
18629 static kern_return_t
vm_map_copy_to_physcopy(vm_map_copy_t copy_map,vm_map_t target_map)18630 vm_map_copy_to_physcopy(
18631 	vm_map_copy_t   copy_map,
18632 	vm_map_t        target_map)
18633 {
18634 	vm_map_size_t           size;
18635 	vm_map_entry_t          entry;
18636 	vm_map_entry_t          new_entry;
18637 	vm_object_t             new_object;
18638 	unsigned int            pmap_flags;
18639 	pmap_t                  new_pmap;
18640 	vm_map_t                new_map;
18641 	vm_map_address_t        src_start, src_end, src_cur;
18642 	vm_map_address_t        dst_start, dst_end, dst_cur;
18643 	kern_return_t           kr;
18644 	void                    *kbuf;
18645 
18646 	/*
18647 	 * Perform the equivalent of vm_allocate() and memcpy().
18648 	 * Replace the mappings in "copy_map" with the newly allocated mapping.
18649 	 */
18650 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18651 
18652 	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
18653 
18654 	/* create a new pmap to map "copy_map" */
18655 	pmap_flags = 0;
18656 	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18657 #if PMAP_CREATE_FORCE_4K_PAGES
18658 	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18659 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
18660 	pmap_flags |= PMAP_CREATE_64BIT;
18661 	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18662 	if (new_pmap == NULL) {
18663 		return KERN_RESOURCE_SHORTAGE;
18664 	}
18665 
18666 	/* allocate new VM object */
18667 	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18668 	new_object = vm_object_allocate(size, VM_MAP_SERIAL_NONE);
18669 	assert(new_object);
18670 
18671 	/* allocate new VM map entry */
18672 	new_entry = vm_map_copy_entry_create(copy_map);
18673 	assert(new_entry);
18674 
18675 	/* finish initializing new VM map entry */
18676 	new_entry->protection = VM_PROT_DEFAULT;
18677 	new_entry->max_protection = VM_PROT_DEFAULT;
18678 	new_entry->use_pmap = TRUE;
18679 
18680 	/* make new VM map entry point to new VM object */
18681 	new_entry->vme_start = 0;
18682 	new_entry->vme_end = size;
18683 	VME_OBJECT_SET(new_entry, new_object, false, 0);
18684 	VME_OFFSET_SET(new_entry, 0);
18685 
18686 	/* create a new pageable VM map to map "copy_map" */
18687 	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18688 	    VM_MAP_CREATE_PAGEABLE);
18689 	assert(new_map);
18690 	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18691 
18692 	/* map "copy_map" in the new VM map */
18693 	src_start = 0;
18694 	kr = vm_map_copyout_internal(
18695 		new_map,
18696 		&src_start,
18697 		copy_map,
18698 		copy_map->size,
18699 		FALSE, /* consume_on_success */
18700 		VM_PROT_DEFAULT,
18701 		VM_PROT_DEFAULT,
18702 		VM_INHERIT_DEFAULT);
18703 	assert(kr == KERN_SUCCESS);
18704 	src_end = src_start + copy_map->size;
18705 
18706 	/* map "new_object" in the new VM map */
18707 	vm_object_reference(new_object);
18708 	dst_start = 0;
18709 	kr = vm_map_enter(new_map,
18710 	    &dst_start,
18711 	    size,
18712 	    0,               /* mask */
18713 	    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18714 	    new_object,
18715 	    0,               /* offset */
18716 	    FALSE,               /* needs copy */
18717 	    VM_PROT_DEFAULT,
18718 	    VM_PROT_DEFAULT,
18719 	    VM_INHERIT_DEFAULT);
18720 	assert(kr == KERN_SUCCESS);
18721 	dst_end = dst_start + size;
18722 
18723 	/* get a kernel buffer */
18724 	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18725 
18726 	/* physically copy "copy_map" mappings to new VM object */
18727 	for (src_cur = src_start, dst_cur = dst_start;
18728 	    src_cur < src_end;
18729 	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18730 		vm_size_t bytes;
18731 
18732 		bytes = PAGE_SIZE;
18733 		if (src_cur + PAGE_SIZE > src_end) {
18734 			/* partial copy for last page */
18735 			bytes = src_end - src_cur;
18736 			assert(bytes > 0 && bytes < PAGE_SIZE);
18737 			/* rest of dst page should be zero-filled */
18738 		}
18739 		/* get bytes from src mapping */
18740 		kr = copyinmap(new_map, src_cur, kbuf, bytes);
18741 		if (kr != KERN_SUCCESS) {
18742 			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18743 		}
18744 		/* put bytes in dst mapping */
18745 		assert(dst_cur < dst_end);
18746 		assert(dst_cur + bytes <= dst_end);
18747 		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18748 		if (kr != KERN_SUCCESS) {
18749 			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18750 		}
18751 	}
18752 
18753 	/* free kernel buffer */
18754 	kfree_data(kbuf, PAGE_SIZE);
18755 
18756 	/* destroy new map */
18757 	vm_map_destroy(new_map);
18758 	new_map = VM_MAP_NULL;
18759 
18760 	/* dispose of the old map entries in "copy_map" */
18761 	while (vm_map_copy_first_entry(copy_map) !=
18762 	    vm_map_copy_to_entry(copy_map)) {
18763 		entry = vm_map_copy_first_entry(copy_map);
18764 		vm_map_copy_entry_unlink(copy_map, entry);
18765 		if (entry->is_sub_map) {
18766 			vm_map_deallocate(VME_SUBMAP(entry));
18767 		} else {
18768 			vm_object_deallocate(VME_OBJECT(entry));
18769 		}
18770 		vm_map_copy_entry_dispose(entry);
18771 	}
18772 
18773 	/* change "copy_map"'s page_size to match "target_map" */
18774 	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18775 	copy_map->offset = 0;
18776 	copy_map->size = size;
18777 
18778 	/* insert new map entry in "copy_map" */
18779 	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18780 	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18781 
18782 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18783 	return KERN_SUCCESS;
18784 }
18785 
18786 void
18787 vm_map_copy_adjust_get_target_copy_map(
18788 	vm_map_copy_t   copy_map,
18789 	vm_map_copy_t   *target_copy_map_p);
18790 void
vm_map_copy_adjust_get_target_copy_map(vm_map_copy_t copy_map,vm_map_copy_t * target_copy_map_p)18791 vm_map_copy_adjust_get_target_copy_map(
18792 	vm_map_copy_t   copy_map,
18793 	vm_map_copy_t   *target_copy_map_p)
18794 {
18795 	vm_map_copy_t   target_copy_map;
18796 	vm_map_entry_t  entry, target_entry;
18797 
18798 	if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18799 		/* the caller already has a "target_copy_map": use it */
18800 		return;
18801 	}
18802 
18803 	/* the caller wants us to create a new copy of "copy_map" */
18804 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18805 	target_copy_map = vm_map_copy_allocate(copy_map->type);
18806 	target_copy_map->offset = copy_map->offset;
18807 	target_copy_map->size = copy_map->size;
18808 	target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18809 	for (entry = vm_map_copy_first_entry(copy_map);
18810 	    entry != vm_map_copy_to_entry(copy_map);
18811 	    entry = entry->vme_next) {
18812 		target_entry = vm_map_copy_entry_create(target_copy_map);
18813 		vm_map_entry_copy_full(target_entry, entry);
18814 		if (target_entry->is_sub_map) {
18815 			vm_map_reference(VME_SUBMAP(target_entry));
18816 		} else {
18817 			vm_object_reference(VME_OBJECT(target_entry));
18818 		}
18819 		vm_map_copy_entry_link(
18820 			target_copy_map,
18821 			vm_map_copy_last_entry(target_copy_map),
18822 			target_entry);
18823 	}
18824 	entry = VM_MAP_ENTRY_NULL;
18825 	*target_copy_map_p = target_copy_map;
18826 }
18827 
18828 /*
18829  * Callers of this function must call vm_map_copy_require on
18830  * previously created vm_map_copy_t or pass a newly created
18831  * one to ensure that it hasn't been forged.
18832  */
18833 static void
vm_map_copy_trim(vm_map_copy_t copy_map,uint16_t new_page_shift,vm_map_offset_t trim_start,vm_map_offset_t trim_end)18834 vm_map_copy_trim(
18835 	vm_map_copy_t   copy_map,
18836 	uint16_t        new_page_shift,
18837 	vm_map_offset_t trim_start,
18838 	vm_map_offset_t trim_end)
18839 {
18840 	uint16_t        copy_page_shift;
18841 	vm_map_entry_t  entry, next_entry;
18842 
18843 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18844 	assert(copy_map->cpy_hdr.nentries > 0);
18845 
18846 	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
18847 	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
18848 
18849 	/* use the new page_shift to do the clipping */
18850 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18851 	copy_map->cpy_hdr.page_shift = new_page_shift;
18852 
18853 	for (entry = vm_map_copy_first_entry(copy_map);
18854 	    entry != vm_map_copy_to_entry(copy_map);
18855 	    entry = next_entry) {
18856 		next_entry = entry->vme_next;
18857 		if (entry->vme_end <= trim_start) {
18858 			/* entry fully before trim range: skip */
18859 			continue;
18860 		}
18861 		if (entry->vme_start >= trim_end) {
18862 			/* entry fully after trim range: done */
18863 			break;
18864 		}
18865 		/* clip entry if needed */
18866 		vm_map_copy_clip_start(copy_map, entry, trim_start);
18867 		vm_map_copy_clip_end(copy_map, entry, trim_end);
18868 		/* dispose of entry */
18869 		copy_map->size -= entry->vme_end - entry->vme_start;
18870 		vm_map_copy_entry_unlink(copy_map, entry);
18871 		if (entry->is_sub_map) {
18872 			vm_map_deallocate(VME_SUBMAP(entry));
18873 		} else {
18874 			vm_object_deallocate(VME_OBJECT(entry));
18875 		}
18876 		vm_map_copy_entry_dispose(entry);
18877 		entry = VM_MAP_ENTRY_NULL;
18878 	}
18879 
18880 	/* restore copy_map's original page_shift */
18881 	copy_map->cpy_hdr.page_shift = copy_page_shift;
18882 }
18883 
18884 /*
18885  * Make any necessary adjustments to "copy_map" to allow it to be
18886  * mapped into "target_map".
18887  * If no changes were necessary, "target_copy_map" points to the
18888  * untouched "copy_map".
18889  * If changes are necessary, changes will be made to "target_copy_map".
18890  * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18891  * copy the original "copy_map" to it before applying the changes.
18892  * The caller should discard "target_copy_map" if it's not the same as
18893  * the original "copy_map".
18894  */
18895 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
18896 kern_return_t
vm_map_copy_adjust_to_target(vm_map_copy_t src_copy_map,vm_map_offset_ut offset_u,vm_map_size_ut size_u,vm_map_t target_map,boolean_t copy,vm_map_copy_t * target_copy_map_p,vm_map_offset_t * overmap_start_p,vm_map_offset_t * overmap_end_p,vm_map_offset_t * trimmed_start_p)18897 vm_map_copy_adjust_to_target(
18898 	vm_map_copy_t           src_copy_map,
18899 	vm_map_offset_ut        offset_u,
18900 	vm_map_size_ut          size_u,
18901 	vm_map_t                target_map,
18902 	boolean_t               copy,
18903 	vm_map_copy_t           *target_copy_map_p,
18904 	vm_map_offset_t         *overmap_start_p,
18905 	vm_map_offset_t         *overmap_end_p,
18906 	vm_map_offset_t         *trimmed_start_p)
18907 {
18908 	vm_map_copy_t           copy_map, target_copy_map;
18909 	vm_map_size_t           target_size;
18910 	vm_map_size_t           src_copy_map_size;
18911 	vm_map_size_t           overmap_start, overmap_end;
18912 	int                     misalignments;
18913 	vm_map_entry_t          entry, target_entry;
18914 	vm_map_offset_t         addr_adjustment;
18915 	vm_map_offset_t         new_start, new_end;
18916 	int                     copy_page_mask, target_page_mask;
18917 	uint16_t                copy_page_shift, target_page_shift;
18918 	vm_map_offset_t         trimmed_end;
18919 	vm_map_size_t           map_size;
18920 	kern_return_t           kr;
18921 
18922 	/*
18923 	 * Sanitize any input parameters that are addr/size/prot/inherit
18924 	 */
18925 	kr = vm_map_copy_addr_size_sanitize(
18926 		target_map,
18927 		offset_u,
18928 		size_u,
18929 		VM_SANITIZE_CALLER_MACH_MEMORY_ENTRY_MAP_SIZE,
18930 		&new_start,
18931 		&new_end,
18932 		&map_size);
18933 	if (__improbable(kr != KERN_SUCCESS)) {
18934 		return vm_sanitize_get_kr(kr);
18935 	}
18936 
18937 	/*
18938 	 * Assert that the vm_map_copy is coming from the right
18939 	 * zone and hasn't been forged
18940 	 */
18941 	vm_map_copy_require(src_copy_map);
18942 	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18943 
18944 	/*
18945 	 * Start working with "src_copy_map" but we'll switch
18946 	 * to "target_copy_map" as soon as we start making adjustments.
18947 	 */
18948 	copy_map = src_copy_map;
18949 	src_copy_map_size = src_copy_map->size;
18950 
18951 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18952 	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
18953 	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18954 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
18955 
18956 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), *target_copy_map_p);
18957 
18958 	target_copy_map = *target_copy_map_p;
18959 	if (target_copy_map != VM_MAP_COPY_NULL) {
18960 		vm_map_copy_require(target_copy_map);
18961 	}
18962 
18963 	if (new_end > copy_map->size) {
18964 		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u));
18965 		return KERN_INVALID_ARGUMENT;
18966 	}
18967 
18968 	/* trim the end */
18969 	trimmed_end = 0;
18970 	new_end = VM_MAP_ROUND_PAGE(new_end, target_page_mask);
18971 	if (new_end < copy_map->size) {
18972 		trimmed_end = src_copy_map_size - new_end;
18973 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
18974 		/* get "target_copy_map" if needed and adjust it */
18975 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18976 		    &target_copy_map);
18977 		copy_map = target_copy_map;
18978 		vm_map_copy_trim(target_copy_map, target_page_shift,
18979 		    new_end, copy_map->size);
18980 	}
18981 
18982 	/* trim the start */
18983 	new_start = VM_MAP_TRUNC_PAGE(new_start, target_page_mask);
18984 	if (new_start != 0) {
18985 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)0, (uint64_t)new_start);
18986 		/* get "target_copy_map" if needed and adjust it */
18987 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18988 		    &target_copy_map);
18989 		copy_map = target_copy_map;
18990 		vm_map_copy_trim(target_copy_map, target_page_shift,
18991 		    0, new_start);
18992 	}
18993 	*trimmed_start_p = new_start;
18994 
18995 	/* target_size starts with what's left after trimming */
18996 	target_size = copy_map->size;
18997 	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
18998 	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
18999 	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
19000 	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
19001 
19002 	/* check for misalignments but don't adjust yet */
19003 	misalignments = 0;
19004 	overmap_start = 0;
19005 	overmap_end = 0;
19006 	if (copy_page_shift < target_page_shift) {
19007 		/*
19008 		 * Remapping from 4K to 16K: check the VM object alignments
19009 		 * throughout the range.
19010 		 * If the start and end of the range are mis-aligned, we can
19011 		 * over-map to re-align, and adjust the "overmap" start/end
19012 		 * and "target_size" of the range accordingly.
19013 		 * If there is any mis-alignment within the range:
19014 		 *     if "copy":
19015 		 *         we can do immediate-copy instead of copy-on-write,
19016 		 *     else:
19017 		 *         no way to remap and share; fail.
19018 		 */
19019 		for (entry = vm_map_copy_first_entry(copy_map);
19020 		    entry != vm_map_copy_to_entry(copy_map);
19021 		    entry = entry->vme_next) {
19022 			vm_object_offset_t object_offset_start, object_offset_end;
19023 
19024 			object_offset_start = VME_OFFSET(entry);
19025 			object_offset_end = object_offset_start;
19026 			object_offset_end += entry->vme_end - entry->vme_start;
19027 			if (object_offset_start & target_page_mask) {
19028 				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
19029 					overmap_start++;
19030 				} else {
19031 					misalignments++;
19032 				}
19033 			}
19034 			if (object_offset_end & target_page_mask) {
19035 				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
19036 					overmap_end++;
19037 				} else {
19038 					misalignments++;
19039 				}
19040 			}
19041 		}
19042 	}
19043 	entry = VM_MAP_ENTRY_NULL;
19044 
19045 	/* decide how to deal with misalignments */
19046 	assert(overmap_start <= 1);
19047 	assert(overmap_end <= 1);
19048 	if (!overmap_start && !overmap_end && !misalignments) {
19049 		/* copy_map is properly aligned for target_map ... */
19050 		if (*trimmed_start_p) {
19051 			/* ... but we trimmed it, so still need to adjust */
19052 		} else {
19053 			/* ... and we didn't trim anything: we're done */
19054 			if (target_copy_map == VM_MAP_COPY_NULL) {
19055 				target_copy_map = copy_map;
19056 			}
19057 			*target_copy_map_p = target_copy_map;
19058 			*overmap_start_p = 0;
19059 			*overmap_end_p = 0;
19060 			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19061 			return KERN_SUCCESS;
19062 		}
19063 	} else if (misalignments && !copy) {
19064 		/* can't "share" if misaligned */
19065 		DEBUG4K_ADJUST("unsupported sharing\n");
19066 #if MACH_ASSERT
19067 		if (debug4k_panic_on_misaligned_sharing) {
19068 			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
19069 		}
19070 #endif /* MACH_ASSERT */
19071 		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
19072 		return KERN_NOT_SUPPORTED;
19073 	} else {
19074 		/* can't virtual-copy if misaligned (but can physical-copy) */
19075 		DEBUG4K_ADJUST("mis-aligned copying\n");
19076 	}
19077 
19078 	/* get a "target_copy_map" if needed and switch to it */
19079 	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
19080 	copy_map = target_copy_map;
19081 
19082 	if (misalignments && copy) {
19083 		vm_map_size_t target_copy_map_size;
19084 
19085 		/*
19086 		 * Can't do copy-on-write with misaligned mappings.
19087 		 * Replace the mappings with a physical copy of the original
19088 		 * mappings' contents.
19089 		 */
19090 		target_copy_map_size = target_copy_map->size;
19091 		kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
19092 		if (kr != KERN_SUCCESS) {
19093 			return kr;
19094 		}
19095 		*target_copy_map_p = target_copy_map;
19096 		*overmap_start_p = 0;
19097 		*overmap_end_p = target_copy_map->size - target_copy_map_size;
19098 		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19099 		return KERN_SUCCESS;
19100 	}
19101 
19102 	/* apply the adjustments */
19103 	misalignments = 0;
19104 	overmap_start = 0;
19105 	overmap_end = 0;
19106 	/* remove copy_map->offset, so that everything starts at offset 0 */
19107 	addr_adjustment = copy_map->offset;
19108 	/* also remove whatever we trimmed from the start */
19109 	addr_adjustment += *trimmed_start_p;
19110 	for (target_entry = vm_map_copy_first_entry(target_copy_map);
19111 	    target_entry != vm_map_copy_to_entry(target_copy_map);
19112 	    target_entry = target_entry->vme_next) {
19113 		vm_object_offset_t object_offset_start, object_offset_end;
19114 
19115 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19116 		object_offset_start = VME_OFFSET(target_entry);
19117 		if (object_offset_start & target_page_mask) {
19118 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19119 			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
19120 				/*
19121 				 * start of 1st entry is mis-aligned:
19122 				 * re-adjust by over-mapping.
19123 				 */
19124 				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
19125 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
19126 				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
19127 			} else {
19128 				misalignments++;
19129 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
19130 				assert(copy);
19131 			}
19132 		}
19133 
19134 		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
19135 			target_size += overmap_start;
19136 		} else {
19137 			target_entry->vme_start += overmap_start;
19138 		}
19139 		target_entry->vme_end += overmap_start;
19140 
19141 		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
19142 		if (object_offset_end & target_page_mask) {
19143 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19144 			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
19145 				/*
19146 				 * end of last entry is mis-aligned: re-adjust by over-mapping.
19147 				 */
19148 				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
19149 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
19150 				target_entry->vme_end += overmap_end;
19151 				target_size += overmap_end;
19152 			} else {
19153 				misalignments++;
19154 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
19155 				assert(copy);
19156 			}
19157 		}
19158 		target_entry->vme_start -= addr_adjustment;
19159 		target_entry->vme_end -= addr_adjustment;
19160 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19161 	}
19162 
19163 	target_copy_map->size = target_size;
19164 	target_copy_map->offset += overmap_start;
19165 	target_copy_map->offset -= addr_adjustment;
19166 	target_copy_map->cpy_hdr.page_shift = target_page_shift;
19167 
19168 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
19169 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
19170 	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
19171 	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
19172 
19173 	*target_copy_map_p = target_copy_map;
19174 	*overmap_start_p = overmap_start;
19175 	*overmap_end_p = overmap_end;
19176 
19177 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19178 	return KERN_SUCCESS;
19179 }
19180 
19181 kern_return_t
vm_map_range_physical_size(vm_map_t map,vm_map_address_t start,mach_vm_size_t size,mach_vm_size_t * phys_size)19182 vm_map_range_physical_size(
19183 	vm_map_t         map,
19184 	vm_map_address_t start,
19185 	mach_vm_size_t   size,
19186 	mach_vm_size_t * phys_size)
19187 {
19188 	kern_return_t   kr;
19189 	vm_map_copy_t   copy_map, target_copy_map;
19190 	vm_map_offset_t adjusted_start, adjusted_end;
19191 	vm_map_size_t   adjusted_size;
19192 	vm_prot_t       cur_prot, max_prot;
19193 	vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
19194 	vm_map_kernel_flags_t vmk_flags;
19195 
19196 	if (size == 0) {
19197 		DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
19198 		*phys_size = 0;
19199 		return KERN_SUCCESS;
19200 	}
19201 
19202 	adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
19203 	adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
19204 	if (__improbable(os_add_overflow(start, size, &end) ||
19205 	    adjusted_end <= adjusted_start)) {
19206 		/* wraparound */
19207 		printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
19208 		*phys_size = 0;
19209 		return KERN_INVALID_ARGUMENT;
19210 	}
19211 	if (__improbable(vm_map_range_overflows(map, start, size))) {
19212 		*phys_size = 0;
19213 		return KERN_INVALID_ADDRESS;
19214 	}
19215 	assert(adjusted_end > adjusted_start);
19216 	adjusted_size = adjusted_end - adjusted_start;
19217 	*phys_size = adjusted_size;
19218 	if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
19219 		return KERN_SUCCESS;
19220 	}
19221 	if (start == 0) {
19222 		adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
19223 		adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
19224 		if (__improbable(adjusted_end <= adjusted_start)) {
19225 			/* wraparound */
19226 			printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
19227 			*phys_size = 0;
19228 			return KERN_INVALID_ARGUMENT;
19229 		}
19230 		assert(adjusted_end > adjusted_start);
19231 		adjusted_size = adjusted_end - adjusted_start;
19232 		*phys_size = adjusted_size;
19233 		return KERN_SUCCESS;
19234 	}
19235 
19236 	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
19237 	vmk_flags.vmkf_copy_pageable = TRUE;
19238 	vmk_flags.vmkf_copy_same_map = TRUE;
19239 	assert(adjusted_size != 0);
19240 	cur_prot = VM_PROT_NONE; /* legacy mode */
19241 	max_prot = VM_PROT_NONE; /* legacy mode */
19242 	vmk_flags.vmkf_remap_legacy_mode = true;
19243 	kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
19244 	    FALSE /* copy */,
19245 	    &copy_map,
19246 	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
19247 	    vmk_flags);
19248 	if (kr != KERN_SUCCESS) {
19249 		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19250 		//assert(0);
19251 		*phys_size = 0;
19252 		return kr;
19253 	}
19254 	assert(copy_map != VM_MAP_COPY_NULL);
19255 	target_copy_map = copy_map;
19256 	DEBUG4K_ADJUST("adjusting...\n");
19257 	kr = vm_map_copy_adjust_to_target(
19258 		copy_map,
19259 		start - adjusted_start, /* offset */
19260 		size, /* size */
19261 		kernel_map,
19262 		FALSE,                          /* copy */
19263 		&target_copy_map,
19264 		&overmap_start,
19265 		&overmap_end,
19266 		&trimmed_start);
19267 	if (kr == KERN_SUCCESS) {
19268 		if (target_copy_map->size != *phys_size) {
19269 			DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
19270 		}
19271 		*phys_size = target_copy_map->size;
19272 	} else {
19273 		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19274 		//assert(0);
19275 		*phys_size = 0;
19276 	}
19277 	vm_map_copy_discard(copy_map);
19278 	copy_map = VM_MAP_COPY_NULL;
19279 
19280 	return kr;
19281 }
19282 
19283 static __attribute__((always_inline, warn_unused_result))
19284 kern_return_t
vm_map_remap_sanitize(vm_map_t src_map,vm_map_t target_map,vm_map_address_ut address_u,vm_map_size_ut size_u,vm_map_offset_ut mask_u,vm_map_offset_ut memory_address_u,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,vm_map_kernel_flags_t vmk_flags,vm_map_address_t * target_addr,vm_map_address_t * mask,vm_map_offset_t * memory_address,vm_map_offset_t * memory_end,vm_map_size_t * memory_size,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t * inheritance)19285 vm_map_remap_sanitize(
19286 	vm_map_t                src_map,
19287 	vm_map_t                target_map,
19288 	vm_map_address_ut       address_u,
19289 	vm_map_size_ut          size_u,
19290 	vm_map_offset_ut        mask_u,
19291 	vm_map_offset_ut        memory_address_u,
19292 	vm_prot_ut              cur_protection_u,
19293 	vm_prot_ut              max_protection_u,
19294 	vm_inherit_ut           inheritance_u,
19295 	vm_map_kernel_flags_t   vmk_flags,
19296 	vm_map_address_t       *target_addr,
19297 	vm_map_address_t       *mask,
19298 	vm_map_offset_t        *memory_address,
19299 	vm_map_offset_t        *memory_end,
19300 	vm_map_size_t          *memory_size,
19301 	vm_prot_t              *cur_protection,
19302 	vm_prot_t              *max_protection,
19303 	vm_inherit_t           *inheritance)
19304 {
19305 	kern_return_t           result;
19306 	vm_sanitize_flags_t     vm_sanitize_flags;
19307 
19308 	result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_VM_MAP_REMAP,
19309 	    inheritance);
19310 	if (__improbable(result != KERN_SUCCESS)) {
19311 		return result;
19312 	}
19313 
19314 	result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
19315 	    VM_SANITIZE_CALLER_VM_MAP_REMAP, target_map,
19316 	    cur_protection, max_protection);
19317 	if (__improbable(result != KERN_SUCCESS)) {
19318 		return result;
19319 	}
19320 
19321 	result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_VM_MAP_REMAP, mask);
19322 	if (__improbable(result != KERN_SUCCESS)) {
19323 		return result;
19324 	}
19325 
19326 	/*
19327 	 * If the user is requesting that we return the address of the
19328 	 * first byte of the data (rather than the base of the page),
19329 	 * then we use different rounding semantics: specifically,
19330 	 * we assume that (memory_address, size) describes a region
19331 	 * all of whose pages we must cover, rather than a base to be truncated
19332 	 * down and a size to be added to that base.  So we figure out
19333 	 * the highest page that the requested region includes and make
19334 	 * sure that the size will cover it.
19335 	 *
19336 	 * The key example we're worried about it is of the form:
19337 	 *
19338 	 *              memory_address = 0x1ff0, size = 0x20
19339 	 *
19340 	 * With the old semantics, we round down the memory_address to 0x1000
19341 	 * and round up the size to 0x1000, resulting in our covering *only*
19342 	 * page 0x1000.  With the new semantics, we'd realize that the region covers
19343 	 * 0x1ff0-0x2010, and compute a size of 0x2000.  Thus, we cover both page
19344 	 * 0x1000 and page 0x2000 in the region we remap.
19345 	 *
19346 	 * VM_SANITIZE_FLAGS_REALIGN_START asks for the old (broken) semantics.
19347 	 */
19348 	vm_sanitize_flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS;
19349 	if (!vmk_flags.vmf_return_data_addr) {
19350 		vm_sanitize_flags |= VM_SANITIZE_FLAGS_REALIGN_START;
19351 	}
19352 
19353 	result = vm_sanitize_addr_size(memory_address_u, size_u,
19354 	    VM_SANITIZE_CALLER_VM_MAP_REMAP, src_map,
19355 	    vm_sanitize_flags, memory_address, memory_end,
19356 	    memory_size);
19357 	if (__improbable(result != KERN_SUCCESS)) {
19358 		return result;
19359 	}
19360 
19361 	*target_addr = vm_sanitize_addr(target_map, address_u);
19362 	return KERN_SUCCESS;
19363 }
19364 
19365 /*
19366  *	Routine:	vm_remap
19367  *
19368  *			Map portion of a task's address space.
19369  *			Mapped region must not overlap more than
19370  *			one vm memory object. Protections and
19371  *			inheritance attributes remain the same
19372  *			as in the original task and are	out parameters.
19373  *			Source and Target task can be identical
19374  *			Other attributes are identical as for vm_map()
19375  */
19376 kern_return_t
vm_map_remap(vm_map_t target_map,vm_map_address_ut * address_u,vm_map_size_ut size_u,vm_map_offset_ut mask_u,vm_map_kernel_flags_t vmk_flags,vm_map_t src_map,vm_map_offset_ut memory_address_u,boolean_t copy,vm_prot_ut * cur_protection_u,vm_prot_ut * max_protection_u,vm_inherit_ut inheritance_u)19377 vm_map_remap(
19378 	vm_map_t                target_map,
19379 	vm_map_address_ut      *address_u,
19380 	vm_map_size_ut          size_u,
19381 	vm_map_offset_ut        mask_u,
19382 	vm_map_kernel_flags_t   vmk_flags,
19383 	vm_map_t                src_map,
19384 	vm_map_offset_ut        memory_address_u,
19385 	boolean_t               copy,
19386 	vm_prot_ut             *cur_protection_u, /* IN/OUT */
19387 	vm_prot_ut             *max_protection_u, /* IN/OUT */
19388 	vm_inherit_ut           inheritance_u)
19389 {
19390 	vm_map_address_t        target_addr, mask;
19391 	vm_map_size_t           target_size;
19392 	vm_map_offset_t         memory_address, memory_end;
19393 	vm_map_size_t           memory_size;
19394 	vm_prot_t               cur_protection, max_protection;
19395 	vm_inherit_t            inheritance;
19396 	kern_return_t           result;
19397 	vm_map_entry_t          insp_entry = VM_MAP_ENTRY_NULL;
19398 	vm_map_copy_t           copy_map;
19399 	vm_map_offset_t         offset_in_mapping;
19400 	vm_map_size_t           src_page_mask, target_page_mask;
19401 	vm_map_size_t           initial_size;
19402 	VM_MAP_ZAP_DECLARE(zap_list);
19403 
19404 	if (target_map == VM_MAP_NULL || src_map == VM_MAP_NULL) {
19405 		return KERN_INVALID_ARGUMENT;
19406 	}
19407 	src_page_mask    = VM_MAP_PAGE_MASK(src_map);
19408 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
19409 
19410 	if (src_page_mask != target_page_mask) {
19411 		if (copy) {
19412 			DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19413 		} else {
19414 			DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19415 		}
19416 	}
19417 
19418 	/*
19419 	 * Sanitize any input parameters that are addr/size/prot/inherit
19420 	 */
19421 	result = vm_map_remap_sanitize(src_map,
19422 	    target_map,
19423 	    *address_u,
19424 	    size_u,
19425 	    mask_u,
19426 	    memory_address_u,
19427 	    *cur_protection_u,
19428 	    *max_protection_u,
19429 	    inheritance_u,
19430 	    vmk_flags,
19431 	    &target_addr,
19432 	    &mask,
19433 	    &memory_address,
19434 	    &memory_end,
19435 	    &memory_size,
19436 	    &cur_protection,
19437 	    &max_protection,
19438 	    &inheritance);
19439 	if (__improbable(result != KERN_SUCCESS)) {
19440 		return vm_sanitize_get_kr(result);
19441 	}
19442 
19443 	if (vmk_flags.vmf_return_data_addr) {
19444 		/*
19445 		 * This is safe to unwrap now that the quantities
19446 		 * have been validated and rounded up normally.
19447 		 */
19448 		offset_in_mapping = vm_sanitize_offset_in_page(src_map,
19449 		    memory_address_u);
19450 		initial_size = VM_SANITIZE_UNSAFE_UNWRAP(size_u);
19451 	} else {
19452 		/*
19453 		 * IMPORTANT:
19454 		 * This legacy code path is broken: for the range mentioned
19455 		 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
19456 		 * two 4k pages, it yields [ memory_address = 0x1000,
19457 		 * size = 0x1000 ], which covers only the first 4k page.
19458 		 * BUT some code unfortunately depends on this bug, so we
19459 		 * can't fix it without breaking something.
19460 		 * New code should get automatically opted in the new
19461 		 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
19462 		 */
19463 		offset_in_mapping = 0;
19464 		initial_size = memory_size;
19465 	}
19466 
19467 	if (vmk_flags.vmf_resilient_media) {
19468 		/* must be copy-on-write to be "media resilient" */
19469 		if (!copy) {
19470 			return KERN_INVALID_ARGUMENT;
19471 		}
19472 	}
19473 
19474 	vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
19475 	vmk_flags.vmkf_copy_same_map = (src_map == target_map);
19476 
19477 	assert(memory_size != 0);
19478 	result = vm_map_copy_extract(src_map,
19479 	    memory_address,
19480 	    memory_size,
19481 	    copy, &copy_map,
19482 	    &cur_protection, /* IN/OUT */
19483 	    &max_protection, /* IN/OUT */
19484 	    inheritance,
19485 	    vmk_flags);
19486 	if (result != KERN_SUCCESS) {
19487 		return result;
19488 	}
19489 	assert(copy_map != VM_MAP_COPY_NULL);
19490 
19491 	/*
19492 	 * Handle the policy for vm map ranges
19493 	 *
19494 	 * If the maps differ, the target_map policy applies like for vm_map()
19495 	 * For same mapping remaps, we preserve the range.
19496 	 */
19497 	if (vmk_flags.vmkf_copy_same_map) {
19498 		vmk_flags.vmkf_range_id = copy_map->orig_range;
19499 	} else {
19500 		vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, memory_size);
19501 	}
19502 
19503 	target_size = memory_size;
19504 	if (src_page_mask != target_page_mask) {
19505 		vm_map_copy_t   target_copy_map;
19506 		vm_map_offset_t overmap_start = 0;
19507 		vm_map_offset_t overmap_end   = 0;
19508 		vm_map_offset_t trimmed_start = 0;
19509 
19510 		target_copy_map = copy_map; /* can modify "copy_map" itself */
19511 		DEBUG4K_ADJUST("adjusting...\n");
19512 		result = vm_map_copy_adjust_to_target(
19513 			copy_map,
19514 			offset_in_mapping, /* offset */
19515 			initial_size,
19516 			target_map,
19517 			copy,
19518 			&target_copy_map,
19519 			&overmap_start,
19520 			&overmap_end,
19521 			&trimmed_start);
19522 		if (result != KERN_SUCCESS) {
19523 			DEBUG4K_COPY("failed to adjust 0x%x\n", result);
19524 			vm_map_copy_discard(copy_map);
19525 			return result;
19526 		}
19527 		if (trimmed_start == 0) {
19528 			/* nothing trimmed: no adjustment needed */
19529 		} else if (trimmed_start >= offset_in_mapping) {
19530 			/* trimmed more than offset_in_mapping: nothing left */
19531 			assert(overmap_start == 0);
19532 			assert(overmap_end == 0);
19533 			offset_in_mapping = 0;
19534 		} else {
19535 			/* trimmed some of offset_in_mapping: adjust */
19536 			assert(overmap_start == 0);
19537 			assert(overmap_end == 0);
19538 			offset_in_mapping -= trimmed_start;
19539 		}
19540 		offset_in_mapping += overmap_start;
19541 		target_size = target_copy_map->size;
19542 	}
19543 
19544 	/*
19545 	 * Allocate/check a range of free virtual address
19546 	 * space for the target
19547 	 */
19548 	target_size = vm_map_round_page(target_size, target_page_mask);
19549 
19550 	if (target_size == 0) {
19551 		vm_map_copy_discard(copy_map);
19552 		return KERN_INVALID_ARGUMENT;
19553 	}
19554 
19555 	if (__improbable(!vm_map_is_map_size_valid(
19556 		    target_map, target_size, vmk_flags.vmkf_no_soft_limit))) {
19557 		vm_map_copy_discard(copy_map);
19558 		return KERN_NO_SPACE;
19559 	}
19560 
19561 	vm_map_lock(target_map);
19562 
19563 	if (!vmk_flags.vmf_fixed) {
19564 		result = vm_map_locate_space_anywhere(target_map, target_size,
19565 		    mask, vmk_flags, &target_addr, &insp_entry);
19566 	} else {
19567 		/*
19568 		 * vm_map_locate_space_fixed will reject overflowing
19569 		 * target_addr + target_size values
19570 		 */
19571 		result = vm_map_locate_space_fixed(target_map, target_addr,
19572 		    target_size, mask, vmk_flags, &insp_entry, &zap_list);
19573 
19574 		if (result == KERN_MEMORY_PRESENT) {
19575 			assert(!vmk_flags.vmkf_already);
19576 			insp_entry = VM_MAP_ENTRY_NULL;
19577 			result = KERN_NO_SPACE;
19578 		}
19579 	}
19580 
19581 	if (result == KERN_SUCCESS) {
19582 		while (vm_map_copy_first_entry(copy_map) !=
19583 		    vm_map_copy_to_entry(copy_map)) {
19584 			vm_map_entry_t entry = vm_map_copy_first_entry(copy_map);
19585 
19586 			vm_map_copy_entry_unlink(copy_map, entry);
19587 
19588 			if (vmk_flags.vmkf_remap_prot_copy) {
19589 				/*
19590 				 * This vm_map_remap() is for a
19591 				 * vm_protect(VM_PROT_COPY), so the caller
19592 				 * expects to be allowed to add write access
19593 				 * to this new mapping.  This is done by
19594 				 * adding VM_PROT_WRITE to each entry's
19595 				 * max_protection... unless some security
19596 				 * settings disallow it.
19597 				 */
19598 				bool allow_write = false;
19599 				if (entry->vme_permanent) {
19600 					/* immutable mapping... */
19601 					if ((entry->max_protection & VM_PROT_EXECUTE) &&
19602 					    developer_mode_state()) {
19603 						/*
19604 						 * ... but executable and
19605 						 * possibly being debugged,
19606 						 * so let's allow it to become
19607 						 * writable, for breakpoints
19608 						 * and dtrace probes, for
19609 						 * example.
19610 						 */
19611 						allow_write = true;
19612 					} else {
19613 						printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
19614 						    proc_selfpid(),
19615 						    (get_bsdtask_info(current_task())
19616 						    ? proc_name_address(get_bsdtask_info(current_task()))
19617 						    : "?"),
19618 						    (uint64_t)memory_address,
19619 						    (uint64_t)memory_size,
19620 						    entry->protection,
19621 						    entry->max_protection,
19622 						    developer_mode_state());
19623 						DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
19624 						    vm_map_entry_t, entry,
19625 						    vm_map_offset_t, entry->vme_start,
19626 						    vm_map_offset_t, entry->vme_end,
19627 						    vm_prot_t, entry->protection,
19628 						    vm_prot_t, entry->max_protection,
19629 						    int, VME_ALIAS(entry));
19630 					}
19631 				} else {
19632 					allow_write = true;
19633 				}
19634 
19635 				/*
19636 				 * VM_PROT_COPY: allow this mapping to become
19637 				 * writable, unless it was "permanent".
19638 				 */
19639 				if (allow_write) {
19640 					entry->max_protection |= VM_PROT_WRITE;
19641 				}
19642 			}
19643 			if (vmk_flags.vmf_resilient_codesign) {
19644 				/* no codesigning -> read-only access */
19645 				entry->max_protection = VM_PROT_READ;
19646 				entry->protection = VM_PROT_READ;
19647 				entry->vme_resilient_codesign = TRUE;
19648 			}
19649 			entry->vme_start += target_addr;
19650 			entry->vme_end += target_addr;
19651 			assert(!entry->map_aligned);
19652 			if (vmk_flags.vmf_resilient_media &&
19653 			    !entry->is_sub_map &&
19654 			    (VME_OBJECT(entry) == VM_OBJECT_NULL ||
19655 			    VME_OBJECT(entry)->internal)) {
19656 				entry->vme_resilient_media = TRUE;
19657 			}
19658 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
19659 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
19660 			assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
19661 			vm_map_store_entry_link(target_map, insp_entry, entry,
19662 			    vmk_flags);
19663 			insp_entry = entry;
19664 		}
19665 	}
19666 
19667 	if (vmk_flags.vmf_resilient_codesign) {
19668 		cur_protection = VM_PROT_READ;
19669 		max_protection = VM_PROT_READ;
19670 	}
19671 
19672 	if (result == KERN_SUCCESS) {
19673 		target_map->size += target_size;
19674 		SAVE_HINT_MAP_WRITE(target_map, insp_entry);
19675 	}
19676 	vm_map_unlock(target_map);
19677 
19678 	vm_map_zap_dispose(&zap_list);
19679 
19680 	if (result == KERN_SUCCESS && target_map->wiring_required) {
19681 		result = vm_map_wire_nested(target_map, target_addr,
19682 		    target_addr + target_size, cur_protection, VM_KERN_MEMORY_MLOCK,
19683 		    TRUE, PMAP_NULL, 0, NULL);
19684 	}
19685 
19686 	if (result == KERN_SUCCESS) {
19687 #if KASAN
19688 		if (target_map->pmap == kernel_pmap) {
19689 			kasan_notify_address(target_addr, target_size);
19690 		}
19691 #endif
19692 		/*
19693 		 * If requested, return the address of the data pointed to by the
19694 		 * request, rather than the base of the resulting page.
19695 		 */
19696 		if (vmk_flags.vmf_return_data_addr) {
19697 			target_addr += offset_in_mapping;
19698 		}
19699 
19700 		/*
19701 		 * Update OUT parameters.
19702 		 */
19703 		*address_u = vm_sanitize_wrap_addr(target_addr);
19704 
19705 		*cur_protection_u = vm_sanitize_wrap_prot(cur_protection);
19706 		*max_protection_u = vm_sanitize_wrap_prot(max_protection);
19707 	}
19708 
19709 	if (src_page_mask != target_page_mask) {
19710 		DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx  result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)target_size, copy, target_map, (uint64_t)target_addr, (uint64_t)offset_in_mapping, result);
19711 	}
19712 	vm_map_copy_discard(copy_map);
19713 	copy_map = VM_MAP_COPY_NULL;
19714 
19715 	return result;
19716 }
19717 
19718 /*
19719  *	vm_map_switch_to:
19720  *
19721  *	Set the address map for the current thread to the specified map.
19722  *  Returns a struct containing info about the previous map, which should be
19723  *  restored with `vm_map_switch_back`
19724  */
19725 
19726 vm_map_switch_context_t
vm_map_switch_to(vm_map_t map)19727 vm_map_switch_to(vm_map_t map)
19728 {
19729 	thread_t thread = current_thread();
19730 	vm_map_t oldmap = thread->map;
19731 
19732 	/*
19733 	 * Deactivate the current map and activate the requested map
19734 	 */
19735 	mp_disable_preemption();
19736 	PMAP_SWITCH_USER(thread, map, cpu_number());
19737 	mp_enable_preemption();
19738 
19739 	vm_map_lock(map);
19740 	task_t task = map->owning_task;
19741 	if (task) {
19742 		task_reference(task);
19743 	}
19744 	vm_map_unlock(map);
19745 
19746 	return (vm_map_switch_context_t) { oldmap, task };
19747 }
19748 
19749 void
vm_map_switch_back(vm_map_switch_context_t ctx)19750 vm_map_switch_back(vm_map_switch_context_t ctx)
19751 {
19752 	thread_t thread = current_thread();
19753 	task_t task = ctx.task;
19754 	vm_map_t map = ctx.map;
19755 
19756 	if (task) {
19757 		task_deallocate(task);
19758 	} else {
19759 		/*
19760 		 * We want to make sure that vm_map_setup was not called while the
19761 		 * map was switched. This allows us to guarantee the property that
19762 		 * we always have a reference on current_map()->owning_task if it is
19763 		 * not NULL.
19764 		 */
19765 		assert(!thread->map->owning_task);
19766 	}
19767 
19768 	/*
19769 	 * Restore the original map from prior to vm_map_switch_to
19770 	 */
19771 	mp_disable_preemption();
19772 	PMAP_SWITCH_USER(thread, map, cpu_number());
19773 	mp_enable_preemption();
19774 }
19775 
19776 static __attribute__((always_inline, warn_unused_result))
19777 kern_return_t
vm_map_rw_user_sanitize(vm_map_t map,vm_map_address_ut addr_u,vm_size_ut size_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_address_t * addr,vm_map_address_t * end,vm_map_size_t * size)19778 vm_map_rw_user_sanitize(
19779 	vm_map_t                map,
19780 	vm_map_address_ut       addr_u,
19781 	vm_size_ut              size_u,
19782 	vm_sanitize_caller_t    vm_sanitize_caller,
19783 	vm_map_address_t       *addr,
19784 	vm_map_address_t       *end,
19785 	vm_map_size_t          *size)
19786 {
19787 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
19788 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
19789 	    VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
19790 
19791 	return vm_sanitize_addr_size(addr_u, size_u,
19792 	           vm_sanitize_caller, map,
19793 	           flags,
19794 	           addr, end, size);
19795 }
19796 
19797 /*
19798  *	Routine:	vm_map_write_user
19799  *
19800  *	Description:
19801  *		Copy out data from a kernel space into space in the
19802  *		destination map. The space must already exist in the
19803  *		destination map.
19804  *		NOTE:  This routine should only be called by threads
19805  *		which can block on a page fault. i.e. kernel mode user
19806  *		threads.
19807  *
19808  */
19809 kern_return_t
vm_map_write_user(vm_map_t map,void * src_p,vm_map_address_ut dst_addr_u,vm_size_ut size_u)19810 vm_map_write_user(
19811 	vm_map_t                map,
19812 	void                   *src_p,
19813 	vm_map_address_ut       dst_addr_u,
19814 	vm_size_ut              size_u)
19815 {
19816 	kern_return_t    kr;
19817 	vm_map_address_t dst_addr, dst_end;
19818 	vm_map_size_t    size;
19819 
19820 	/*
19821 	 * src_p isn't validated: [src_p, src_p + size_u)
19822 	 * is trusted kernel input.
19823 	 *
19824 	 * dst_addr_u and size_u are untrusted and need to be sanitized.
19825 	 */
19826 	kr = vm_map_rw_user_sanitize(map,
19827 	    dst_addr_u,
19828 	    size_u,
19829 	    VM_SANITIZE_CALLER_VM_MAP_WRITE_USER,
19830 	    &dst_addr,
19831 	    &dst_end,
19832 	    &size);
19833 	if (__improbable(kr != KERN_SUCCESS)) {
19834 		return vm_sanitize_get_kr(kr);
19835 	}
19836 
19837 	if (current_map() == map) {
19838 		if (copyout(src_p, dst_addr, size)) {
19839 			kr = KERN_INVALID_ADDRESS;
19840 		}
19841 	} else {
19842 		vm_map_switch_context_t switch_ctx;
19843 
19844 		/* take on the identity of the target map while doing */
19845 		/* the transfer */
19846 
19847 		vm_map_reference(map);
19848 		switch_ctx = vm_map_switch_to(map);
19849 		if (copyout(src_p, dst_addr, size)) {
19850 			kr = KERN_INVALID_ADDRESS;
19851 		}
19852 		vm_map_switch_back(switch_ctx);
19853 		vm_map_deallocate(map);
19854 	}
19855 	return kr;
19856 }
19857 
19858 /*
19859  *	Routine:	vm_map_read_user
19860  *
19861  *	Description:
19862  *		Copy in data from a user space source map into the
19863  *		kernel map. The space must already exist in the
19864  *		kernel map.
19865  *		NOTE:  This routine should only be called by threads
19866  *		which can block on a page fault. i.e. kernel mode user
19867  *		threads.
19868  *
19869  */
19870 kern_return_t
vm_map_read_user(vm_map_t map,vm_map_address_ut src_addr_u,void * dst_p,vm_size_ut size_u)19871 vm_map_read_user(
19872 	vm_map_t                map,
19873 	vm_map_address_ut       src_addr_u,
19874 	void                   *dst_p,
19875 	vm_size_ut              size_u)
19876 {
19877 	kern_return_t    kr;
19878 	vm_map_address_t src_addr, src_end;
19879 	vm_map_size_t    size;
19880 
19881 	/*
19882 	 * dst_p isn't validated: [dst_p, dst_p + size_u)
19883 	 * is trusted kernel input.
19884 	 *
19885 	 * src_addr_u and size_u are untrusted and need to be sanitized.
19886 	 */
19887 	kr = vm_map_rw_user_sanitize(map,
19888 	    src_addr_u,
19889 	    size_u,
19890 	    VM_SANITIZE_CALLER_VM_MAP_READ_USER,
19891 	    &src_addr,
19892 	    &src_end,
19893 	    &size);
19894 	if (__improbable(kr != KERN_SUCCESS)) {
19895 		return vm_sanitize_get_kr(kr);
19896 	}
19897 
19898 	if (current_map() == map) {
19899 		if (copyin(src_addr, dst_p, size)) {
19900 			kr = KERN_INVALID_ADDRESS;
19901 		}
19902 	} else {
19903 		vm_map_switch_context_t switch_ctx;
19904 
19905 		/* take on the identity of the target map while doing */
19906 		/* the transfer */
19907 
19908 		vm_map_reference(map);
19909 		switch_ctx = vm_map_switch_to(map);
19910 		if (copyin(src_addr, dst_p, size)) {
19911 			kr = KERN_INVALID_ADDRESS;
19912 		}
19913 		vm_map_switch_back(switch_ctx);
19914 		vm_map_deallocate(map);
19915 	}
19916 	return kr;
19917 }
19918 
19919 
19920 static __attribute__((always_inline, warn_unused_result))
19921 kern_return_t
vm_map_check_protection_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut protection_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * start,vm_map_offset_t * end,vm_prot_t * protection)19922 vm_map_check_protection_sanitize(
19923 	vm_map_t                map,
19924 	vm_map_offset_ut        start_u,
19925 	vm_map_offset_ut        end_u,
19926 	vm_prot_ut              protection_u,
19927 	vm_sanitize_caller_t    vm_sanitize_caller,
19928 	vm_map_offset_t        *start,
19929 	vm_map_offset_t        *end,
19930 	vm_prot_t              *protection)
19931 {
19932 	kern_return_t           kr;
19933 	vm_map_size_t           size;
19934 
19935 	kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
19936 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start, end,
19937 	    &size);
19938 	if (__improbable(kr != KERN_SUCCESS)) {
19939 		return kr;
19940 	}
19941 
19942 	/*
19943 	 * Given that the protection is used only for comparisons below
19944 	 * no sanitization is being applied on it.
19945 	 */
19946 	*protection = VM_SANITIZE_UNSAFE_UNWRAP(protection_u);
19947 
19948 	return KERN_SUCCESS;
19949 }
19950 
19951 /*
19952  *	vm_map_check_protection:
19953  *
19954  *	Assert that the target map allows the specified
19955  *	privilege on the entire address region given.
19956  *	The entire region must be allocated.
19957  */
19958 boolean_t
vm_map_check_protection(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut protection_u,vm_sanitize_caller_t vm_sanitize_caller)19959 vm_map_check_protection(
19960 	vm_map_t                map,
19961 	vm_map_offset_ut        start_u,
19962 	vm_map_offset_ut        end_u,
19963 	vm_prot_ut              protection_u,
19964 	vm_sanitize_caller_t    vm_sanitize_caller)
19965 {
19966 	vm_map_entry_t entry;
19967 	vm_map_entry_t tmp_entry;
19968 	vm_map_offset_t start;
19969 	vm_map_offset_t end;
19970 	vm_prot_t protection;
19971 	kern_return_t kr;
19972 
19973 	kr = vm_map_check_protection_sanitize(map,
19974 	    start_u,
19975 	    end_u,
19976 	    protection_u,
19977 	    vm_sanitize_caller,
19978 	    &start,
19979 	    &end,
19980 	    &protection);
19981 	if (__improbable(kr != KERN_SUCCESS)) {
19982 		kr = vm_sanitize_get_kr(kr);
19983 		if (kr == KERN_SUCCESS) {
19984 			return true;
19985 		}
19986 		return false;
19987 	}
19988 
19989 	vm_map_lock(map);
19990 
19991 	if (start < vm_map_min(map) || end > vm_map_max(map)) {
19992 		vm_map_unlock(map);
19993 		return false;
19994 	}
19995 
19996 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19997 		vm_map_unlock(map);
19998 		return false;
19999 	}
20000 
20001 	entry = tmp_entry;
20002 
20003 	while (start < end) {
20004 		if (entry == vm_map_to_entry(map)) {
20005 			vm_map_unlock(map);
20006 			return false;
20007 		}
20008 
20009 		/*
20010 		 *	No holes allowed!
20011 		 */
20012 
20013 		if (start < entry->vme_start) {
20014 			vm_map_unlock(map);
20015 			return false;
20016 		}
20017 
20018 		/*
20019 		 * Check protection associated with entry.
20020 		 */
20021 
20022 		if ((entry->protection & protection) != protection) {
20023 			vm_map_unlock(map);
20024 			return false;
20025 		}
20026 
20027 		/* go to next entry */
20028 
20029 		start = entry->vme_end;
20030 		entry = entry->vme_next;
20031 	}
20032 	vm_map_unlock(map);
20033 	return true;
20034 }
20035 
20036 kern_return_t
vm_map_purgable_control(vm_map_t map,vm_map_offset_ut address_u,vm_purgable_t control,int * state)20037 vm_map_purgable_control(
20038 	vm_map_t                map,
20039 	vm_map_offset_ut        address_u,
20040 	vm_purgable_t           control,
20041 	int                    *state)
20042 {
20043 	vm_map_offset_t         address;
20044 	vm_map_entry_t          entry;
20045 	vm_object_t             object;
20046 	kern_return_t           kr;
20047 	boolean_t               was_nonvolatile;
20048 
20049 	/*
20050 	 * Vet all the input parameters and current type and state of the
20051 	 * underlaying object.  Return with an error if anything is amiss.
20052 	 */
20053 	if (map == VM_MAP_NULL) {
20054 		return KERN_INVALID_ARGUMENT;
20055 	}
20056 
20057 	if (control != VM_PURGABLE_SET_STATE &&
20058 	    control != VM_PURGABLE_GET_STATE &&
20059 	    control != VM_PURGABLE_PURGE_ALL &&
20060 	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
20061 		return KERN_INVALID_ARGUMENT;
20062 	}
20063 
20064 	if (control == VM_PURGABLE_PURGE_ALL) {
20065 		vm_purgeable_object_purge_all();
20066 		return KERN_SUCCESS;
20067 	}
20068 
20069 	if ((control == VM_PURGABLE_SET_STATE ||
20070 	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
20071 	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
20072 	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
20073 		return KERN_INVALID_ARGUMENT;
20074 	}
20075 
20076 	address = vm_sanitize_addr(map, address_u);
20077 
20078 	vm_map_lock_read(map);
20079 
20080 	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
20081 		/*
20082 		 * Must pass a valid non-submap address.
20083 		 */
20084 		vm_map_unlock_read(map);
20085 		return KERN_INVALID_ADDRESS;
20086 	}
20087 
20088 	if ((entry->protection & VM_PROT_WRITE) == 0 &&
20089 	    control != VM_PURGABLE_GET_STATE) {
20090 		/*
20091 		 * Can't apply purgable controls to something you can't write.
20092 		 */
20093 		vm_map_unlock_read(map);
20094 		return KERN_PROTECTION_FAILURE;
20095 	}
20096 
20097 	object = VME_OBJECT(entry);
20098 	if (object == VM_OBJECT_NULL ||
20099 	    object->purgable == VM_PURGABLE_DENY) {
20100 		/*
20101 		 * Object must already be present and be purgeable.
20102 		 */
20103 		vm_map_unlock_read(map);
20104 		return KERN_INVALID_ARGUMENT;
20105 	}
20106 
20107 	vm_object_lock(object);
20108 
20109 #if 00
20110 	if (VME_OFFSET(entry) != 0 ||
20111 	    entry->vme_end - entry->vme_start != object->vo_size) {
20112 		/*
20113 		 * Can only apply purgable controls to the whole (existing)
20114 		 * object at once.
20115 		 */
20116 		vm_map_unlock_read(map);
20117 		vm_object_unlock(object);
20118 		return KERN_INVALID_ARGUMENT;
20119 	}
20120 #endif
20121 
20122 	assert(!entry->is_sub_map);
20123 	assert(!entry->use_pmap); /* purgeable has its own accounting */
20124 
20125 	vm_map_unlock_read(map);
20126 
20127 	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
20128 
20129 	kr = vm_object_purgable_control(object, control, state);
20130 
20131 	if (was_nonvolatile &&
20132 	    object->purgable != VM_PURGABLE_NONVOLATILE &&
20133 	    map->pmap == kernel_pmap) {
20134 #if DEBUG
20135 		object->vo_purgeable_volatilizer = kernel_task;
20136 #endif /* DEBUG */
20137 	}
20138 
20139 	vm_object_unlock(object);
20140 
20141 	return kr;
20142 }
20143 
20144 void
vm_map_footprint_query_page_info(vm_map_t map,vm_map_entry_t map_entry,vm_map_offset_t curr_s_offset,int * disposition_p)20145 vm_map_footprint_query_page_info(
20146 	vm_map_t        map,
20147 	vm_map_entry_t  map_entry,
20148 	vm_map_offset_t curr_s_offset,
20149 	int             *disposition_p)
20150 {
20151 	int             pmap_disp;
20152 	vm_object_t     object = VM_OBJECT_NULL;
20153 	int             disposition;
20154 	int             effective_page_size;
20155 
20156 	vm_map_lock_assert_held(map);
20157 	assert(!map->has_corpse_footprint);
20158 	assert(curr_s_offset >= map_entry->vme_start);
20159 	assert(curr_s_offset < map_entry->vme_end);
20160 
20161 	if (map_entry->is_sub_map) {
20162 		if (!map_entry->use_pmap) {
20163 			/* nested pmap: no footprint */
20164 			*disposition_p = 0;
20165 			return;
20166 		}
20167 	} else {
20168 		object = VME_OBJECT(map_entry);
20169 		if (object == VM_OBJECT_NULL) {
20170 			/* nothing mapped here: no need to ask */
20171 			*disposition_p = 0;
20172 			return;
20173 		}
20174 	}
20175 
20176 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
20177 
20178 	pmap_disp = 0;
20179 
20180 	/*
20181 	 * Query the pmap.
20182 	 */
20183 	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
20184 
20185 	/*
20186 	 * Compute this page's disposition.
20187 	 */
20188 	disposition = 0;
20189 
20190 	/* deal with "alternate accounting" first */
20191 	if (!map_entry->is_sub_map &&
20192 	    object->vo_no_footprint) {
20193 		/* does not count in footprint */
20194 //		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20195 	} else if (!map_entry->is_sub_map &&
20196 	    !object->internal &&
20197 	    object->vo_ledger_tag &&
20198 	    VM_OBJECT_OWNER(object) != NULL &&
20199 	    VM_OBJECT_OWNER(object)->map == map) {
20200 		/* owned external object: wired pages count in footprint */
20201 		assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20202 		if ((((curr_s_offset
20203 		    - map_entry->vme_start
20204 		    + VME_OFFSET(map_entry))
20205 		    / effective_page_size) <
20206 		    object->wired_page_count)) {
20207 			/*
20208 			 * External object owned by this task: report the first
20209 			 * "#wired" pages as "resident" (to show that they
20210 			 * contribute to the footprint) but not "dirty"
20211 			 * (to avoid double-counting with the fake "owned"
20212 			 * region we'll report at the end of the address space
20213 			 * to account for all (mapped or not) owned memory
20214 			 * owned by this task.
20215 			 */
20216 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20217 		}
20218 	} else if (!map_entry->is_sub_map &&
20219 	    object->internal &&
20220 	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
20221 	    (object->purgable == VM_PURGABLE_DENY &&
20222 	    object->vo_ledger_tag)) &&
20223 	    VM_OBJECT_OWNER(object) != NULL &&
20224 	    VM_OBJECT_OWNER(object)->map == map) {
20225 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20226 		if ((((curr_s_offset
20227 		    - map_entry->vme_start
20228 		    + VME_OFFSET(map_entry))
20229 		    / effective_page_size) <
20230 		    (object->resident_page_count +
20231 		    vm_compressor_pager_get_count(object->pager)))) {
20232 			/*
20233 			 * Non-volatile purgeable object owned
20234 			 * by this task: report the first
20235 			 * "#resident + #compressed" pages as
20236 			 * "resident" (to show that they
20237 			 * contribute to the footprint) but not
20238 			 * "dirty" (to avoid double-counting
20239 			 * with the fake "non-volatile" region
20240 			 * we'll report at the end of the
20241 			 * address space to account for all
20242 			 * (mapped or not) non-volatile memory
20243 			 * owned by this task.
20244 			 */
20245 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20246 		}
20247 	} else if (!map_entry->is_sub_map &&
20248 	    object->internal &&
20249 	    (object->purgable == VM_PURGABLE_VOLATILE ||
20250 	    object->purgable == VM_PURGABLE_EMPTY) &&
20251 	    VM_OBJECT_OWNER(object) != NULL &&
20252 	    VM_OBJECT_OWNER(object)->map == map) {
20253 		if (object->internal) {
20254 			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20255 		}
20256 		if ((((curr_s_offset
20257 		    - map_entry->vme_start
20258 		    + VME_OFFSET(map_entry))
20259 		    / effective_page_size) <
20260 		    object->wired_page_count)) {
20261 			/*
20262 			 * Volatile|empty purgeable object owned
20263 			 * by this task: report the first
20264 			 * "#wired" pages as "resident" (to
20265 			 * show that they contribute to the
20266 			 * footprint) but not "dirty" (to avoid
20267 			 * double-counting with the fake
20268 			 * "non-volatile" region we'll report
20269 			 * at the end of the address space to
20270 			 * account for all (mapped or not)
20271 			 * non-volatile memory owned by this
20272 			 * task.
20273 			 */
20274 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20275 		}
20276 	} else if (!map_entry->is_sub_map &&
20277 	    map_entry->iokit_acct &&
20278 	    object->internal &&
20279 	    object->purgable == VM_PURGABLE_DENY) {
20280 		/*
20281 		 * Non-purgeable IOKit memory: phys_footprint
20282 		 * includes the entire virtual mapping.
20283 		 */
20284 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20285 		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20286 		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20287 	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
20288 	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
20289 		/* alternate accounting */
20290 #if __arm64__ && (DEVELOPMENT || DEBUG)
20291 		if (map->pmap->footprint_was_suspended) {
20292 			/*
20293 			 * The assertion below can fail if dyld
20294 			 * suspended footprint accounting
20295 			 * while doing some adjustments to
20296 			 * this page;  the mapping would say
20297 			 * "use pmap accounting" but the page
20298 			 * would be marked "alternate
20299 			 * accounting".
20300 			 */
20301 		} else
20302 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
20303 		{
20304 			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20305 		}
20306 		disposition = 0;
20307 	} else {
20308 		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
20309 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20310 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20311 			disposition |= VM_PAGE_QUERY_PAGE_REF;
20312 			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
20313 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20314 			} else {
20315 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20316 			}
20317 			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
20318 				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20319 			}
20320 		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
20321 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20322 			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20323 		}
20324 	}
20325 
20326 	*disposition_p = disposition;
20327 }
20328 
20329 kern_return_t
vm_map_page_info(vm_map_t map,vm_map_offset_ut offset_u,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)20330 vm_map_page_info(
20331 	vm_map_t                map,
20332 	vm_map_offset_ut        offset_u,
20333 	vm_page_info_flavor_t   flavor,
20334 	vm_page_info_t          info,
20335 	mach_msg_type_number_t  *count)
20336 {
20337 	return vm_map_page_range_info_internal(map,
20338 	           offset_u, /* start of range */
20339 	           vm_sanitize_compute_ut_end(offset_u, 1), /* this will get rounded in the call to the page boundary */
20340 	           (int)-1, /* effective_page_shift: unspecified */
20341 	           flavor,
20342 	           info,
20343 	           count);
20344 }
20345 
20346 static __attribute__((always_inline, warn_unused_result))
20347 kern_return_t
vm_map_page_range_info_sanitize(vm_map_t map,vm_map_offset_ut start_offset_u,vm_map_offset_ut end_offset_u,vm_map_offset_t effective_page_mask,vm_map_offset_t * start,vm_map_offset_t * end,vm_map_offset_t * offset_in_page)20348 vm_map_page_range_info_sanitize(
20349 	vm_map_t                map,
20350 	vm_map_offset_ut        start_offset_u,
20351 	vm_map_offset_ut        end_offset_u,
20352 	vm_map_offset_t         effective_page_mask,
20353 	vm_map_offset_t        *start,
20354 	vm_map_offset_t        *end,
20355 	vm_map_offset_t        *offset_in_page)
20356 {
20357 	kern_return_t           retval;
20358 	vm_map_size_t           size;
20359 
20360 	/*
20361 	 * Perform validation against map's mask but don't align start/end,
20362 	 * as we need for those to be aligned wrt effective_page_mask
20363 	 */
20364 	retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20365 	    VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, map,
20366 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
20367 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES, start,
20368 	    end, &size);
20369 	if (retval != KERN_SUCCESS) {
20370 		return retval;
20371 	}
20372 
20373 	retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20374 	    VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, effective_page_mask,
20375 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start,
20376 	    end, &size);
20377 	if (retval != KERN_SUCCESS) {
20378 		return retval;
20379 	}
20380 
20381 	*offset_in_page = vm_sanitize_offset_in_page(effective_page_mask,
20382 	    start_offset_u);
20383 
20384 	return KERN_SUCCESS;
20385 }
20386 
20387 kern_return_t
vm_map_page_range_info_internal(vm_map_t map,vm_map_offset_ut start_offset_u,vm_map_offset_ut end_offset_u,int effective_page_shift,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)20388 vm_map_page_range_info_internal(
20389 	vm_map_t                map,
20390 	vm_map_offset_ut        start_offset_u,
20391 	vm_map_offset_ut        end_offset_u,
20392 	int                     effective_page_shift,
20393 	vm_page_info_flavor_t   flavor,
20394 	vm_page_info_t          info,
20395 	mach_msg_type_number_t  *count)
20396 {
20397 	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
20398 	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
20399 	vm_page_t               m = VM_PAGE_NULL;
20400 	kern_return_t           retval = KERN_SUCCESS;
20401 	int                     disposition = 0;
20402 	int                     ref_count = 0;
20403 	int                     depth = 0, info_idx = 0;
20404 	vm_page_info_basic_t    basic_info = 0;
20405 	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
20406 	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
20407 	boolean_t               do_region_footprint;
20408 	ledger_amount_t         ledger_resident, ledger_compressed;
20409 	int                     effective_page_size;
20410 	vm_map_offset_t         effective_page_mask;
20411 
20412 	switch (flavor) {
20413 	case VM_PAGE_INFO_BASIC:
20414 		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
20415 			/*
20416 			 * The "vm_page_info_basic_data" structure was not
20417 			 * properly padded, so allow the size to be off by
20418 			 * one to maintain backwards binary compatibility...
20419 			 */
20420 			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
20421 				return KERN_INVALID_ARGUMENT;
20422 			}
20423 		}
20424 		break;
20425 	default:
20426 		return KERN_INVALID_ARGUMENT;
20427 	}
20428 
20429 	if (effective_page_shift == -1) {
20430 		effective_page_shift = vm_self_region_page_shift_safely(map);
20431 		if (effective_page_shift == -1) {
20432 			return KERN_INVALID_ARGUMENT;
20433 		}
20434 	}
20435 	effective_page_size = (1 << effective_page_shift);
20436 	effective_page_mask = effective_page_size - 1;
20437 
20438 
20439 	retval = vm_map_page_range_info_sanitize(map,
20440 	    start_offset_u,
20441 	    end_offset_u,
20442 	    effective_page_mask,
20443 	    &start,
20444 	    &end,
20445 	    &offset_in_page);
20446 	if (retval != KERN_SUCCESS) {
20447 		return vm_sanitize_get_kr(retval);
20448 	}
20449 
20450 	assert((end - start) <= MAX_PAGE_RANGE_QUERY);
20451 
20452 	do_region_footprint = task_self_region_footprint();
20453 	disposition = 0;
20454 	ref_count = 0;
20455 	depth = 0;
20456 	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
20457 
20458 	vm_map_lock_read(map);
20459 	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
20460 
20461 	for (curr_s_offset = start; curr_s_offset < end;) {
20462 		/*
20463 		 * New lookup needs reset of these variables.
20464 		 */
20465 		curr_object = object = VM_OBJECT_NULL;
20466 		offset_in_object = 0;
20467 		ref_count = 0;
20468 		depth = 0;
20469 
20470 		if (do_region_footprint &&
20471 		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
20472 			/*
20473 			 * Request for "footprint" info about a page beyond
20474 			 * the end of address space: this must be for
20475 			 * the fake region vm_map_region_recurse_64()
20476 			 * reported to account for non-volatile purgeable
20477 			 * memory owned by this task.
20478 			 */
20479 			disposition = 0;
20480 
20481 			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
20482 			    (unsigned) ledger_compressed) {
20483 				/*
20484 				 * We haven't reported all the "non-volatile
20485 				 * compressed" pages yet, so report this fake
20486 				 * page as "compressed".
20487 				 */
20488 				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20489 			} else {
20490 				/*
20491 				 * We've reported all the non-volatile
20492 				 * compressed page but not all the non-volatile
20493 				 * pages , so report this fake page as
20494 				 * "resident dirty".
20495 				 */
20496 				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20497 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20498 				disposition |= VM_PAGE_QUERY_PAGE_REF;
20499 			}
20500 			switch (flavor) {
20501 			case VM_PAGE_INFO_BASIC:
20502 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20503 				basic_info->disposition = disposition;
20504 				basic_info->ref_count = 1;
20505 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20506 				basic_info->offset = 0;
20507 				basic_info->depth = 0;
20508 
20509 				info_idx++;
20510 				break;
20511 			}
20512 			curr_s_offset += effective_page_size;
20513 			continue;
20514 		}
20515 
20516 		/*
20517 		 * First, find the map entry covering "curr_s_offset", going down
20518 		 * submaps if necessary.
20519 		 */
20520 		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
20521 			/* no entry -> no object -> no page */
20522 
20523 			if (curr_s_offset < vm_map_min(map)) {
20524 				/*
20525 				 * Illegal address that falls below map min.
20526 				 */
20527 				curr_e_offset = MIN(end, vm_map_min(map));
20528 			} else if (curr_s_offset >= vm_map_max(map)) {
20529 				/*
20530 				 * Illegal address that falls on/after map max.
20531 				 */
20532 				curr_e_offset = end;
20533 			} else if (map_entry == vm_map_to_entry(map)) {
20534 				/*
20535 				 * Hit a hole.
20536 				 */
20537 				if (map_entry->vme_next == vm_map_to_entry(map)) {
20538 					/*
20539 					 * Empty map.
20540 					 */
20541 					curr_e_offset = MIN(map->max_offset, end);
20542 				} else {
20543 					/*
20544 					 * Hole at start of the map.
20545 					 */
20546 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20547 				}
20548 			} else {
20549 				if (map_entry->vme_next == vm_map_to_entry(map)) {
20550 					/*
20551 					 * Hole at the end of the map.
20552 					 */
20553 					curr_e_offset = MIN(map->max_offset, end);
20554 				} else {
20555 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20556 				}
20557 			}
20558 
20559 			assert(curr_e_offset >= curr_s_offset);
20560 
20561 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20562 
20563 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20564 
20565 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20566 
20567 			curr_s_offset = curr_e_offset;
20568 
20569 			info_idx += num_pages;
20570 
20571 			continue;
20572 		}
20573 
20574 		/* compute offset from this map entry's start */
20575 		offset_in_object = curr_s_offset - map_entry->vme_start;
20576 
20577 		/* compute offset into this map entry's object (or submap) */
20578 		offset_in_object += VME_OFFSET(map_entry);
20579 
20580 		if (map_entry->is_sub_map) {
20581 			vm_map_t sub_map = VM_MAP_NULL;
20582 			vm_page_info_t submap_info = 0;
20583 			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
20584 
20585 			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
20586 
20587 			submap_s_offset = offset_in_object;
20588 			submap_e_offset = submap_s_offset + range_len;
20589 
20590 			sub_map = VME_SUBMAP(map_entry);
20591 
20592 			vm_map_reference(sub_map);
20593 			vm_map_unlock_read(map);
20594 
20595 			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20596 
20597 			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
20598 			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
20599 
20600 			retval = vm_map_page_range_info_internal(sub_map,
20601 			    submap_s_offset,
20602 			    submap_e_offset,
20603 			    effective_page_shift,
20604 			    VM_PAGE_INFO_BASIC,
20605 			    (vm_page_info_t) submap_info,
20606 			    count);
20607 
20608 			assert(retval == KERN_SUCCESS);
20609 
20610 			vm_map_deallocate(sub_map);
20611 			sub_map = VM_MAP_NULL;
20612 			vm_map_lock_read(map);
20613 
20614 			/* Move the "info" index by the number of pages we inspected.*/
20615 			info_idx += range_len >> effective_page_shift;
20616 
20617 			/* Move our current offset by the size of the range we inspected.*/
20618 			curr_s_offset += range_len;
20619 
20620 			continue;
20621 		}
20622 
20623 		object = VME_OBJECT(map_entry);
20624 
20625 		if (object == VM_OBJECT_NULL) {
20626 			/*
20627 			 * We don't have an object here and, hence,
20628 			 * no pages to inspect. We'll fill up the
20629 			 * info structure appropriately.
20630 			 */
20631 
20632 			curr_e_offset = MIN(map_entry->vme_end, end);
20633 
20634 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20635 
20636 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20637 
20638 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20639 
20640 			curr_s_offset = curr_e_offset;
20641 
20642 			info_idx += num_pages;
20643 
20644 			continue;
20645 		}
20646 
20647 		if (do_region_footprint) {
20648 			disposition = 0;
20649 			if (map->has_corpse_footprint) {
20650 				/*
20651 				 * Query the page info data we saved
20652 				 * while forking the corpse.
20653 				 */
20654 				vm_map_corpse_footprint_query_page_info(
20655 					map,
20656 					curr_s_offset,
20657 					&disposition);
20658 			} else {
20659 				/*
20660 				 * Query the live pmap for footprint info
20661 				 * about this page.
20662 				 */
20663 				vm_map_footprint_query_page_info(
20664 					map,
20665 					map_entry,
20666 					curr_s_offset,
20667 					&disposition);
20668 			}
20669 			switch (flavor) {
20670 			case VM_PAGE_INFO_BASIC:
20671 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20672 				basic_info->disposition = disposition;
20673 				basic_info->ref_count = 1;
20674 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20675 				basic_info->offset = 0;
20676 				basic_info->depth = 0;
20677 
20678 				info_idx++;
20679 				break;
20680 			}
20681 			curr_s_offset += effective_page_size;
20682 			continue;
20683 		}
20684 
20685 		vm_object_reference(object);
20686 		/*
20687 		 * Shared mode -- so we can allow other readers
20688 		 * to grab the lock too.
20689 		 */
20690 		vm_object_lock_shared(object);
20691 
20692 		curr_e_offset = MIN(map_entry->vme_end, end);
20693 
20694 		vm_map_unlock_read(map);
20695 
20696 		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
20697 
20698 		curr_object = object;
20699 
20700 		for (; curr_s_offset < curr_e_offset;) {
20701 			if (object == curr_object) {
20702 				/* account for our object reference above. */
20703 				ref_count = os_ref_get_count_raw(&curr_object->ref_count) - 1;
20704 			} else {
20705 				ref_count = os_ref_get_count_raw(&curr_object->ref_count);
20706 			}
20707 
20708 			curr_offset_in_object = offset_in_object;
20709 
20710 			for (;;) {
20711 				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
20712 
20713 				if (m != VM_PAGE_NULL) {
20714 					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20715 					break;
20716 				} else {
20717 					if (curr_object->internal &&
20718 					    curr_object->alive &&
20719 					    !curr_object->terminating &&
20720 					    curr_object->pager_ready) {
20721 						if (vm_object_compressor_pager_state_get(curr_object, vm_object_trunc_page(curr_offset_in_object))
20722 						    == VM_EXTERNAL_STATE_EXISTS) {
20723 							/* the pager has that page */
20724 							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20725 							break;
20726 						}
20727 					}
20728 
20729 					/*
20730 					 * Go down the VM object shadow chain until we find the page
20731 					 * we're looking for.
20732 					 */
20733 
20734 					if (curr_object->shadow != VM_OBJECT_NULL) {
20735 						vm_object_t shadow = VM_OBJECT_NULL;
20736 
20737 						curr_offset_in_object += curr_object->vo_shadow_offset;
20738 						shadow = curr_object->shadow;
20739 
20740 						vm_object_lock_shared(shadow);
20741 						vm_object_unlock(curr_object);
20742 
20743 						curr_object = shadow;
20744 						depth++;
20745 						continue;
20746 					} else {
20747 						break;
20748 					}
20749 				}
20750 			}
20751 
20752 			/* The ref_count is not strictly accurate, it measures the number   */
20753 			/* of entities holding a ref on the object, they may not be mapping */
20754 			/* the object or may not be mapping the section holding the         */
20755 			/* target page but its still a ball park number and though an over- */
20756 			/* count, it picks up the copy-on-write cases                       */
20757 
20758 			/* We could also get a picture of page sharing from pmap_attributes */
20759 			/* but this would under count as only faulted-in mappings would     */
20760 			/* show up.							    */
20761 
20762 			if ((curr_object == object) && curr_object->shadow) {
20763 				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
20764 			}
20765 
20766 			if (!curr_object->internal) {
20767 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20768 			}
20769 
20770 			if (m != VM_PAGE_NULL) {
20771 				if (vm_page_is_fictitious(m)) {
20772 					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20773 				} else {
20774 					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
20775 						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20776 					}
20777 
20778 					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
20779 						disposition |= VM_PAGE_QUERY_PAGE_REF;
20780 					}
20781 
20782 					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
20783 						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
20784 					}
20785 
20786 					/*
20787 					 * XXX TODO4K:
20788 					 * when this routine deals with 4k
20789 					 * pages, check the appropriate CS bit
20790 					 * here.
20791 					 */
20792 					if (m->vmp_cs_validated) {
20793 						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
20794 					}
20795 					if (m->vmp_cs_tainted) {
20796 						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
20797 					}
20798 					if (m->vmp_cs_nx) {
20799 						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
20800 					}
20801 					if (m->vmp_reusable || curr_object->all_reusable) {
20802 						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20803 					}
20804 				}
20805 			}
20806 
20807 			switch (flavor) {
20808 			case VM_PAGE_INFO_BASIC:
20809 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20810 				basic_info->disposition = disposition;
20811 				basic_info->ref_count = ref_count;
20812 				basic_info->object_id = (vm_object_id_t) (uintptr_t)
20813 				    VM_KERNEL_ADDRHASH(curr_object);
20814 				basic_info->offset =
20815 				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
20816 				basic_info->depth = depth;
20817 
20818 				info_idx++;
20819 				break;
20820 			}
20821 
20822 			disposition = 0;
20823 			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
20824 
20825 			/*
20826 			 * Move to next offset in the range and in our object.
20827 			 */
20828 			curr_s_offset += effective_page_size;
20829 			offset_in_object += effective_page_size;
20830 			curr_offset_in_object = offset_in_object;
20831 
20832 			if (curr_object != object) {
20833 				vm_object_unlock(curr_object);
20834 
20835 				curr_object = object;
20836 
20837 				vm_object_lock_shared(curr_object);
20838 			} else {
20839 				vm_object_lock_yield_shared(curr_object);
20840 			}
20841 		}
20842 
20843 		vm_object_unlock(curr_object);
20844 		vm_object_deallocate(curr_object);
20845 
20846 		vm_map_lock_read(map);
20847 	}
20848 
20849 	vm_map_unlock_read(map);
20850 	return retval;
20851 }
20852 
20853 static __attribute__((always_inline, warn_unused_result))
20854 kern_return_t
vm_map_msync_sanitize(vm_map_t map,vm_map_address_ut address_u,vm_map_size_ut size_u,vm_object_offset_t * address,vm_map_size_t * size)20855 vm_map_msync_sanitize(
20856 	vm_map_t                map,
20857 	vm_map_address_ut       address_u,
20858 	vm_map_size_ut          size_u,
20859 	vm_object_offset_t     *address,
20860 	vm_map_size_t          *size)
20861 {
20862 	vm_object_offset_t      end;
20863 	vm_sanitize_flags_t     flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS;
20864 
20865 
20866 	return vm_sanitize_addr_size(address_u, size_u,
20867 	           VM_SANITIZE_CALLER_VM_MAP_MSYNC,
20868 	           map, flags, address, &end, size);
20869 }
20870 
20871 /*
20872  *	vm_map_msync
20873  *
20874  *	Synchronises the memory range specified with its backing store
20875  *	image by either flushing or cleaning the contents to the appropriate
20876  *	memory manager engaging in a memory object synchronize dialog with
20877  *	the manager.  The client doesn't return until the manager issues
20878  *	m_o_s_completed message.  MIG Magically converts user task parameter
20879  *	to the task's address map.
20880  *
20881  *	interpretation of sync_flags
20882  *	VM_SYNC_INVALIDATE	- discard pages, only return precious
20883  *				  pages to manager.
20884  *
20885  *	VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20886  *				- discard pages, write dirty or precious
20887  *				  pages back to memory manager.
20888  *
20889  *	VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20890  *				- write dirty or precious pages back to
20891  *				  the memory manager.
20892  *
20893  *	VM_SYNC_CONTIGUOUS	- does everything normally, but if there
20894  *				  is a hole in the region, and we would
20895  *				  have returned KERN_SUCCESS, return
20896  *				  KERN_INVALID_ADDRESS instead.
20897  *
20898  *	NOTE
20899  *	The memory object attributes have not yet been implemented, this
20900  *	function will have to deal with the invalidate attribute
20901  *
20902  *	RETURNS
20903  *	KERN_INVALID_TASK		Bad task parameter
20904  *	KERN_INVALID_ARGUMENT		both sync and async were specified.
20905  *	KERN_SUCCESS			The usual.
20906  *	KERN_INVALID_ADDRESS		There was a hole in the region.
20907  */
20908 
20909 kern_return_t
vm_map_msync(vm_map_t map,vm_map_address_ut address_u,vm_map_size_ut size_u,vm_sync_t sync_flags)20910 vm_map_msync(
20911 	vm_map_t                map,
20912 	vm_map_address_ut       address_u,
20913 	vm_map_size_ut          size_u,
20914 	vm_sync_t               sync_flags)
20915 {
20916 	vm_map_entry_t          entry;
20917 	vm_map_size_t           size, amount_left;
20918 	vm_object_offset_t      address, offset;
20919 	vm_object_offset_t      start_offset, end_offset;
20920 	boolean_t               do_sync_req;
20921 	boolean_t               had_hole = FALSE;
20922 	vm_map_offset_t         pmap_offset;
20923 	kern_return_t           kr;
20924 
20925 	if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20926 	    (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20927 		return KERN_INVALID_ARGUMENT;
20928 	}
20929 
20930 	if (map == VM_MAP_NULL) {
20931 		return KERN_INVALID_TASK;
20932 	}
20933 
20934 	kr = vm_map_msync_sanitize(map,
20935 	    address_u,
20936 	    size_u,
20937 	    &address,
20938 	    &size);
20939 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20940 		DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20941 	}
20942 	if (__improbable(kr != KERN_SUCCESS)) {
20943 		return vm_sanitize_get_kr(kr);
20944 	}
20945 
20946 	amount_left = size;
20947 
20948 	while (amount_left > 0) {
20949 		vm_object_size_t        flush_size;
20950 		vm_object_t             object;
20951 
20952 		vm_map_lock(map);
20953 		if (!vm_map_lookup_entry(map,
20954 		    address,
20955 		    &entry)) {
20956 			vm_map_size_t   skip;
20957 
20958 			/*
20959 			 * hole in the address map.
20960 			 */
20961 			had_hole = TRUE;
20962 
20963 			if (sync_flags & VM_SYNC_KILLPAGES) {
20964 				/*
20965 				 * For VM_SYNC_KILLPAGES, there should be
20966 				 * no holes in the range, since we couldn't
20967 				 * prevent someone else from allocating in
20968 				 * that hole and we wouldn't want to "kill"
20969 				 * their pages.
20970 				 */
20971 				vm_map_unlock(map);
20972 				break;
20973 			}
20974 
20975 			/*
20976 			 * Check for empty map.
20977 			 */
20978 			if (entry == vm_map_to_entry(map) &&
20979 			    entry->vme_next == entry) {
20980 				vm_map_unlock(map);
20981 				break;
20982 			}
20983 			/*
20984 			 * Check that we don't wrap and that
20985 			 * we have at least one real map entry.
20986 			 */
20987 			if ((map->hdr.nentries == 0) ||
20988 			    (entry->vme_next->vme_start < address)) {
20989 				vm_map_unlock(map);
20990 				break;
20991 			}
20992 			/*
20993 			 * Move up to the next entry if needed
20994 			 */
20995 			skip = (entry->vme_next->vme_start - address);
20996 			if (skip >= amount_left) {
20997 				amount_left = 0;
20998 			} else {
20999 				amount_left -= skip;
21000 			}
21001 			address = entry->vme_next->vme_start;
21002 			vm_map_unlock(map);
21003 			continue;
21004 		}
21005 
21006 		offset = address - entry->vme_start;
21007 		pmap_offset = address;
21008 
21009 		/*
21010 		 * do we have more to flush than is contained in this
21011 		 * entry ?
21012 		 */
21013 		if (amount_left + entry->vme_start + offset > entry->vme_end) {
21014 			flush_size = entry->vme_end -
21015 			    (entry->vme_start + offset);
21016 		} else {
21017 			flush_size = amount_left;
21018 		}
21019 		amount_left -= flush_size;
21020 		address += flush_size;
21021 
21022 		if (entry->is_sub_map == TRUE) {
21023 			vm_map_t        local_map;
21024 			vm_map_offset_t local_offset;
21025 
21026 			local_map = VME_SUBMAP(entry);
21027 			local_offset = VME_OFFSET(entry);
21028 			vm_map_reference(local_map);
21029 			vm_map_unlock(map);
21030 			if (vm_map_msync(
21031 				    local_map,
21032 				    local_offset,
21033 				    flush_size,
21034 				    sync_flags) == KERN_INVALID_ADDRESS) {
21035 				had_hole = TRUE;
21036 			}
21037 			vm_map_deallocate(local_map);
21038 			local_map = VM_MAP_NULL;
21039 			continue;
21040 		}
21041 		object = VME_OBJECT(entry);
21042 
21043 		/*
21044 		 * We can't sync this object if the object has not been
21045 		 * created yet
21046 		 */
21047 		if (object == VM_OBJECT_NULL) {
21048 			vm_map_unlock(map);
21049 			continue;
21050 		}
21051 		offset += VME_OFFSET(entry);
21052 
21053 		vm_object_lock(object);
21054 
21055 		if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
21056 			int kill_pages = 0;
21057 
21058 			if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
21059 				/*
21060 				 * This is a destructive operation and so we
21061 				 * err on the side of limiting the range of
21062 				 * the operation.
21063 				 */
21064 				start_offset = vm_object_round_page(offset);
21065 				end_offset = vm_object_trunc_page(offset + flush_size);
21066 
21067 				if (end_offset <= start_offset) {
21068 					vm_object_unlock(object);
21069 					vm_map_unlock(map);
21070 					continue;
21071 				}
21072 
21073 				pmap_offset += start_offset - offset;
21074 			} else {
21075 				start_offset = offset;
21076 				end_offset = offset + flush_size;
21077 			}
21078 
21079 			if (sync_flags & VM_SYNC_KILLPAGES) {
21080 				if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
21081 				    ((object->copy_strategy !=
21082 				    MEMORY_OBJECT_COPY_SYMMETRIC) &&
21083 				    (object->vo_copy == VM_OBJECT_NULL))) &&
21084 				    (object->shadow == VM_OBJECT_NULL)) {
21085 					if (os_ref_get_count_raw(&object->ref_count) != 1) {
21086 						vm_page_stats_reusable.free_shared++;
21087 					}
21088 					kill_pages = 1;
21089 				} else {
21090 					kill_pages = -1;
21091 				}
21092 			}
21093 			if (kill_pages != -1) {
21094 				boolean_t kill_no_write = FALSE;
21095 
21096 				if ((entry->protection & VM_PROT_EXECUTE) ||
21097 				    entry->vme_xnu_user_debug) {
21098 					/*
21099 					 * Executable or user debug pages might be write-protected by
21100 					 * hardware, so do not attempt to write to these pages.
21101 					 */
21102 					kill_no_write = TRUE;
21103 				}
21104 				vm_object_deactivate_pages(
21105 					object,
21106 					start_offset,
21107 					(vm_object_size_t) (end_offset - start_offset),
21108 					kill_pages,
21109 					FALSE, /* reusable_pages */
21110 					kill_no_write,
21111 					map->pmap,
21112 					pmap_offset);
21113 			}
21114 			vm_object_unlock(object);
21115 			vm_map_unlock(map);
21116 			continue;
21117 		}
21118 		/*
21119 		 * We can't sync this object if there isn't a pager.
21120 		 * Don't bother to sync internal objects, since there can't
21121 		 * be any "permanent" storage for these objects anyway.
21122 		 */
21123 		if ((object->pager == MEMORY_OBJECT_NULL) ||
21124 		    (object->internal) || (object->private)) {
21125 			vm_object_unlock(object);
21126 			vm_map_unlock(map);
21127 			continue;
21128 		}
21129 		/*
21130 		 * keep reference on the object until syncing is done
21131 		 */
21132 		vm_object_reference_locked(object);
21133 		vm_object_unlock(object);
21134 
21135 		vm_map_unlock(map);
21136 
21137 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
21138 			start_offset = vm_object_trunc_page(offset);
21139 			end_offset = vm_object_round_page(offset + flush_size);
21140 		} else {
21141 			start_offset = offset;
21142 			end_offset = offset + flush_size;
21143 		}
21144 
21145 		do_sync_req = vm_object_sync(object,
21146 		    start_offset,
21147 		    (end_offset - start_offset),
21148 		    sync_flags & VM_SYNC_INVALIDATE,
21149 		    ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
21150 		    (sync_flags & VM_SYNC_ASYNCHRONOUS)),
21151 		    sync_flags & VM_SYNC_SYNCHRONOUS);
21152 
21153 		if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
21154 			/*
21155 			 * clear out the clustering and read-ahead hints
21156 			 */
21157 			vm_object_lock(object);
21158 
21159 			object->pages_created = 0;
21160 			object->pages_used = 0;
21161 			object->sequential = 0;
21162 			object->last_alloc = 0;
21163 
21164 			vm_object_unlock(object);
21165 		}
21166 		vm_object_deallocate(object);
21167 	} /* while */
21168 
21169 	/* for proper msync() behaviour */
21170 	if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
21171 		return KERN_INVALID_ADDRESS;
21172 	}
21173 
21174 	return KERN_SUCCESS;
21175 }/* vm_msync */
21176 
21177 void
vm_named_entry_associate_vm_object(vm_named_entry_t named_entry,vm_object_t object,vm_object_offset_t offset,vm_object_size_t size,vm_prot_t prot)21178 vm_named_entry_associate_vm_object(
21179 	vm_named_entry_t        named_entry,
21180 	vm_object_t             object,
21181 	vm_object_offset_t      offset,
21182 	vm_object_size_t        size,
21183 	vm_prot_t               prot)
21184 {
21185 	vm_map_copy_t copy;
21186 	vm_map_entry_t copy_entry;
21187 
21188 	assert(!named_entry->is_sub_map);
21189 	assert(!named_entry->is_copy);
21190 	assert(!named_entry->is_object);
21191 	assert(!named_entry->internal);
21192 	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
21193 
21194 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
21195 	copy->offset = offset;
21196 	copy->size = size;
21197 	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
21198 
21199 	copy_entry = vm_map_copy_entry_create(copy);
21200 	copy_entry->protection = prot;
21201 	copy_entry->max_protection = prot;
21202 	copy_entry->use_pmap = TRUE;
21203 	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
21204 	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
21205 	VME_OBJECT_SET(copy_entry, object, false, 0);
21206 	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
21207 	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
21208 
21209 	named_entry->backing.copy = copy;
21210 	named_entry->is_object = TRUE;
21211 	if (object->internal) {
21212 		named_entry->internal = TRUE;
21213 	}
21214 
21215 	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
21216 	    named_entry, copy, object, offset, size, prot);
21217 }
21218 
21219 vm_object_t
vm_named_entry_to_vm_object(vm_named_entry_t named_entry)21220 vm_named_entry_to_vm_object(
21221 	vm_named_entry_t named_entry)
21222 {
21223 	vm_map_copy_t   copy;
21224 	vm_map_entry_t  copy_entry;
21225 	vm_object_t     object;
21226 
21227 	assert(!named_entry->is_sub_map);
21228 	assert(!named_entry->is_copy);
21229 	assert(named_entry->is_object);
21230 	copy = named_entry->backing.copy;
21231 	assert(copy != VM_MAP_COPY_NULL);
21232 	/*
21233 	 * Assert that the vm_map_copy is coming from the right
21234 	 * zone and hasn't been forged
21235 	 */
21236 	vm_map_copy_require(copy);
21237 	assert(copy->cpy_hdr.nentries == 1);
21238 	copy_entry = vm_map_copy_first_entry(copy);
21239 	object = VME_OBJECT(copy_entry);
21240 
21241 	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
21242 
21243 	return object;
21244 }
21245 
21246 /*
21247  *	Routine:	convert_port_entry_to_map
21248  *	Purpose:
21249  *		Convert from a port specifying an entry or a task
21250  *		to a map. Doesn't consume the port ref; produces a map ref,
21251  *		which may be null.  Unlike convert_port_to_map, the
21252  *		port may be task or a named entry backed.
21253  *	Conditions:
21254  *		Nothing locked.
21255  */
21256 
21257 vm_map_t
convert_port_entry_to_map(ipc_port_t port)21258 convert_port_entry_to_map(
21259 	ipc_port_t      port)
21260 {
21261 	vm_map_t map = VM_MAP_NULL;
21262 	vm_named_entry_t named_entry;
21263 
21264 	if (!IP_VALID(port)) {
21265 		return VM_MAP_NULL;
21266 	}
21267 
21268 	if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
21269 		return convert_port_to_map(port);
21270 	}
21271 
21272 	named_entry = mach_memory_entry_from_port(port);
21273 
21274 	if ((named_entry->is_sub_map) &&
21275 	    (named_entry->protection & VM_PROT_WRITE)) {
21276 		map = named_entry->backing.map;
21277 		if (map->pmap != PMAP_NULL) {
21278 			if (map->pmap == kernel_pmap) {
21279 				panic("userspace has access "
21280 				    "to a kernel map %p", map);
21281 			}
21282 			pmap_require(map->pmap);
21283 		}
21284 		vm_map_reference(map);
21285 	}
21286 
21287 	return map;
21288 }
21289 
21290 /*
21291  * Export routines to other components for the things we access locally through
21292  * macros.
21293  */
21294 #undef current_map
21295 vm_map_t
current_map(void)21296 current_map(void)
21297 {
21298 	return current_map_fast();
21299 }
21300 
21301 /*
21302  *	vm_map_reference:
21303  *
21304  *	Takes a reference on the specified map.
21305  */
21306 void
vm_map_reference(vm_map_t map)21307 vm_map_reference(
21308 	vm_map_t        map)
21309 {
21310 	if (__probable(map != VM_MAP_NULL)) {
21311 		vm_map_require(map);
21312 		os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
21313 	}
21314 }
21315 
21316 /*
21317  *	vm_map_deallocate:
21318  *
21319  *	Removes a reference from the specified map,
21320  *	destroying it if no references remain.
21321  *	The map should not be locked.
21322  */
21323 void
vm_map_deallocate(vm_map_t map)21324 vm_map_deallocate(
21325 	vm_map_t        map)
21326 {
21327 	if (__probable(map != VM_MAP_NULL)) {
21328 		vm_map_require(map);
21329 		if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
21330 			vm_map_destroy(map);
21331 		}
21332 	}
21333 }
21334 
21335 void
vm_map_inspect_deallocate(vm_map_inspect_t map)21336 vm_map_inspect_deallocate(
21337 	vm_map_inspect_t      map)
21338 {
21339 	vm_map_deallocate((vm_map_t)map);
21340 }
21341 
21342 void
vm_map_read_deallocate(vm_map_read_t map)21343 vm_map_read_deallocate(
21344 	vm_map_read_t      map)
21345 {
21346 	vm_map_deallocate((vm_map_t)map);
21347 }
21348 
21349 
21350 void
vm_map_disable_NX(vm_map_t map)21351 vm_map_disable_NX(vm_map_t map)
21352 {
21353 	if (map == NULL) {
21354 		return;
21355 	}
21356 	if (map->pmap == NULL) {
21357 		return;
21358 	}
21359 
21360 	pmap_disable_NX(map->pmap);
21361 }
21362 
21363 void
vm_map_disallow_data_exec(vm_map_t map)21364 vm_map_disallow_data_exec(vm_map_t map)
21365 {
21366 	if (map == NULL) {
21367 		return;
21368 	}
21369 
21370 	map->map_disallow_data_exec = TRUE;
21371 }
21372 
21373 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
21374  * more descriptive.
21375  */
21376 void
vm_map_set_32bit(vm_map_t map)21377 vm_map_set_32bit(vm_map_t map)
21378 {
21379 #if defined(__arm64__)
21380 	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
21381 #else
21382 	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
21383 #endif
21384 }
21385 
21386 
21387 void
vm_map_set_64bit(vm_map_t map)21388 vm_map_set_64bit(vm_map_t map)
21389 {
21390 #if defined(__arm64__)
21391 	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
21392 #else
21393 	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
21394 #endif
21395 }
21396 
21397 /*
21398  * Expand the maximum size of an existing map to 64GB.
21399  */
21400 void
vm_map_set_jumbo(vm_map_t map)21401 vm_map_set_jumbo(vm_map_t map)
21402 {
21403 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21404 	vm_map_set_max_addr(map, ~0, false);
21405 #else /* arm64 */
21406 	(void) map;
21407 #endif
21408 }
21409 
21410 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21411 /*
21412  * Expand the maximum size of an existing map to the maximum supported.
21413  */
21414 void
vm_map_set_extra_jumbo(vm_map_t map)21415 vm_map_set_extra_jumbo(vm_map_t map)
21416 {
21417 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21418 	vm_map_set_max_addr(map, ~0, true);
21419 #else /* arm64 */
21420 	(void) map;
21421 #endif
21422 }
21423 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21424 
21425 /*
21426  * This map has a JIT entitlement
21427  */
21428 void
vm_map_set_jit_entitled(vm_map_t map)21429 vm_map_set_jit_entitled(vm_map_t map)
21430 {
21431 #if defined (__arm64__)
21432 	pmap_set_jit_entitled(map->pmap);
21433 #else /* arm64 */
21434 	(void) map;
21435 #endif
21436 }
21437 
21438 /*
21439  * Get status of this maps TPRO flag
21440  */
21441 boolean_t
vm_map_tpro(vm_map_t map)21442 vm_map_tpro(vm_map_t map)
21443 {
21444 #if defined (__arm64e__)
21445 	return pmap_get_tpro(map->pmap);
21446 #else /* arm64e */
21447 	(void) map;
21448 	return FALSE;
21449 #endif
21450 }
21451 
21452 /*
21453  * This map has TPRO enabled
21454  */
21455 void
vm_map_set_tpro(vm_map_t map)21456 vm_map_set_tpro(vm_map_t map)
21457 {
21458 #if defined (__arm64e__)
21459 	pmap_set_tpro(map->pmap);
21460 #else /* arm64e */
21461 	(void) map;
21462 #endif
21463 }
21464 
21465 
21466 
21467 /*
21468  * Does this map have TPRO enforcement enabled
21469  */
21470 boolean_t
vm_map_tpro_enforcement(vm_map_t map)21471 vm_map_tpro_enforcement(vm_map_t map)
21472 {
21473 	return map->tpro_enforcement;
21474 }
21475 
21476 /*
21477  * Set TPRO enforcement for this map
21478  */
21479 void
vm_map_set_tpro_enforcement(vm_map_t map)21480 vm_map_set_tpro_enforcement(vm_map_t map)
21481 {
21482 	if (vm_map_tpro(map)) {
21483 		vm_map_lock(map);
21484 		map->tpro_enforcement = TRUE;
21485 		vm_map_unlock(map);
21486 	}
21487 }
21488 
21489 /*
21490  * Enable TPRO on the requested region
21491  *
21492  * Note:
21493  *     This routine is primarily intended to be called during/soon after map
21494  *     creation before the associated task has been released to run. It is only
21495  *     currently safe when we have no resident pages.
21496  */
21497 boolean_t
vm_map_set_tpro_range(__unused vm_map_t map,__unused vm_map_address_t start,__unused vm_map_address_t end)21498 vm_map_set_tpro_range(
21499 	__unused vm_map_t map,
21500 	__unused vm_map_address_t start,
21501 	__unused vm_map_address_t end)
21502 {
21503 	return TRUE;
21504 }
21505 
21506 /*
21507  * Expand the maximum size of an existing map.
21508  */
21509 void
vm_map_set_max_addr(vm_map_t map,vm_map_offset_t new_max_offset,__unused bool extra_jumbo)21510 vm_map_set_max_addr(
21511 	vm_map_t map,
21512 	vm_map_offset_t new_max_offset,
21513 	__unused bool extra_jumbo)
21514 {
21515 #if defined(__arm64__)
21516 	vm_map_offset_t max_supported_offset;
21517 	vm_map_offset_t old_max_offset;
21518 	unsigned int option = ARM_PMAP_MAX_OFFSET_JUMBO;
21519 
21520 	vm_map_lock(map);
21521 
21522 	old_max_offset = map->max_offset;
21523 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21524 	if (extra_jumbo) {
21525 		option = ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO;
21526 	}
21527 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21528 	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), option);
21529 
21530 	new_max_offset = trunc_page(new_max_offset);
21531 
21532 	/* The address space cannot be shrunk using this routine. */
21533 	if (old_max_offset >= new_max_offset) {
21534 		vm_map_unlock(map);
21535 		return;
21536 	}
21537 
21538 	if (max_supported_offset < new_max_offset) {
21539 		new_max_offset = max_supported_offset;
21540 	}
21541 
21542 	map->max_offset = new_max_offset;
21543 
21544 	/*
21545 	 * Disable the following chunk of code that extends the "holes" list
21546 	 * to accomodate a larger VM map.
21547 	 * In `vm_map_create_options()`, we now set the end of the "holes" list to
21548 	 * max(map->max_offset, MACH_VM_MAX_ADDRESS) for all platforms.
21549 	 * MACH_VM_MAX_ADDRESS is the largest virtual address a userspace process
21550 	 * can map, so any `new_max_offset` value will be <= MACH_VM_MAX_ADDRESS.
21551 	 * The "holes" list does not need to be adjusted.
21552 	 */
21553 #if 0
21554 	if (map->holelistenabled) {
21555 		if (map->holes_list->prev->vme_end == old_max_offset) {
21556 			/*
21557 			 * There is already a hole at the end of the map; simply make it bigger.
21558 			 */
21559 			map->holes_list->prev->vme_end = map->max_offset;
21560 		} else {
21561 			/*
21562 			 * There is no hole at the end, so we need to create a new hole
21563 			 * for the new empty space we're creating.
21564 			 */
21565 			struct vm_map_links *new_hole;
21566 
21567 			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
21568 			new_hole->start = old_max_offset;
21569 			new_hole->end = map->max_offset;
21570 			new_hole->prev = map->holes_list->prev;
21571 			new_hole->next = (struct vm_map_entry *)map->holes_list;
21572 			map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
21573 			map->holes_list->prev = (struct vm_map_entry *)new_hole;
21574 		}
21575 	}
21576 #endif
21577 
21578 	vm_map_unlock(map);
21579 #else
21580 	(void)map;
21581 	(void)new_max_offset;
21582 #endif
21583 }
21584 
21585 vm_map_offset_t
vm_compute_max_offset(boolean_t is64)21586 vm_compute_max_offset(boolean_t is64)
21587 {
21588 #if defined(__arm64__)
21589 	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
21590 #else
21591 	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
21592 #endif
21593 }
21594 
21595 void
vm_map_get_max_aslr_slide_section(vm_map_t map __unused,int64_t * max_sections,int64_t * section_size)21596 vm_map_get_max_aslr_slide_section(
21597 	vm_map_t                map __unused,
21598 	int64_t                 *max_sections,
21599 	int64_t                 *section_size)
21600 {
21601 #if defined(__arm64__)
21602 	*max_sections = 3;
21603 	*section_size = ARM_TT_TWIG_SIZE;
21604 #else
21605 	*max_sections = 1;
21606 	*section_size = 0;
21607 #endif
21608 }
21609 
21610 uint64_t
vm_map_get_max_aslr_slide_pages(vm_map_t map)21611 vm_map_get_max_aslr_slide_pages(vm_map_t map)
21612 {
21613 #if defined(__arm64__)
21614 	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
21615 	 * limited embedded address space; this is also meant to minimize pmap
21616 	 * memory usage on 16KB page systems.
21617 	 */
21618 	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
21619 #else
21620 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21621 #endif
21622 }
21623 
21624 uint64_t
vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)21625 vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
21626 {
21627 #if defined(__arm64__)
21628 	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
21629 	 * of independent entropy on 16KB page systems.
21630 	 */
21631 	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
21632 #else
21633 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21634 #endif
21635 }
21636 
21637 boolean_t
vm_map_is_64bit(vm_map_t map)21638 vm_map_is_64bit(
21639 	vm_map_t map)
21640 {
21641 	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
21642 }
21643 
21644 boolean_t
vm_map_has_hard_pagezero(vm_map_t map,vm_map_offset_t pagezero_size)21645 vm_map_has_hard_pagezero(
21646 	vm_map_t        map,
21647 	vm_map_offset_t pagezero_size)
21648 {
21649 	/*
21650 	 * XXX FBDP
21651 	 * We should lock the VM map (for read) here but we can get away
21652 	 * with it for now because there can't really be any race condition:
21653 	 * the VM map's min_offset is changed only when the VM map is created
21654 	 * and when the zero page is established (when the binary gets loaded),
21655 	 * and this routine gets called only when the task terminates and the
21656 	 * VM map is being torn down, and when a new map is created via
21657 	 * load_machfile()/execve().
21658 	 */
21659 	return map->min_offset >= pagezero_size;
21660 }
21661 
21662 /*
21663  * Raise a VM map's maximun offset.
21664  */
21665 kern_return_t
vm_map_raise_max_offset(vm_map_t map,vm_map_offset_t new_max_offset)21666 vm_map_raise_max_offset(
21667 	vm_map_t        map,
21668 	vm_map_offset_t new_max_offset)
21669 {
21670 	kern_return_t   ret;
21671 
21672 	vm_map_lock(map);
21673 	ret = KERN_INVALID_ADDRESS;
21674 
21675 	if (new_max_offset >= map->max_offset) {
21676 		if (!vm_map_is_64bit(map)) {
21677 			if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
21678 				map->max_offset = new_max_offset;
21679 				ret = KERN_SUCCESS;
21680 			}
21681 		} else {
21682 			if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
21683 				map->max_offset = new_max_offset;
21684 				ret = KERN_SUCCESS;
21685 			}
21686 		}
21687 	}
21688 
21689 	vm_map_unlock(map);
21690 	return ret;
21691 }
21692 
21693 
21694 /*
21695  * Raise a VM map's minimum offset.
21696  * To strictly enforce "page zero" reservation.
21697  */
21698 kern_return_t
vm_map_raise_min_offset(vm_map_t map,vm_map_offset_t new_min_offset)21699 vm_map_raise_min_offset(
21700 	vm_map_t        map,
21701 	vm_map_offset_t new_min_offset)
21702 {
21703 	vm_map_entry_t  first_entry;
21704 
21705 	new_min_offset = vm_map_round_page(new_min_offset,
21706 	    VM_MAP_PAGE_MASK(map));
21707 
21708 	vm_map_lock(map);
21709 
21710 	if (new_min_offset < map->min_offset) {
21711 		/*
21712 		 * Can't move min_offset backwards, as that would expose
21713 		 * a part of the address space that was previously, and for
21714 		 * possibly good reasons, inaccessible.
21715 		 */
21716 		vm_map_unlock(map);
21717 		return KERN_INVALID_ADDRESS;
21718 	}
21719 	if (new_min_offset >= map->max_offset) {
21720 		/* can't go beyond the end of the address space */
21721 		vm_map_unlock(map);
21722 		return KERN_INVALID_ADDRESS;
21723 	}
21724 
21725 	first_entry = vm_map_first_entry(map);
21726 	if (first_entry != vm_map_to_entry(map) &&
21727 	    first_entry->vme_start < new_min_offset) {
21728 		/*
21729 		 * Some memory was already allocated below the new
21730 		 * minimun offset.  It's too late to change it now...
21731 		 */
21732 		vm_map_unlock(map);
21733 		return KERN_NO_SPACE;
21734 	}
21735 
21736 	map->min_offset = new_min_offset;
21737 
21738 	if (map->holelistenabled) {
21739 		assert(map->holes_list);
21740 		map->holes_list->start = new_min_offset;
21741 		assert(new_min_offset < map->holes_list->end);
21742 	}
21743 
21744 	vm_map_unlock(map);
21745 
21746 	return KERN_SUCCESS;
21747 }
21748 
21749 /*
21750  * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21751  * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
21752  * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
21753  * have to reach over to the BSD data structures.
21754  */
21755 
21756 uint64_t vm_map_set_size_limit_count = 0;
21757 kern_return_t
vm_map_set_size_limit(vm_map_t map,uint64_t new_size_limit)21758 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
21759 {
21760 	kern_return_t kr;
21761 
21762 	vm_map_lock(map);
21763 	if (new_size_limit < map->size) {
21764 		/* new limit should not be lower than its current size */
21765 		DTRACE_VM2(vm_map_set_size_limit_fail,
21766 		    vm_map_size_t, map->size,
21767 		    uint64_t, new_size_limit);
21768 		kr = KERN_FAILURE;
21769 	} else if (new_size_limit == map->size_limit) {
21770 		/* no change */
21771 		kr = KERN_SUCCESS;
21772 	} else {
21773 		/* set new limit */
21774 		DTRACE_VM2(vm_map_set_size_limit,
21775 		    vm_map_size_t, map->size,
21776 		    uint64_t, new_size_limit);
21777 		if (new_size_limit != RLIM_INFINITY) {
21778 			vm_map_set_size_limit_count++;
21779 		}
21780 		map->size_limit = new_size_limit;
21781 		kr = KERN_SUCCESS;
21782 	}
21783 	vm_map_unlock(map);
21784 	return kr;
21785 }
21786 
21787 uint64_t vm_map_set_data_limit_count = 0;
21788 kern_return_t
vm_map_set_data_limit(vm_map_t map,uint64_t new_data_limit)21789 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
21790 {
21791 	kern_return_t kr;
21792 
21793 	vm_map_lock(map);
21794 	if (new_data_limit < map->size) {
21795 		/* new limit should not be lower than its current size */
21796 		DTRACE_VM2(vm_map_set_data_limit_fail,
21797 		    vm_map_size_t, map->size,
21798 		    uint64_t, new_data_limit);
21799 		kr = KERN_FAILURE;
21800 	} else if (new_data_limit == map->data_limit) {
21801 		/* no change */
21802 		kr = KERN_SUCCESS;
21803 	} else {
21804 		/* set new limit */
21805 		DTRACE_VM2(vm_map_set_data_limit,
21806 		    vm_map_size_t, map->size,
21807 		    uint64_t, new_data_limit);
21808 		if (new_data_limit != RLIM_INFINITY) {
21809 			vm_map_set_data_limit_count++;
21810 		}
21811 		map->data_limit = new_data_limit;
21812 		kr = KERN_SUCCESS;
21813 	}
21814 	vm_map_unlock(map);
21815 	return kr;
21816 }
21817 
21818 void
vm_map_set_user_wire_limit(vm_map_t map,vm_size_t limit)21819 vm_map_set_user_wire_limit(vm_map_t     map,
21820     vm_size_t    limit)
21821 {
21822 	vm_map_lock(map);
21823 	map->user_wire_limit = limit;
21824 	vm_map_unlock(map);
21825 }
21826 
21827 
21828 void
vm_map_switch_protect(vm_map_t map,boolean_t val)21829 vm_map_switch_protect(vm_map_t     map,
21830     boolean_t    val)
21831 {
21832 	vm_map_lock(map);
21833 	map->switch_protect = val;
21834 	vm_map_unlock(map);
21835 }
21836 
21837 extern int cs_process_enforcement_enable;
21838 boolean_t
vm_map_cs_enforcement(vm_map_t map)21839 vm_map_cs_enforcement(
21840 	vm_map_t map)
21841 {
21842 	if (cs_process_enforcement_enable) {
21843 		return TRUE;
21844 	}
21845 	return map->cs_enforcement;
21846 }
21847 
21848 kern_return_t
vm_map_cs_wx_enable(__unused vm_map_t map)21849 vm_map_cs_wx_enable(
21850 	__unused vm_map_t map)
21851 {
21852 #if CODE_SIGNING_MONITOR
21853 	kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
21854 	if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
21855 		return KERN_SUCCESS;
21856 	}
21857 	return ret;
21858 #else
21859 	/* The VM manages WX memory entirely on its own */
21860 	return KERN_SUCCESS;
21861 #endif
21862 }
21863 
21864 kern_return_t
vm_map_csm_allow_jit(__unused vm_map_t map)21865 vm_map_csm_allow_jit(
21866 	__unused vm_map_t map)
21867 {
21868 #if CODE_SIGNING_MONITOR
21869 	return csm_allow_jit_region(vm_map_pmap(map));
21870 #else
21871 	/* No code signing monitor to enforce JIT policy */
21872 	return KERN_SUCCESS;
21873 #endif
21874 }
21875 
21876 void
vm_map_cs_debugged_set(vm_map_t map,boolean_t val)21877 vm_map_cs_debugged_set(
21878 	vm_map_t map,
21879 	boolean_t val)
21880 {
21881 	vm_map_lock(map);
21882 	map->cs_debugged = val;
21883 	vm_map_unlock(map);
21884 }
21885 
21886 void
vm_map_cs_enforcement_set(vm_map_t map,boolean_t val)21887 vm_map_cs_enforcement_set(
21888 	vm_map_t map,
21889 	boolean_t val)
21890 {
21891 	vm_map_lock(map);
21892 	map->cs_enforcement = val;
21893 	pmap_set_vm_map_cs_enforced(map->pmap, val);
21894 	vm_map_unlock(map);
21895 }
21896 
21897 /*
21898  * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21899  * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21900  * bump both counters.
21901  */
21902 void
vm_map_iokit_mapped_region(vm_map_t map,vm_size_t bytes)21903 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
21904 {
21905 	pmap_t pmap = vm_map_pmap(map);
21906 
21907 	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21908 	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21909 }
21910 
21911 void
vm_map_iokit_unmapped_region(vm_map_t map,vm_size_t bytes)21912 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
21913 {
21914 	pmap_t pmap = vm_map_pmap(map);
21915 
21916 	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21917 	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21918 }
21919 
21920 /* Add (generate) code signature for memory range */
21921 #if CONFIG_DYNAMIC_CODE_SIGNING
21922 kern_return_t
vm_map_sign(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)21923 vm_map_sign(vm_map_t map,
21924     vm_map_offset_t start,
21925     vm_map_offset_t end)
21926 {
21927 	vm_map_entry_t entry;
21928 	vm_map_offset_t entry_start;
21929 	vm_object_offset_t entry_offset;
21930 	vm_page_t m;
21931 	vm_object_t object;
21932 
21933 	/*
21934 	 * Vet all the input parameters and current type and state of the
21935 	 * underlaying object.  Return with an error if anything is amiss.
21936 	 */
21937 	if (map == VM_MAP_NULL) {
21938 		return KERN_INVALID_ARGUMENT;
21939 	}
21940 
21941 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
21942 		return KERN_INVALID_ADDRESS;
21943 	}
21944 
21945 	vm_map_lock_read(map);
21946 
21947 	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
21948 		/*
21949 		 * Must pass a valid non-submap address.
21950 		 */
21951 		vm_map_unlock_read(map);
21952 		return KERN_INVALID_ADDRESS;
21953 	}
21954 
21955 	if ((entry->vme_start > start) || (entry->vme_end < end)) {
21956 		/*
21957 		 * Map entry doesn't cover the requested range. Not handling
21958 		 * this situation currently.
21959 		 */
21960 		vm_map_unlock_read(map);
21961 		return KERN_INVALID_ARGUMENT;
21962 	}
21963 
21964 	object = VME_OBJECT(entry);
21965 	if (object == VM_OBJECT_NULL) {
21966 		/*
21967 		 * Object must already be present or we can't sign.
21968 		 */
21969 		vm_map_unlock_read(map);
21970 		return KERN_INVALID_ARGUMENT;
21971 	}
21972 
21973 	vm_object_lock(object);
21974 
21975 	entry_start = entry->vme_start;
21976 	entry_offset = VME_OFFSET(entry);
21977 	vm_map_unlock_read(map);
21978 	entry = VM_MAP_ENTRY_NULL; /* no longer valid after unlocking map */
21979 
21980 	while (start < end) {
21981 		uint32_t refmod;
21982 
21983 		m = vm_page_lookup(object,
21984 		    start - entry_start + entry_offset);
21985 		if (m == VM_PAGE_NULL) {
21986 			/* shoud we try to fault a page here? we can probably
21987 			 * demand it exists and is locked for this request */
21988 			vm_object_unlock(object);
21989 			return KERN_FAILURE;
21990 		}
21991 		/* deal with special page status */
21992 		if (m->vmp_busy ||
21993 		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart ||
21994 		    vm_page_is_private(m) || m->vmp_absent))) {
21995 			vm_object_unlock(object);
21996 			return KERN_FAILURE;
21997 		}
21998 
21999 		/* Page is OK... now "validate" it */
22000 		/* This is the place where we'll call out to create a code
22001 		 * directory, later */
22002 		/* XXX TODO4K: deal with 4k subpages individually? */
22003 		m->vmp_cs_validated = VMP_CS_ALL_TRUE;
22004 
22005 		/* The page is now "clean" for codesigning purposes. That means
22006 		 * we don't consider it as modified (wpmapped) anymore. But
22007 		 * we'll disconnect the page so we note any future modification
22008 		 * attempts. */
22009 		m->vmp_wpmapped = FALSE;
22010 		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
22011 
22012 		/* Pull the dirty status from the pmap, since we cleared the
22013 		 * wpmapped bit */
22014 		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
22015 			SET_PAGE_DIRTY(m, FALSE);
22016 		}
22017 
22018 		/* On to the next page */
22019 		start += PAGE_SIZE;
22020 	}
22021 	vm_object_unlock(object);
22022 
22023 	return KERN_SUCCESS;
22024 }
22025 #endif
22026 
22027 kern_return_t
vm_map_partial_reap(vm_map_t map,unsigned int * reclaimed_resident,unsigned int * reclaimed_compressed)22028 vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
22029 {
22030 	vm_map_entry_t  entry = VM_MAP_ENTRY_NULL;
22031 	vm_map_entry_t  next_entry;
22032 	kern_return_t   kr = KERN_SUCCESS;
22033 	VM_MAP_ZAP_DECLARE(zap_list);
22034 
22035 	vm_map_lock(map);
22036 
22037 	for (entry = vm_map_first_entry(map);
22038 	    entry != vm_map_to_entry(map);
22039 	    entry = next_entry) {
22040 		next_entry = entry->vme_next;
22041 
22042 		if (!entry->is_sub_map &&
22043 		    VME_OBJECT(entry) &&
22044 		    (VME_OBJECT(entry)->internal == TRUE) &&
22045 		    (os_ref_get_count_raw(&VME_OBJECT(entry)->ref_count) == 1)) {
22046 			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
22047 			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);
22048 
22049 			(void)vm_map_delete(map, entry->vme_start,
22050 			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
22051 			    KMEM_GUARD_NONE, &zap_list);
22052 		}
22053 	}
22054 
22055 	vm_map_unlock(map);
22056 
22057 	vm_map_zap_dispose(&zap_list);
22058 
22059 	return kr;
22060 }
22061 
22062 
22063 #if DEVELOPMENT || DEBUG
22064 
22065 int
vm_map_disconnect_page_mappings(vm_map_t map,boolean_t do_unnest)22066 vm_map_disconnect_page_mappings(
22067 	vm_map_t map,
22068 	boolean_t do_unnest)
22069 {
22070 	vm_map_entry_t entry;
22071 	ledger_amount_t byte_count = 0;
22072 
22073 	if (do_unnest == TRUE) {
22074 #ifndef NO_NESTED_PMAP
22075 		vm_map_lock(map);
22076 
22077 		for (entry = vm_map_first_entry(map);
22078 		    entry != vm_map_to_entry(map);
22079 		    entry = entry->vme_next) {
22080 			if (entry->is_sub_map && entry->use_pmap) {
22081 				/*
22082 				 * Make sure the range between the start of this entry and
22083 				 * the end of this entry is no longer nested, so that
22084 				 * we will only remove mappings from the pmap in use by this
22085 				 * this task
22086 				 */
22087 				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
22088 			}
22089 		}
22090 		vm_map_unlock(map);
22091 #endif
22092 	}
22093 	vm_map_lock_read(map);
22094 
22095 	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);
22096 
22097 	for (entry = vm_map_first_entry(map);
22098 	    entry != vm_map_to_entry(map);
22099 	    entry = entry->vme_next) {
22100 		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
22101 		    (VME_OBJECT(entry)->phys_contiguous))) {
22102 			continue;
22103 		}
22104 		if (entry->is_sub_map) {
22105 			assert(!entry->use_pmap);
22106 		}
22107 
22108 		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
22109 	}
22110 	vm_map_unlock_read(map);
22111 
22112 	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
22113 }
22114 
22115 kern_return_t
vm_map_inject_error(vm_map_t map,vm_map_offset_t vaddr)22116 vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
22117 {
22118 	vm_object_t object = NULL;
22119 	vm_object_offset_t offset;
22120 	vm_prot_t prot;
22121 	boolean_t wired;
22122 	vm_map_version_t version;
22123 	vm_map_t real_map;
22124 	int result = KERN_FAILURE;
22125 
22126 	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
22127 	vm_map_lock(map);
22128 
22129 	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
22130 	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
22131 	    NULL, &real_map, NULL);
22132 	if (object == NULL) {
22133 		result = KERN_MEMORY_ERROR;
22134 	} else if (object->pager) {
22135 		result = vm_compressor_pager_inject_error(object->pager,
22136 		    offset);
22137 	} else {
22138 		result = KERN_MEMORY_PRESENT;
22139 	}
22140 
22141 	if (object != NULL) {
22142 		vm_object_unlock(object);
22143 	}
22144 
22145 	if (real_map != map) {
22146 		vm_map_unlock(real_map);
22147 	}
22148 	vm_map_unlock(map);
22149 
22150 	return result;
22151 }
22152 
22153 /* iterate over map entries. Call the first argument block for the number of entries and the second for every entry
22154  * returns: KERN_SUCCESS if iteration completed ok,
22155  *      error code if callback returned an error
22156  *      KERN_FAILURE if there was a race of adding/removing entries during the iteration and the number of entries
22157  *      iterated is different from the number in the first call
22158  */
22159 static kern_return_t
22160 vm_map_entries_foreach_locked(vm_map_t map, kern_return_t (^count_handler)(int nentries),
22161     kern_return_t (^entry_handler)(void* entry))
22162 {
22163 	vm_map_lock_assert_held(map);
22164 	int nentries = map->hdr.nentries;
22165 	kern_return_t error = count_handler(nentries);
22166 	if (error) {
22167 		return error;
22168 	}
22169 
22170 	/* iterate until we loop back to the map, see get_vmmap_entries() */
22171 	vm_map_entry_t entry = vm_map_first_entry(map);
22172 	int count = 0;
22173 	while (entry != vm_map_to_entry(map)) {
22174 		error = entry_handler(entry);
22175 		if (error != KERN_SUCCESS) {
22176 			return error;
22177 		}
22178 		entry = entry->vme_next;
22179 		++count;
22180 		if (count > nentries) {
22181 			/* nentries and entries iteration don't agree on how many entries there are, shouldn't really happen */
22182 			return KERN_FAILURE;
22183 		}
22184 	}
22185 	if (count < nentries) {
22186 		return KERN_FAILURE;
22187 	}
22188 	return KERN_SUCCESS;
22189 }
22190 
22191 kern_return_t
22192 vm_map_entries_foreach(vm_map_t map, kern_return_t (^count_handler)(int nentries),
22193     kern_return_t (^entry_handler)(void* entry))
22194 {
22195 	vm_map_lock_read(map);
22196 	kern_return_t error = vm_map_entries_foreach_locked(map, count_handler, entry_handler);
22197 	vm_map_unlock_read(map);
22198 	return error;
22199 }
22200 
22201 /*
22202  * Dump info about the entry into the given buffer.
22203  * return true on success, false if there was not enough space in the give buffer
22204  * argument size in: bytes free in the given buffer, out: bytes written
22205  */
22206 kern_return_t
vm_map_dump_entry_and_compressor_pager(void * pentry,char * buf,size_t * size)22207 vm_map_dump_entry_and_compressor_pager(void* pentry, char *buf, size_t *size)
22208 {
22209 	size_t insize = *size;
22210 	kern_return_t kr;
22211 	size_t offset = 0;
22212 
22213 	*size = 0;
22214 	if (sizeof(struct vm_map_entry_info) > insize) {
22215 		return KERN_NO_SPACE;
22216 	}
22217 
22218 	vm_map_entry_t entry = (vm_map_entry_t)pentry;
22219 	struct vm_map_entry_info *out_entry = (struct vm_map_entry_info*)buf;
22220 	out_entry->vmei_start = entry->vme_start;
22221 	out_entry->vmei_end = entry->vme_end;
22222 	out_entry->vmei_alias = VME_ALIAS(entry);
22223 	out_entry->vmei_offset = VME_OFFSET(entry);
22224 	out_entry->vmei_is_sub_map = entry->is_sub_map;
22225 	out_entry->vmei_protection = entry->protection;
22226 	offset += sizeof(struct vm_map_entry_info);
22227 
22228 	out_entry->vmei_slot_mapping_count = 0;
22229 	out_entry->vmei_is_compressor_pager = false;
22230 	*size = offset;
22231 	if (out_entry->vmei_is_sub_map) {
22232 		return KERN_SUCCESS; // TODO: sub_map interrogation not supported yet
22233 	}
22234 	/* have a vm_object? */
22235 	vm_object_t object = VME_OBJECT(entry);
22236 	if (object == VM_OBJECT_NULL || !object->internal) {
22237 		return KERN_SUCCESS;
22238 	}
22239 	/* objects has a pager? */
22240 	memory_object_t pager = object->pager;
22241 	if (pager != MEMORY_OBJECT_NULL) {
22242 		return KERN_SUCCESS;
22243 	}
22244 	bool is_compressor = false;
22245 	unsigned int slot_mapping_count = 0;
22246 	size_t pager_info_size = insize - offset;
22247 	kr = vm_compressor_pager_dump(pager, buf + offset, &pager_info_size, &is_compressor, &slot_mapping_count);
22248 	if (kr != KERN_SUCCESS) {
22249 		/* didn't have enough space for everything we want to write, caller needs to retry */
22250 		return kr;
22251 	}
22252 	offset += pager_info_size;
22253 	/* if we got here, is_compressor should be true due to the object->internal check above, so this assignment
22254 	 * is just for sanity sake */
22255 	out_entry->vmei_is_compressor_pager = is_compressor;
22256 	out_entry->vmei_slot_mapping_count = slot_mapping_count;
22257 	*size = offset;
22258 	return KERN_SUCCESS;
22259 }
22260 
22261 
22262 #endif
22263 
22264 
22265 #if CONFIG_FREEZE
22266 
22267 
22268 extern struct freezer_context freezer_context_global;
22269 AbsoluteTime c_freezer_last_yield_ts = 0;
22270 
22271 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
22272 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
22273 
22274 kern_return_t
vm_map_freeze(task_t task,unsigned int * purgeable_count,unsigned int * wired_count,unsigned int * clean_count,unsigned int * dirty_count,unsigned int dirty_budget,unsigned int * shared_count,int * freezer_error_code,boolean_t eval_only)22275 vm_map_freeze(
22276 	task_t       task,
22277 	unsigned int *purgeable_count,
22278 	unsigned int *wired_count,
22279 	unsigned int *clean_count,
22280 	unsigned int *dirty_count,
22281 	unsigned int dirty_budget,
22282 	unsigned int *shared_count,
22283 	int          *freezer_error_code,
22284 	boolean_t    eval_only)
22285 {
22286 	vm_map_entry_t  entry2 = VM_MAP_ENTRY_NULL;
22287 	kern_return_t   kr = KERN_SUCCESS;
22288 	boolean_t       evaluation_phase = TRUE;
22289 	vm_object_t     cur_shared_object = NULL;
22290 	int             cur_shared_obj_ref_cnt = 0;
22291 	unsigned int    dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
22292 
22293 	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
22294 
22295 	/*
22296 	 * We need the exclusive lock here so that we can
22297 	 * block any page faults or lookups while we are
22298 	 * in the middle of freezing this vm map.
22299 	 */
22300 	vm_map_t map = task->map;
22301 
22302 	vm_map_lock(map);
22303 
22304 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
22305 
22306 	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
22307 		if (vm_compressor_low_on_space()) {
22308 			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
22309 		}
22310 
22311 		if (vm_swap_low_on_space()) {
22312 			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
22313 		}
22314 
22315 		kr = KERN_NO_SPACE;
22316 		goto done;
22317 	}
22318 
22319 	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
22320 		/*
22321 		 * In-memory compressor backing the freezer. No disk.
22322 		 * So no need to do the evaluation phase.
22323 		 */
22324 		evaluation_phase = FALSE;
22325 
22326 		if (eval_only == TRUE) {
22327 			/*
22328 			 * We don't support 'eval_only' mode
22329 			 * in this non-swap config.
22330 			 */
22331 			*freezer_error_code = FREEZER_ERROR_GENERIC;
22332 			kr = KERN_INVALID_ARGUMENT;
22333 			goto done;
22334 		}
22335 
22336 		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
22337 		clock_get_uptime(&c_freezer_last_yield_ts);
22338 	}
22339 again:
22340 
22341 	for (entry2 = vm_map_first_entry(map);
22342 	    entry2 != vm_map_to_entry(map);
22343 	    entry2 = entry2->vme_next) {
22344 		vm_object_t src_object;
22345 
22346 		if (entry2->is_sub_map) {
22347 			continue;
22348 		}
22349 
22350 		src_object = VME_OBJECT(entry2);
22351 		if (!src_object ||
22352 		    src_object->phys_contiguous ||
22353 		    !src_object->internal) {
22354 			continue;
22355 		}
22356 
22357 		/* If eligible, scan the entry, moving eligible pages over to our parent object */
22358 
22359 		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
22360 			/*
22361 			 * We skip purgeable objects during evaluation phase only.
22362 			 * If we decide to freeze this process, we'll explicitly
22363 			 * purge these objects before we go around again with
22364 			 * 'evaluation_phase' set to FALSE.
22365 			 */
22366 
22367 			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
22368 				/*
22369 				 * We want to purge objects that may not belong to this task but are mapped
22370 				 * in this task alone. Since we already purged this task's purgeable memory
22371 				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
22372 				 * on this task's purgeable objects. Hence the check for only volatile objects.
22373 				 */
22374 				if (evaluation_phase ||
22375 				    src_object->purgable != VM_PURGABLE_VOLATILE ||
22376 				    os_ref_get_count_raw(&src_object->ref_count) != 1) {
22377 					continue;
22378 				}
22379 				vm_object_lock(src_object);
22380 				if (src_object->purgable == VM_PURGABLE_VOLATILE &&
22381 				    os_ref_get_count_raw(&src_object->ref_count) == 1) {
22382 					purgeable_q_t old_queue;
22383 
22384 					/* object should be on a purgeable queue */
22385 					assert(src_object->objq.next != NULL &&
22386 					    src_object->objq.prev != NULL);
22387 					/* move object from its volatile queue to the nonvolatile queue */
22388 					old_queue = vm_purgeable_object_remove(src_object);
22389 					assert(old_queue);
22390 					if (src_object->purgeable_when_ripe) {
22391 						/* remove a token from that volatile queue */
22392 						vm_page_lock_queues();
22393 						vm_purgeable_token_delete_first(old_queue);
22394 						vm_page_unlock_queues();
22395 					}
22396 					/* purge the object */
22397 					vm_object_purge(src_object, 0);
22398 				}
22399 				vm_object_unlock(src_object);
22400 				continue;
22401 			}
22402 
22403 			/*
22404 			 * Pages belonging to this object could be swapped to disk.
22405 			 * Make sure it's not a shared object because we could end
22406 			 * up just bringing it back in again.
22407 			 *
22408 			 * We try to optimize somewhat by checking for objects that are mapped
22409 			 * more than once within our own map. But we don't do full searches,
22410 			 * we just look at the entries following our current entry.
22411 			 */
22412 
22413 			if (os_ref_get_count_raw(&src_object->ref_count) > 1) {
22414 				if (src_object != cur_shared_object) {
22415 					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
22416 					dirty_shared_count += obj_pages_snapshot;
22417 
22418 					cur_shared_object = src_object;
22419 					cur_shared_obj_ref_cnt = 1;
22420 					continue;
22421 				} else {
22422 					cur_shared_obj_ref_cnt++;
22423 					if (os_ref_get_count_raw(&src_object->ref_count) == cur_shared_obj_ref_cnt) {
22424 						/*
22425 						 * Fall through to below and treat this object as private.
22426 						 * So deduct its pages from our shared total and add it to the
22427 						 * private total.
22428 						 */
22429 
22430 						dirty_shared_count -= obj_pages_snapshot;
22431 						dirty_private_count += obj_pages_snapshot;
22432 					} else {
22433 						continue;
22434 					}
22435 				}
22436 			}
22437 
22438 
22439 			if (os_ref_get_count_raw(&src_object->ref_count) == 1) {
22440 				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
22441 			}
22442 
22443 			if (evaluation_phase == TRUE) {
22444 				continue;
22445 			}
22446 		}
22447 
22448 		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
22449 		*wired_count += src_object->wired_page_count;
22450 
22451 		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
22452 			if (vm_compressor_low_on_space()) {
22453 				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
22454 			}
22455 
22456 			if (vm_swap_low_on_space()) {
22457 				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
22458 			}
22459 
22460 			kr = KERN_NO_SPACE;
22461 			break;
22462 		}
22463 		if (paged_out_count >= dirty_budget) {
22464 			break;
22465 		}
22466 		dirty_budget -= paged_out_count;
22467 	}
22468 
22469 	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
22470 	if (evaluation_phase) {
22471 		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
22472 
22473 		if (dirty_shared_count > shared_pages_threshold) {
22474 			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
22475 			kr = KERN_FAILURE;
22476 			goto done;
22477 		}
22478 
22479 		if (dirty_shared_count &&
22480 		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
22481 			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
22482 			kr = KERN_FAILURE;
22483 			goto done;
22484 		}
22485 
22486 		evaluation_phase = FALSE;
22487 		dirty_shared_count = dirty_private_count = 0;
22488 
22489 		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
22490 		clock_get_uptime(&c_freezer_last_yield_ts);
22491 
22492 		if (eval_only) {
22493 			kr = KERN_SUCCESS;
22494 			goto done;
22495 		}
22496 
22497 		vm_purgeable_purge_task_owned(task);
22498 
22499 		goto again;
22500 	} else {
22501 		kr = KERN_SUCCESS;
22502 	}
22503 
22504 done:
22505 	vm_map_unlock(map);
22506 
22507 	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
22508 		vm_object_compressed_freezer_done();
22509 	}
22510 	return kr;
22511 }
22512 
22513 #endif
22514 
22515 /*
22516  * vm_map_entry_should_cow_for_true_share:
22517  *
22518  * Determines if the map entry should be clipped and setup for copy-on-write
22519  * to avoid applying "true_share" to a large VM object when only a subset is
22520  * targeted.
22521  *
22522  * For now, we target only the map entries created for the Objective C
22523  * Garbage Collector, which initially have the following properties:
22524  *	- alias == VM_MEMORY_MALLOC
22525  *      - wired_count == 0
22526  *      - !needs_copy
22527  * and a VM object with:
22528  *      - internal
22529  *      - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
22530  *      - !true_share
22531  *      - vo_size == ANON_CHUNK_SIZE
22532  *
22533  * Only non-kernel map entries.
22534  */
22535 boolean_t
vm_map_entry_should_cow_for_true_share(vm_map_entry_t entry)22536 vm_map_entry_should_cow_for_true_share(
22537 	vm_map_entry_t  entry)
22538 {
22539 	vm_object_t     object;
22540 
22541 	if (entry->is_sub_map) {
22542 		/* entry does not point at a VM object */
22543 		return FALSE;
22544 	}
22545 
22546 	if (entry->needs_copy) {
22547 		/* already set for copy_on_write: done! */
22548 		return FALSE;
22549 	}
22550 
22551 	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
22552 	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
22553 		/* not a malloc heap or Obj-C Garbage Collector heap */
22554 		return FALSE;
22555 	}
22556 
22557 	if (entry->wired_count) {
22558 		/* wired: can't change the map entry... */
22559 		vm_counters.should_cow_but_wired++;
22560 		return FALSE;
22561 	}
22562 
22563 	object = VME_OBJECT(entry);
22564 
22565 	if (object == VM_OBJECT_NULL) {
22566 		/* no object yet... */
22567 		return FALSE;
22568 	}
22569 
22570 	if (!object->internal) {
22571 		/* not an internal object */
22572 		return FALSE;
22573 	}
22574 
22575 	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
22576 		/* not the default copy strategy */
22577 		return FALSE;
22578 	}
22579 
22580 	if (object->true_share) {
22581 		/* already true_share: too late to avoid it */
22582 		return FALSE;
22583 	}
22584 
22585 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
22586 	    object->vo_size != ANON_CHUNK_SIZE) {
22587 		/* ... not an object created for the ObjC Garbage Collector */
22588 		return FALSE;
22589 	}
22590 
22591 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
22592 	    object->vo_size != 2048 * 4096) {
22593 		/* ... not a "MALLOC_SMALL" heap */
22594 		return FALSE;
22595 	}
22596 
22597 	/*
22598 	 * All the criteria match: we have a large object being targeted for "true_share".
22599 	 * To limit the adverse side-effects linked with "true_share", tell the caller to
22600 	 * try and avoid setting up the entire object for "true_share" by clipping the
22601 	 * targeted range and setting it up for copy-on-write.
22602 	 */
22603 	return TRUE;
22604 }
22605 
22606 uint64_t vm_map_range_overflows_count = 0;
22607 TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
22608 bool
vm_map_range_overflows(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size)22609 vm_map_range_overflows(
22610 	vm_map_t map,
22611 	vm_map_offset_t addr,
22612 	vm_map_size_t size)
22613 {
22614 	vm_map_offset_t start, end, sum;
22615 	vm_map_offset_t pgmask;
22616 
22617 	if (size == 0) {
22618 		/* empty range -> no overflow */
22619 		return false;
22620 	}
22621 	pgmask = vm_map_page_mask(map);
22622 	start = vm_map_trunc_page_mask(addr, pgmask);
22623 	end = vm_map_round_page_mask(addr + size, pgmask);
22624 	if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
22625 		vm_map_range_overflows_count++;
22626 		if (vm_map_range_overflows_log) {
22627 			printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
22628 			    proc_selfpid(),
22629 			    proc_best_name(current_proc()),
22630 			    (uint64_t)addr,
22631 			    (uint64_t)size,
22632 			    (uint64_t)pgmask);
22633 		}
22634 		DTRACE_VM4(vm_map_range_overflows,
22635 		    vm_map_t, map,
22636 		    uint32_t, pgmask,
22637 		    uint64_t, (uint64_t)addr,
22638 		    uint64_t, (uint64_t)size);
22639 		return true;
22640 	}
22641 	return false;
22642 }
22643 
22644 vm_map_offset_t
vm_map_round_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)22645 vm_map_round_page_mask(
22646 	vm_map_offset_t offset,
22647 	vm_map_offset_t mask)
22648 {
22649 	return VM_MAP_ROUND_PAGE(offset, mask);
22650 }
22651 
22652 vm_map_offset_t
vm_map_trunc_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)22653 vm_map_trunc_page_mask(
22654 	vm_map_offset_t offset,
22655 	vm_map_offset_t mask)
22656 {
22657 	return VM_MAP_TRUNC_PAGE(offset, mask);
22658 }
22659 
22660 boolean_t
vm_map_page_aligned(vm_map_offset_t offset,vm_map_offset_t mask)22661 vm_map_page_aligned(
22662 	vm_map_offset_t offset,
22663 	vm_map_offset_t mask)
22664 {
22665 	return ((offset) & mask) == 0;
22666 }
22667 
22668 int
vm_map_page_shift(vm_map_t map)22669 vm_map_page_shift(
22670 	vm_map_t map)
22671 {
22672 	return VM_MAP_PAGE_SHIFT(map);
22673 }
22674 
22675 int
vm_map_page_size(vm_map_t map)22676 vm_map_page_size(
22677 	vm_map_t map)
22678 {
22679 	return VM_MAP_PAGE_SIZE(map);
22680 }
22681 
22682 vm_map_offset_t
vm_map_page_mask(vm_map_t map)22683 vm_map_page_mask(
22684 	vm_map_t map)
22685 {
22686 	return VM_MAP_PAGE_MASK(map);
22687 }
22688 
22689 kern_return_t
vm_map_set_page_shift(vm_map_t map,int pageshift)22690 vm_map_set_page_shift(
22691 	vm_map_t        map,
22692 	int             pageshift)
22693 {
22694 	if (map->hdr.nentries != 0) {
22695 		/* too late to change page size */
22696 		return KERN_FAILURE;
22697 	}
22698 
22699 	map->hdr.page_shift = (uint16_t)pageshift;
22700 
22701 	return KERN_SUCCESS;
22702 }
22703 
22704 kern_return_t
vm_map_query_volatile(vm_map_t map,mach_vm_size_t * volatile_virtual_size_p,mach_vm_size_t * volatile_resident_size_p,mach_vm_size_t * volatile_compressed_size_p,mach_vm_size_t * volatile_pmap_size_p,mach_vm_size_t * volatile_compressed_pmap_size_p)22705 vm_map_query_volatile(
22706 	vm_map_t        map,
22707 	mach_vm_size_t  *volatile_virtual_size_p,
22708 	mach_vm_size_t  *volatile_resident_size_p,
22709 	mach_vm_size_t  *volatile_compressed_size_p,
22710 	mach_vm_size_t  *volatile_pmap_size_p,
22711 	mach_vm_size_t  *volatile_compressed_pmap_size_p)
22712 {
22713 	mach_vm_size_t  volatile_virtual_size;
22714 	mach_vm_size_t  volatile_resident_count;
22715 	mach_vm_size_t  volatile_compressed_count;
22716 	mach_vm_size_t  volatile_pmap_count;
22717 	mach_vm_size_t  volatile_compressed_pmap_count;
22718 	mach_vm_size_t  resident_count;
22719 	vm_map_entry_t  entry;
22720 	vm_object_t     object;
22721 
22722 	/* map should be locked by caller */
22723 
22724 	volatile_virtual_size = 0;
22725 	volatile_resident_count = 0;
22726 	volatile_compressed_count = 0;
22727 	volatile_pmap_count = 0;
22728 	volatile_compressed_pmap_count = 0;
22729 
22730 	for (entry = vm_map_first_entry(map);
22731 	    entry != vm_map_to_entry(map);
22732 	    entry = entry->vme_next) {
22733 		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;
22734 
22735 		if (entry->is_sub_map) {
22736 			continue;
22737 		}
22738 		if (!(entry->protection & VM_PROT_WRITE)) {
22739 			continue;
22740 		}
22741 		object = VME_OBJECT(entry);
22742 		if (object == VM_OBJECT_NULL) {
22743 			continue;
22744 		}
22745 		if (object->purgable != VM_PURGABLE_VOLATILE &&
22746 		    object->purgable != VM_PURGABLE_EMPTY) {
22747 			continue;
22748 		}
22749 		if (VME_OFFSET(entry)) {
22750 			/*
22751 			 * If the map entry has been split and the object now
22752 			 * appears several times in the VM map, we don't want
22753 			 * to count the object's resident_page_count more than
22754 			 * once.  We count it only for the first one, starting
22755 			 * at offset 0 and ignore the other VM map entries.
22756 			 */
22757 			continue;
22758 		}
22759 		resident_count = object->resident_page_count;
22760 		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
22761 			resident_count = 0;
22762 		} else {
22763 			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
22764 		}
22765 
22766 		volatile_virtual_size += entry->vme_end - entry->vme_start;
22767 		volatile_resident_count += resident_count;
22768 		if (object->pager) {
22769 			volatile_compressed_count +=
22770 			    vm_compressor_pager_get_count(object->pager);
22771 		}
22772 		pmap_compressed_bytes = 0;
22773 		pmap_resident_bytes =
22774 		    pmap_query_resident(map->pmap,
22775 		    entry->vme_start,
22776 		    entry->vme_end,
22777 		    &pmap_compressed_bytes);
22778 		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
22779 		volatile_compressed_pmap_count += (pmap_compressed_bytes
22780 		    / PAGE_SIZE);
22781 	}
22782 
22783 	/* map is still locked on return */
22784 
22785 	*volatile_virtual_size_p = volatile_virtual_size;
22786 	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
22787 	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
22788 	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
22789 	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;
22790 
22791 	return KERN_SUCCESS;
22792 }
22793 
22794 void
vm_map_sizes(vm_map_t map,vm_map_size_t * psize,vm_map_size_t * pfree,vm_map_size_t * plargest_free)22795 vm_map_sizes(vm_map_t map,
22796     vm_map_size_t * psize,
22797     vm_map_size_t * pfree,
22798     vm_map_size_t * plargest_free)
22799 {
22800 	vm_map_entry_t  entry;
22801 	vm_map_offset_t prev;
22802 	vm_map_size_t   free, total_free, largest_free;
22803 	boolean_t       end;
22804 
22805 	if (!map) {
22806 		*psize = *pfree = *plargest_free = 0;
22807 		return;
22808 	}
22809 	total_free = largest_free = 0;
22810 
22811 	vm_map_lock_read(map);
22812 	if (psize) {
22813 		*psize = map->max_offset - map->min_offset;
22814 	}
22815 
22816 	prev = map->min_offset;
22817 	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
22818 		end = (entry == vm_map_to_entry(map));
22819 
22820 		if (end) {
22821 			free = entry->vme_end   - prev;
22822 		} else {
22823 			free = entry->vme_start - prev;
22824 		}
22825 
22826 		total_free += free;
22827 		if (free > largest_free) {
22828 			largest_free = free;
22829 		}
22830 
22831 		if (end) {
22832 			break;
22833 		}
22834 		prev = entry->vme_end;
22835 	}
22836 	vm_map_unlock_read(map);
22837 	if (pfree) {
22838 		*pfree = total_free;
22839 	}
22840 	if (plargest_free) {
22841 		*plargest_free = largest_free;
22842 	}
22843 }
22844 
22845 #if VM_SCAN_FOR_SHADOW_CHAIN
22846 int
vm_map_shadow_max(vm_map_t map)22847 vm_map_shadow_max(
22848 	vm_map_t map)
22849 {
22850 	int             shadows, shadows_max;
22851 	vm_map_entry_t  entry;
22852 	vm_object_t     object, next_object;
22853 
22854 	if (map == NULL) {
22855 		return 0;
22856 	}
22857 
22858 	shadows_max = 0;
22859 
22860 	vm_map_lock_read(map);
22861 
22862 	for (entry = vm_map_first_entry(map);
22863 	    entry != vm_map_to_entry(map);
22864 	    entry = entry->vme_next) {
22865 		if (entry->is_sub_map) {
22866 			continue;
22867 		}
22868 		object = VME_OBJECT(entry);
22869 		if (object == NULL) {
22870 			continue;
22871 		}
22872 		vm_object_lock_shared(object);
22873 		for (shadows = 0;
22874 		    object->shadow != NULL;
22875 		    shadows++, object = next_object) {
22876 			next_object = object->shadow;
22877 			vm_object_lock_shared(next_object);
22878 			vm_object_unlock(object);
22879 		}
22880 		vm_object_unlock(object);
22881 		if (shadows > shadows_max) {
22882 			shadows_max = shadows;
22883 		}
22884 	}
22885 
22886 	vm_map_unlock_read(map);
22887 
22888 	return shadows_max;
22889 }
22890 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22891 
22892 void
vm_commit_pagezero_status(vm_map_t lmap)22893 vm_commit_pagezero_status(vm_map_t lmap)
22894 {
22895 	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
22896 }
22897 
22898 #if __x86_64__
22899 void
vm_map_set_high_start(vm_map_t map,vm_map_offset_t high_start)22900 vm_map_set_high_start(
22901 	vm_map_t        map,
22902 	vm_map_offset_t high_start)
22903 {
22904 	map->vmmap_high_start = high_start;
22905 }
22906 #endif /* __x86_64__ */
22907 
22908 #if CODE_SIGNING_MONITOR
22909 
22910 kern_return_t
vm_map_entry_cs_associate(vm_map_t map,vm_map_entry_t entry,vm_map_kernel_flags_t vmk_flags)22911 vm_map_entry_cs_associate(
22912 	vm_map_t                map,
22913 	vm_map_entry_t          entry,
22914 	vm_map_kernel_flags_t   vmk_flags)
22915 {
22916 	vm_object_t cs_object, cs_shadow, backing_object;
22917 	vm_object_offset_t cs_offset, backing_offset;
22918 	void *cs_blobs;
22919 	struct vnode *cs_vnode;
22920 	kern_return_t cs_ret;
22921 
22922 	if (map->pmap == NULL ||
22923 	    entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
22924 	    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
22925 	    VME_OBJECT(entry) == VM_OBJECT_NULL) {
22926 		return KERN_SUCCESS;
22927 	}
22928 
22929 	if (!(entry->protection & VM_PROT_EXECUTE)) {
22930 		/*
22931 		 * This memory region is not executable, so the code-signing
22932 		 * monitor would usually not care about it...
22933 		 */
22934 		if (vmk_flags.vmkf_remap_prot_copy &&
22935 		    (entry->max_protection & VM_PROT_EXECUTE)) {
22936 			/*
22937 			 * ... except if the memory region is being remapped
22938 			 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
22939 			 * which is what a debugger or dtrace would be doing
22940 			 * to prepare to modify an executable page to insert
22941 			 * a breakpoint or activate a probe.
22942 			 * In that case, fall through so that we can mark
22943 			 * this region as being "debugged" and no longer
22944 			 * strictly code-signed.
22945 			 */
22946 		} else {
22947 			/*
22948 			 * Really not executable, so no need to tell the
22949 			 * code-signing monitor.
22950 			 */
22951 			return KERN_SUCCESS;
22952 		}
22953 	}
22954 
22955 	vm_map_lock_assert_exclusive(map);
22956 
22957 	/*
22958 	 * Check for a debug association mapping before we check for used_for_jit. This
22959 	 * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG
22960 	 * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT
22961 	 * since they are mapped with RW or RX permissions, which the page table monitor
22962 	 * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG,
22963 	 * they will be mapped as USER_EXEC, and that will cause another page table monitor
22964 	 * violation when those USER_EXEC pages are mapped as RW.
22965 	 *
22966 	 * Since these pages switch between RW and RX through mprotect, they mimic what
22967 	 * we expect a debugger to do. As the code signing monitor does not enforce mappings
22968 	 * on macOS systems, this works in our favor here and allows us to continue to
22969 	 * support these legacy-programmed applications without sacrificing security on
22970 	 * the page table or the code signing monitor. We don't need to explicitly check
22971 	 * for entry_for_jit here and the mapping permissions. If the initial mapping is
22972 	 * created with RX, then the application must map it as RW in order to first write
22973 	 * to the page (MAP_JIT mappings must be private and anonymous). The switch to
22974 	 * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy.
22975 	 * Similarly, if the mapping was created as RW, and then switched to RX,
22976 	 * vm_map_protect will again mark the entry as a copy, and both these cases
22977 	 * lead to this if-statement being entered.
22978 	 *
22979 	 * For more information: rdar://115313336.
22980 	 */
22981 	if (vmk_flags.vmkf_remap_prot_copy) {
22982 		cs_ret = csm_associate_debug_region(
22983 			map->pmap,
22984 			entry->vme_start,
22985 			entry->vme_end - entry->vme_start);
22986 
22987 		/*
22988 		 * csm_associate_debug_region returns not supported when the code signing
22989 		 * monitor is disabled. This is intentional, since cs_ret is checked towards
22990 		 * the end of the function, and if it is not supported, then we still want the
22991 		 * VM to perform code-signing enforcement on this entry. That said, if we don't
22992 		 * mark this as a xnu_user_debug page when the code-signing monitor is disabled,
22993 		 * then it never gets retyped to XNU_USER_DEBUG frame type, which then causes
22994 		 * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some
22995 		 * cases, which will cause a violation when attempted to be mapped as writable).
22996 		 */
22997 		if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) {
22998 			entry->vme_xnu_user_debug = TRUE;
22999 		}
23000 #if DEVELOPMENT || DEBUG
23001 		if (vm_log_xnu_user_debug) {
23002 			printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ]  vme_xnu_user_debug=%d cs_ret %d\n",
23003 			    proc_selfpid(),
23004 			    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
23005 			    __FUNCTION__, __LINE__,
23006 			    map, entry,
23007 			    (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
23008 			    entry->vme_xnu_user_debug,
23009 			    cs_ret);
23010 		}
23011 #endif /* DEVELOPMENT || DEBUG */
23012 		goto done;
23013 	}
23014 
23015 	if (entry->used_for_jit) {
23016 		cs_ret = csm_associate_jit_region(
23017 			map->pmap,
23018 			entry->vme_start,
23019 			entry->vme_end - entry->vme_start);
23020 		goto done;
23021 	}
23022 
23023 	cs_object = VME_OBJECT(entry);
23024 	vm_object_lock_shared(cs_object);
23025 	cs_offset = VME_OFFSET(entry);
23026 
23027 	/* find the VM object backed by the code-signed vnode */
23028 	for (;;) {
23029 		/* go to the bottom of cs_object's shadow chain */
23030 		for (;
23031 		    cs_object->shadow != VM_OBJECT_NULL;
23032 		    cs_object = cs_shadow) {
23033 			cs_shadow = cs_object->shadow;
23034 			cs_offset += cs_object->vo_shadow_offset;
23035 			vm_object_lock_shared(cs_shadow);
23036 			vm_object_unlock(cs_object);
23037 		}
23038 		if (cs_object->internal ||
23039 		    cs_object->pager == MEMORY_OBJECT_NULL) {
23040 			vm_object_unlock(cs_object);
23041 			return KERN_SUCCESS;
23042 		}
23043 
23044 		cs_offset += cs_object->paging_offset;
23045 
23046 		/*
23047 		 * cs_object could be backed by a:
23048 		 *      vnode_pager
23049 		 *	apple_protect_pager
23050 		 *      shared_region_pager
23051 		 *	fourk_pager (multiple backing objects -> fail?)
23052 		 * ask the pager if it has a backing VM object
23053 		 */
23054 		if (!memory_object_backing_object(cs_object->pager,
23055 		    cs_offset,
23056 		    &backing_object,
23057 		    &backing_offset)) {
23058 			/* no backing object: cs_object is it */
23059 			break;
23060 		}
23061 
23062 		/* look down the backing object's shadow chain */
23063 		vm_object_lock_shared(backing_object);
23064 		vm_object_unlock(cs_object);
23065 		cs_object = backing_object;
23066 		cs_offset = backing_offset;
23067 	}
23068 
23069 	cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
23070 	if (cs_vnode == NULL) {
23071 		/* no vnode, no code signatures to associate */
23072 		cs_ret = KERN_SUCCESS;
23073 	} else {
23074 		cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
23075 		    &cs_blobs);
23076 		assert(cs_ret == KERN_SUCCESS);
23077 		cs_ret = cs_associate_blob_with_mapping(map->pmap,
23078 		    entry->vme_start,
23079 		    (entry->vme_end - entry->vme_start),
23080 		    cs_offset,
23081 		    cs_blobs);
23082 	}
23083 	vm_object_unlock(cs_object);
23084 	cs_object = VM_OBJECT_NULL;
23085 
23086 done:
23087 	if (cs_ret == KERN_SUCCESS) {
23088 		DTRACE_VM2(vm_map_entry_cs_associate_success,
23089 		    vm_map_offset_t, entry->vme_start,
23090 		    vm_map_offset_t, entry->vme_end);
23091 		if (vm_map_executable_immutable) {
23092 			/*
23093 			 * Prevent this executable
23094 			 * mapping from being unmapped
23095 			 * or modified.
23096 			 */
23097 			entry->vme_permanent = TRUE;
23098 		}
23099 		/*
23100 		 * pmap says it will validate the
23101 		 * code-signing validity of pages
23102 		 * faulted in via this mapping, so
23103 		 * this map entry should be marked so
23104 		 * that vm_fault() bypasses code-signing
23105 		 * validation for faults coming through
23106 		 * this mapping.
23107 		 */
23108 		entry->csm_associated = TRUE;
23109 	} else if (cs_ret == KERN_NOT_SUPPORTED) {
23110 		/*
23111 		 * pmap won't check the code-signing
23112 		 * validity of pages faulted in via
23113 		 * this mapping, so VM should keep
23114 		 * doing it.
23115 		 */
23116 		DTRACE_VM3(vm_map_entry_cs_associate_off,
23117 		    vm_map_offset_t, entry->vme_start,
23118 		    vm_map_offset_t, entry->vme_end,
23119 		    int, cs_ret);
23120 	} else {
23121 		/*
23122 		 * A real error: do not allow
23123 		 * execution in this mapping.
23124 		 */
23125 		DTRACE_VM3(vm_map_entry_cs_associate_failure,
23126 		    vm_map_offset_t, entry->vme_start,
23127 		    vm_map_offset_t, entry->vme_end,
23128 		    int, cs_ret);
23129 		if (vmk_flags.vmkf_overwrite_immutable) {
23130 			/*
23131 			 * We can get here when we remap an apple_protect pager
23132 			 * on top of an already cs_associated executable mapping
23133 			 * with the same code signatures, so we don't want to
23134 			 * lose VM_PROT_EXECUTE in that case...
23135 			 */
23136 		} else {
23137 			entry->protection &= ~VM_PROT_ALLEXEC;
23138 			entry->max_protection &= ~VM_PROT_ALLEXEC;
23139 		}
23140 	}
23141 
23142 	return cs_ret;
23143 }
23144 
23145 #endif /* CODE_SIGNING_MONITOR */
23146 
23147 inline bool
vm_map_is_corpse_source(vm_map_t map)23148 vm_map_is_corpse_source(vm_map_t map)
23149 {
23150 	bool status = false;
23151 	if (map) {
23152 		vm_map_lock_read(map);
23153 		status = map->corpse_source;
23154 		vm_map_unlock_read(map);
23155 	}
23156 	return status;
23157 }
23158 
23159 inline void
vm_map_set_corpse_source(vm_map_t map)23160 vm_map_set_corpse_source(vm_map_t map)
23161 {
23162 	if (map) {
23163 		vm_map_lock(map);
23164 		map->corpse_source = true;
23165 		vm_map_unlock(map);
23166 	}
23167 }
23168 
23169 inline void
vm_map_unset_corpse_source(vm_map_t map)23170 vm_map_unset_corpse_source(vm_map_t map)
23171 {
23172 	if (map) {
23173 		vm_map_lock(map);
23174 		map->corpse_source = false;
23175 		vm_map_unlock(map);
23176 	}
23177 }
23178 /*
23179  * FORKED CORPSE FOOTPRINT
23180  *
23181  * A forked corpse gets a copy of the original VM map but its pmap is mostly
23182  * empty since it never ran and never got to fault in any pages.
23183  * Collecting footprint info (via "sysctl vm.self_region_footprint") for
23184  * a forked corpse would therefore return very little information.
23185  *
23186  * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
23187  * to vm_map_fork() to collect footprint information from the original VM map
23188  * and its pmap, and store it in the forked corpse's VM map.  That information
23189  * is stored in place of the VM map's "hole list" since we'll never need to
23190  * lookup for holes in the corpse's map.
23191  *
23192  * The corpse's footprint info looks like this:
23193  *
23194  * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
23195  * as follows:
23196  *                     +---------------------------------------+
23197  *            header-> | cf_size                               |
23198  *                     +-------------------+-------------------+
23199  *                     | cf_last_region    | cf_last_zeroes    |
23200  *                     +-------------------+-------------------+
23201  *           region1-> | cfr_vaddr                             |
23202  *                     +-------------------+-------------------+
23203  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
23204  *                     +---------------------------------------+
23205  *                     | d4 | d5 | ...                         |
23206  *                     +---------------------------------------+
23207  *                     | ...                                   |
23208  *                     +-------------------+-------------------+
23209  *                     | dy | dz | na | na | cfr_vaddr...      | <-region2
23210  *                     +-------------------+-------------------+
23211  *                     | cfr_vaddr (ctd)   | cfr_num_pages     |
23212  *                     +---------------------------------------+
23213  *                     | d0 | d1 ...                           |
23214  *                     +---------------------------------------+
23215  *                       ...
23216  *                     +---------------------------------------+
23217  *       last region-> | cfr_vaddr                             |
23218  *                     +---------------------------------------+
23219  *                     + cfr_num_pages     | d0 | d1 | d2 | d3 |
23220  *                     +---------------------------------------+
23221  *                       ...
23222  *                     +---------------------------------------+
23223  *                     | dx | dy | dz | na | na | na | na | na |
23224  *                     +---------------------------------------+
23225  *
23226  * where:
23227  *      cf_size:	total size of the buffer (rounded to page size)
23228  *      cf_last_region:	offset in the buffer of the last "region" sub-header
23229  *	cf_last_zeroes: number of trailing "zero" dispositions at the end
23230  *			of last region
23231  *	cfr_vaddr:	virtual address of the start of the covered "region"
23232  *	cfr_num_pages:	number of pages in the covered "region"
23233  *	d*:		disposition of the page at that virtual address
23234  * Regions in the buffer are word-aligned.
23235  *
23236  * We estimate the size of the buffer based on the number of memory regions
23237  * and the virtual size of the address space.  While copying each memory region
23238  * during vm_map_fork(), we also collect the footprint info for that region
23239  * and store it in the buffer, packing it as much as possible (coalescing
23240  * contiguous memory regions to avoid having too many region headers and
23241  * avoiding long streaks of "zero" page dispositions by splitting footprint
23242  * "regions", so the number of regions in the footprint buffer might not match
23243  * the number of memory regions in the address space.
23244  *
23245  * We also have to copy the original task's "nonvolatile" ledgers since that's
23246  * part of the footprint and will need to be reported to any tool asking for
23247  * the footprint information of the forked corpse.
23248  */
23249 
23250 uint64_t vm_map_corpse_footprint_count = 0;
23251 uint64_t vm_map_corpse_footprint_size_avg = 0;
23252 uint64_t vm_map_corpse_footprint_size_max = 0;
23253 uint64_t vm_map_corpse_footprint_full = 0;
23254 uint64_t vm_map_corpse_footprint_no_buf = 0;
23255 
23256 struct vm_map_corpse_footprint_header {
23257 	vm_size_t       cf_size;        /* allocated buffer size */
23258 	uint32_t        cf_last_region; /* offset of last region in buffer */
23259 	union {
23260 		uint32_t cfu_last_zeroes; /* during creation:
23261 		                           * number of "zero" dispositions at
23262 		                           * end of last region */
23263 		uint32_t cfu_hint_region; /* during lookup:
23264 		                           * offset of last looked up region */
23265 #define cf_last_zeroes cfu.cfu_last_zeroes
23266 #define cf_hint_region cfu.cfu_hint_region
23267 	} cfu;
23268 };
23269 typedef uint8_t cf_disp_t;
23270 struct vm_map_corpse_footprint_region {
23271 	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
23272 	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
23273 	cf_disp_t   cfr_disposition[0]; /* disposition of each page */
23274 } __attribute__((packed));
23275 
23276 static cf_disp_t
vm_page_disposition_to_cf_disp(int disposition)23277 vm_page_disposition_to_cf_disp(
23278 	int disposition)
23279 {
23280 	assert(sizeof(cf_disp_t) == 1);
23281 	/* relocate bits that don't fit in a "uint8_t" */
23282 	if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
23283 		disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
23284 	}
23285 	/* cast gets rid of extra bits */
23286 	return (cf_disp_t) disposition;
23287 }
23288 
23289 static int
vm_page_cf_disp_to_disposition(cf_disp_t cf_disp)23290 vm_page_cf_disp_to_disposition(
23291 	cf_disp_t cf_disp)
23292 {
23293 	int disposition;
23294 
23295 	assert(sizeof(cf_disp_t) == 1);
23296 	disposition = (int) cf_disp;
23297 	/* move relocated bits back in place */
23298 	if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
23299 		disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
23300 		disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
23301 	}
23302 	return disposition;
23303 }
23304 
23305 static kmem_guard_t
vm_map_corpse_footprint_guard(vm_map_t map)23306 vm_map_corpse_footprint_guard(vm_map_t map)
23307 {
23308 	return (kmem_guard_t){
23309 		       .kmg_atomic = true,
23310 		       .kmg_tag = VM_KERN_MEMORY_DIAG,
23311 		       .kmg_context = os_hash_kernel_pointer(&map->vmmap_corpse_footprint),
23312 	};
23313 }
23314 
23315 /*
23316  * vm_map_corpse_footprint_new_region:
23317  *      closes the current footprint "region" and creates a new one
23318  *
23319  * Returns NULL if there's not enough space in the buffer for a new region.
23320  */
23321 static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(struct vm_map_corpse_footprint_header * footprint_header)23322 vm_map_corpse_footprint_new_region(
23323 	struct vm_map_corpse_footprint_header *footprint_header)
23324 {
23325 	uintptr_t       footprint_edge;
23326 	uint32_t        new_region_offset;
23327 	struct vm_map_corpse_footprint_region *footprint_region;
23328 	struct vm_map_corpse_footprint_region *new_footprint_region;
23329 
23330 	footprint_edge = ((uintptr_t)footprint_header +
23331 	    footprint_header->cf_size);
23332 	footprint_region = ((struct vm_map_corpse_footprint_region *)
23333 	    ((char *)footprint_header +
23334 	    footprint_header->cf_last_region));
23335 	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
23336 	    footprint_edge);
23337 
23338 	/* get rid of trailing zeroes in the last region */
23339 	assert(footprint_region->cfr_num_pages >=
23340 	    footprint_header->cf_last_zeroes);
23341 	footprint_region->cfr_num_pages -=
23342 	    footprint_header->cf_last_zeroes;
23343 	footprint_header->cf_last_zeroes = 0;
23344 
23345 	/* reuse this region if it's now empty */
23346 	if (footprint_region->cfr_num_pages == 0) {
23347 		return footprint_region;
23348 	}
23349 
23350 	/* compute offset of new region */
23351 	new_region_offset = footprint_header->cf_last_region;
23352 	new_region_offset += sizeof(*footprint_region);
23353 	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23354 	new_region_offset = roundup(new_region_offset, sizeof(int));
23355 
23356 	/* check if we're going over the edge */
23357 	if (((uintptr_t)footprint_header +
23358 	    new_region_offset +
23359 	    sizeof(*footprint_region)) >=
23360 	    footprint_edge) {
23361 		/* over the edge: no new region */
23362 		return NULL;
23363 	}
23364 
23365 	/* adjust offset of last region in header */
23366 	footprint_header->cf_last_region = new_region_offset;
23367 
23368 	new_footprint_region = (struct vm_map_corpse_footprint_region *)
23369 	    ((char *)footprint_header +
23370 	    footprint_header->cf_last_region);
23371 	new_footprint_region->cfr_vaddr = 0;
23372 	new_footprint_region->cfr_num_pages = 0;
23373 	/* caller needs to initialize new region */
23374 
23375 	return new_footprint_region;
23376 }
23377 
23378 /*
23379  * vm_map_corpse_footprint_collect:
23380  *	collect footprint information for "old_entry" in "old_map" and
23381  *	stores it in "new_map"'s vmmap_footprint_info.
23382  */
23383 kern_return_t
vm_map_corpse_footprint_collect(vm_map_t old_map,vm_map_entry_t old_entry,vm_map_t new_map)23384 vm_map_corpse_footprint_collect(
23385 	vm_map_t        old_map,
23386 	vm_map_entry_t  old_entry,
23387 	vm_map_t        new_map)
23388 {
23389 	vm_map_offset_t va;
23390 	kmem_return_t kmr;
23391 	struct vm_map_corpse_footprint_header *footprint_header;
23392 	struct vm_map_corpse_footprint_region *footprint_region;
23393 	struct vm_map_corpse_footprint_region *new_footprint_region;
23394 	cf_disp_t       *next_disp_p;
23395 	uintptr_t       footprint_edge;
23396 	uint32_t        num_pages_tmp;
23397 	int             effective_page_size;
23398 
23399 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
23400 
23401 	va = old_entry->vme_start;
23402 
23403 	vm_map_lock_assert_exclusive(old_map);
23404 	vm_map_lock_assert_exclusive(new_map);
23405 
23406 	assert(new_map->has_corpse_footprint);
23407 	assert(!old_map->has_corpse_footprint);
23408 	if (!new_map->has_corpse_footprint ||
23409 	    old_map->has_corpse_footprint) {
23410 		/*
23411 		 * This can only transfer footprint info from a
23412 		 * map with a live pmap to a map with a corpse footprint.
23413 		 */
23414 		return KERN_NOT_SUPPORTED;
23415 	}
23416 
23417 	if (new_map->vmmap_corpse_footprint == NULL) {
23418 		vm_size_t buf_size;
23419 
23420 		buf_size = (sizeof(*footprint_header) +
23421 		    (old_map->hdr.nentries
23422 		    *
23423 		    (sizeof(*footprint_region) +
23424 		    +3))            /* potential alignment for each region */
23425 		    +
23426 		    ((old_map->size / effective_page_size)
23427 		    *
23428 		    sizeof(cf_disp_t)));      /* disposition for each page */
23429 //		printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
23430 		buf_size = round_page(buf_size);
23431 
23432 		/* limit buffer to 1 page to validate overflow detection */
23433 //		buf_size = PAGE_SIZE;
23434 
23435 		/* limit size to a somewhat sane amount */
23436 #if XNU_TARGET_OS_OSX
23437 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
23438 #else /* XNU_TARGET_OS_OSX */
23439 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
23440 #endif /* XNU_TARGET_OS_OSX */
23441 		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
23442 			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
23443 		}
23444 		kmem_guard_t guard = vm_map_corpse_footprint_guard(new_map);
23445 		kmr = kmem_alloc_guard(kernel_map, buf_size + PAGE_SIZE, 0,
23446 		    KMA_DATA | KMA_GUARD_LAST | KMA_KOBJECT | KMA_ZERO,
23447 		    guard);
23448 		if (kmr.kmr_return != KERN_SUCCESS) {
23449 			vm_map_corpse_footprint_no_buf++;
23450 			return kmr.kmr_return;
23451 		}
23452 
23453 		/* initialize header and 1st region */
23454 		footprint_header = (struct vm_map_corpse_footprint_header *)kmr.kmr_ptr;
23455 		assert3p(footprint_header, !=, NULL);
23456 		new_map->vmmap_corpse_footprint = footprint_header;
23457 
23458 		footprint_header->cf_size = buf_size;
23459 		footprint_header->cf_last_region =
23460 		    sizeof(*footprint_header);
23461 		footprint_header->cf_last_zeroes = 0;
23462 
23463 		footprint_region = (struct vm_map_corpse_footprint_region *)
23464 		    ((char *)footprint_header +
23465 		    footprint_header->cf_last_region);
23466 		footprint_region->cfr_vaddr = 0;
23467 		footprint_region->cfr_num_pages = 0;
23468 	} else {
23469 		/* retrieve header and last region */
23470 		footprint_header = (struct vm_map_corpse_footprint_header *)
23471 		    new_map->vmmap_corpse_footprint;
23472 		footprint_region = (struct vm_map_corpse_footprint_region *)
23473 		    ((char *)footprint_header +
23474 		    footprint_header->cf_last_region);
23475 	}
23476 	footprint_edge = ((uintptr_t)footprint_header +
23477 	    footprint_header->cf_size);
23478 
23479 	if ((footprint_region->cfr_vaddr +
23480 	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
23481 	    effective_page_size))
23482 	    != old_entry->vme_start) {
23483 		uint64_t num_pages_delta, num_pages_delta_size;
23484 		uint32_t region_offset_delta_size;
23485 
23486 		/*
23487 		 * Not the next contiguous virtual address:
23488 		 * start a new region or store "zero" dispositions for
23489 		 * the missing pages?
23490 		 */
23491 		/* size of gap in actual page dispositions */
23492 		num_pages_delta = ((old_entry->vme_start -
23493 		    footprint_region->cfr_vaddr) / effective_page_size)
23494 		    - footprint_region->cfr_num_pages;
23495 		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
23496 		/* size of gap as a new footprint region header */
23497 		region_offset_delta_size =
23498 		    (sizeof(*footprint_region) +
23499 		    roundup(((footprint_region->cfr_num_pages -
23500 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
23501 		    sizeof(int)) -
23502 		    ((footprint_region->cfr_num_pages -
23503 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
23504 //		printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
23505 		if (region_offset_delta_size < num_pages_delta_size ||
23506 		    os_add3_overflow(footprint_region->cfr_num_pages,
23507 		    (uint32_t) num_pages_delta,
23508 		    1,
23509 		    &num_pages_tmp)) {
23510 			/*
23511 			 * Storing data for this gap would take more space
23512 			 * than inserting a new footprint region header:
23513 			 * let's start a new region and save space. If it's a
23514 			 * tie, let's avoid using a new region, since that
23515 			 * would require more region hops to find the right
23516 			 * range during lookups.
23517 			 *
23518 			 * If the current region's cfr_num_pages would overflow
23519 			 * if we added "zero" page dispositions for the gap,
23520 			 * no choice but to start a new region.
23521 			 */
23522 //			printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
23523 			new_footprint_region =
23524 			    vm_map_corpse_footprint_new_region(footprint_header);
23525 			/* check that we're not going over the edge */
23526 			if (new_footprint_region == NULL) {
23527 				goto over_the_edge;
23528 			}
23529 			footprint_region = new_footprint_region;
23530 			/* initialize new region as empty */
23531 			footprint_region->cfr_vaddr = old_entry->vme_start;
23532 			footprint_region->cfr_num_pages = 0;
23533 		} else {
23534 			/*
23535 			 * Store "zero" page dispositions for the missing
23536 			 * pages.
23537 			 */
23538 //			printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
23539 			for (; num_pages_delta > 0; num_pages_delta--) {
23540 				next_disp_p = (cf_disp_t *)
23541 				    ((uintptr_t) footprint_region +
23542 				    sizeof(*footprint_region));
23543 				next_disp_p += footprint_region->cfr_num_pages;
23544 				/* check that we're not going over the edge */
23545 				if ((uintptr_t)next_disp_p >= footprint_edge) {
23546 					goto over_the_edge;
23547 				}
23548 				/* store "zero" disposition for this gap page */
23549 				footprint_region->cfr_num_pages++;
23550 				*next_disp_p = (cf_disp_t) 0;
23551 				footprint_header->cf_last_zeroes++;
23552 			}
23553 		}
23554 	}
23555 
23556 	for (va = old_entry->vme_start;
23557 	    va < old_entry->vme_end;
23558 	    va += effective_page_size) {
23559 		int             disposition;
23560 		cf_disp_t       cf_disp;
23561 
23562 		vm_map_footprint_query_page_info(old_map,
23563 		    old_entry,
23564 		    va,
23565 		    &disposition);
23566 		cf_disp = vm_page_disposition_to_cf_disp(disposition);
23567 
23568 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
23569 
23570 		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
23571 			/*
23572 			 * Ignore "zero" dispositions at start of
23573 			 * region: just move start of region.
23574 			 */
23575 			footprint_region->cfr_vaddr += effective_page_size;
23576 			continue;
23577 		}
23578 
23579 		/* would region's cfr_num_pages overflow? */
23580 		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
23581 		    &num_pages_tmp)) {
23582 			/* overflow: create a new region */
23583 			new_footprint_region =
23584 			    vm_map_corpse_footprint_new_region(
23585 				footprint_header);
23586 			if (new_footprint_region == NULL) {
23587 				goto over_the_edge;
23588 			}
23589 			footprint_region = new_footprint_region;
23590 			footprint_region->cfr_vaddr = va;
23591 			footprint_region->cfr_num_pages = 0;
23592 		}
23593 
23594 		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
23595 		    sizeof(*footprint_region));
23596 		next_disp_p += footprint_region->cfr_num_pages;
23597 		/* check that we're not going over the edge */
23598 		if ((uintptr_t)next_disp_p >= footprint_edge) {
23599 			goto over_the_edge;
23600 		}
23601 		/* store this dispostion */
23602 		*next_disp_p = cf_disp;
23603 		footprint_region->cfr_num_pages++;
23604 
23605 		if (cf_disp != 0) {
23606 			/* non-zero disp: break the current zero streak */
23607 			footprint_header->cf_last_zeroes = 0;
23608 			/* done */
23609 			continue;
23610 		}
23611 
23612 		/* zero disp: add to the current streak of zeroes */
23613 		footprint_header->cf_last_zeroes++;
23614 		if ((footprint_header->cf_last_zeroes +
23615 		    roundup(((footprint_region->cfr_num_pages -
23616 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
23617 		    (sizeof(int) - 1),
23618 		    sizeof(int))) <
23619 		    (sizeof(*footprint_header))) {
23620 			/*
23621 			 * There are not enough trailing "zero" dispositions
23622 			 * (+ the extra padding we would need for the previous
23623 			 * region); creating a new region would not save space
23624 			 * at this point, so let's keep this "zero" disposition
23625 			 * in this region and reconsider later.
23626 			 */
23627 			continue;
23628 		}
23629 		/*
23630 		 * Create a new region to avoid having too many consecutive
23631 		 * "zero" dispositions.
23632 		 */
23633 		new_footprint_region =
23634 		    vm_map_corpse_footprint_new_region(footprint_header);
23635 		if (new_footprint_region == NULL) {
23636 			goto over_the_edge;
23637 		}
23638 		footprint_region = new_footprint_region;
23639 		/* initialize the new region as empty ... */
23640 		footprint_region->cfr_num_pages = 0;
23641 		/* ... and skip this "zero" disp */
23642 		footprint_region->cfr_vaddr = va + effective_page_size;
23643 	}
23644 
23645 	return KERN_SUCCESS;
23646 
23647 over_the_edge:
23648 //	printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
23649 	vm_map_corpse_footprint_full++;
23650 	return KERN_RESOURCE_SHORTAGE;
23651 }
23652 
23653 /*
23654  * vm_map_corpse_footprint_collect_done:
23655  *	completes the footprint collection by getting rid of any remaining
23656  *	trailing "zero" dispositions and trimming the unused part of the
23657  *	kernel buffer
23658  */
23659 void
vm_map_corpse_footprint_collect_done(vm_map_t new_map)23660 vm_map_corpse_footprint_collect_done(
23661 	vm_map_t        new_map)
23662 {
23663 	struct vm_map_corpse_footprint_header *footprint_header;
23664 	struct vm_map_corpse_footprint_region *footprint_region;
23665 	vm_size_t       buf_size, actual_size;
23666 
23667 	assert(new_map->has_corpse_footprint);
23668 	if (!new_map->has_corpse_footprint ||
23669 	    new_map->vmmap_corpse_footprint == NULL) {
23670 		return;
23671 	}
23672 
23673 	footprint_header = (struct vm_map_corpse_footprint_header *)
23674 	    new_map->vmmap_corpse_footprint;
23675 	buf_size = footprint_header->cf_size;
23676 
23677 	footprint_region = (struct vm_map_corpse_footprint_region *)
23678 	    ((char *)footprint_header +
23679 	    footprint_header->cf_last_region);
23680 
23681 	/* get rid of trailing zeroes in last region */
23682 	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
23683 	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
23684 	footprint_header->cf_last_zeroes = 0;
23685 
23686 	actual_size = (vm_size_t)(footprint_header->cf_last_region +
23687 	    sizeof(*footprint_region) +
23688 	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
23689 
23690 //	printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
23691 	vm_map_corpse_footprint_size_avg =
23692 	    (((vm_map_corpse_footprint_size_avg *
23693 	    vm_map_corpse_footprint_count) +
23694 	    actual_size) /
23695 	    (vm_map_corpse_footprint_count + 1));
23696 	vm_map_corpse_footprint_count++;
23697 	if (actual_size > vm_map_corpse_footprint_size_max) {
23698 		vm_map_corpse_footprint_size_max = actual_size;
23699 	}
23700 
23701 	actual_size = round_page(actual_size);
23702 	assert3u(buf_size, >=, actual_size);
23703 	if (buf_size > actual_size) {
23704 		/*
23705 		 * Free unused space at the end of the buffer
23706 		 */
23707 		kmem_guard_t guard = vm_map_corpse_footprint_guard(new_map);
23708 		kmem_return_t kmr = kmem_realloc_guard(kernel_map,
23709 		    (vm_offset_t)footprint_header,
23710 		    /* Account for guard page */
23711 		    buf_size + PAGE_SIZE,
23712 		    actual_size + PAGE_SIZE,
23713 		    KMR_DATA | KMR_GUARD_LAST | KMR_FREEOLD | KMR_KOBJECT,
23714 		    guard);
23715 		assertf(kmr.kmr_return == KERN_SUCCESS,
23716 		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
23717 		    footprint_header,
23718 		    (uint64_t) buf_size,
23719 		    (uint64_t) actual_size,
23720 		    kmr.kmr_return);
23721 		footprint_header = (struct vm_map_corpse_footprint_header *)kmr.kmr_ptr;
23722 		assert3p(footprint_header, !=, NULL);
23723 		new_map->vmmap_corpse_footprint = footprint_header;
23724 		footprint_region = NULL;
23725 	}
23726 
23727 	footprint_header->cf_size = actual_size;
23728 }
23729 
23730 /*
23731  * vm_map_corpse_footprint_query_page_info:
23732  *	retrieves the disposition of the page at virtual address "vaddr"
23733  *	in the forked corpse's VM map
23734  *
23735  * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
23736  */
23737 kern_return_t
vm_map_corpse_footprint_query_page_info(vm_map_t map,vm_map_offset_t va,int * disposition_p)23738 vm_map_corpse_footprint_query_page_info(
23739 	vm_map_t        map,
23740 	vm_map_offset_t va,
23741 	int             *disposition_p)
23742 {
23743 	struct vm_map_corpse_footprint_header *footprint_header;
23744 	struct vm_map_corpse_footprint_region *footprint_region;
23745 	uint32_t        footprint_region_offset;
23746 	vm_map_offset_t region_start, region_end;
23747 	int             disp_idx;
23748 	kern_return_t   kr;
23749 	int             effective_page_size;
23750 	cf_disp_t       cf_disp;
23751 
23752 	if (!map->has_corpse_footprint) {
23753 		*disposition_p = 0;
23754 		kr = KERN_INVALID_ARGUMENT;
23755 		goto done;
23756 	}
23757 
23758 	footprint_header = map->vmmap_corpse_footprint;
23759 	if (footprint_header == NULL) {
23760 		*disposition_p = 0;
23761 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23762 		kr = KERN_INVALID_ARGUMENT;
23763 		goto done;
23764 	}
23765 
23766 	/* start looking at the hint ("cf_hint_region") */
23767 	footprint_region_offset = footprint_header->cf_hint_region;
23768 
23769 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
23770 
23771 lookup_again:
23772 	if (footprint_region_offset < sizeof(*footprint_header)) {
23773 		/* hint too low: start from 1st region */
23774 		footprint_region_offset = sizeof(*footprint_header);
23775 	}
23776 	if (footprint_region_offset > footprint_header->cf_last_region) {
23777 		/* hint too high: re-start from 1st region */
23778 		footprint_region_offset = sizeof(*footprint_header);
23779 	}
23780 	footprint_region = (struct vm_map_corpse_footprint_region *)
23781 	    ((char *)footprint_header + footprint_region_offset);
23782 	region_start = footprint_region->cfr_vaddr;
23783 	region_end = (region_start +
23784 	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23785 	    effective_page_size));
23786 	if (va < region_start &&
23787 	    footprint_region_offset != sizeof(*footprint_header)) {
23788 		/* our range starts before the hint region */
23789 
23790 		/* reset the hint (in a racy way...) */
23791 		footprint_header->cf_hint_region = sizeof(*footprint_header);
23792 		/* lookup "va" again from 1st region */
23793 		footprint_region_offset = sizeof(*footprint_header);
23794 		goto lookup_again;
23795 	}
23796 
23797 	while (va >= region_end) {
23798 		if (footprint_region_offset >= footprint_header->cf_last_region) {
23799 			break;
23800 		}
23801 		/* skip the region's header */
23802 		footprint_region_offset += sizeof(*footprint_region);
23803 		/* skip the region's page dispositions */
23804 		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23805 		/* align to next word boundary */
23806 		footprint_region_offset =
23807 		    roundup(footprint_region_offset,
23808 		    sizeof(int));
23809 		footprint_region = (struct vm_map_corpse_footprint_region *)
23810 		    ((char *)footprint_header + footprint_region_offset);
23811 		region_start = footprint_region->cfr_vaddr;
23812 		region_end = (region_start +
23813 		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23814 		    effective_page_size));
23815 	}
23816 	if (va < region_start || va >= region_end) {
23817 		/* page not found */
23818 		*disposition_p = 0;
23819 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23820 		kr = KERN_SUCCESS;
23821 		goto done;
23822 	}
23823 
23824 	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
23825 	footprint_header->cf_hint_region = footprint_region_offset;
23826 
23827 	/* get page disposition for "va" in this region */
23828 	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
23829 	cf_disp = footprint_region->cfr_disposition[disp_idx];
23830 	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
23831 	kr = KERN_SUCCESS;
23832 done:
23833 //	if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23834 	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
23835 	DTRACE_VM4(footprint_query_page_info,
23836 	    vm_map_t, map,
23837 	    vm_map_offset_t, va,
23838 	    int, *disposition_p,
23839 	    kern_return_t, kr);
23840 
23841 	return kr;
23842 }
23843 
23844 void
vm_map_corpse_footprint_destroy(vm_map_t map)23845 vm_map_corpse_footprint_destroy(
23846 	vm_map_t        map)
23847 {
23848 	if (map->has_corpse_footprint &&
23849 	    map->vmmap_corpse_footprint != NULL) {
23850 		struct vm_map_corpse_footprint_header *footprint_header;
23851 		vm_size_t buf_size;
23852 
23853 		footprint_header = map->vmmap_corpse_footprint;
23854 		buf_size = footprint_header->cf_size;
23855 		kmem_guard_t guard = vm_map_corpse_footprint_guard(map);
23856 		kmem_free_guard(kernel_map, (vm_offset_t)footprint_header,
23857 		    buf_size + PAGE_SIZE,
23858 		    KMF_GUARD_LAST, guard);
23859 		map->vmmap_corpse_footprint = NULL;
23860 		map->has_corpse_footprint = FALSE;
23861 	}
23862 }
23863 
23864 /*
23865  * vm_map_copy_footprint_ledgers:
23866  *	copies any ledger that's relevant to the memory footprint of "old_task"
23867  *	into the forked corpse's task ("new_task")
23868  */
23869 void
vm_map_copy_footprint_ledgers(task_t old_task,task_t new_task)23870 vm_map_copy_footprint_ledgers(
23871 	task_t  old_task,
23872 	task_t  new_task)
23873 {
23874 	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
23875 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
23876 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
23877 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
23878 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
23879 	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
23880 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
23881 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
23882 	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
23883 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
23884 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
23885 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
23886 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
23887 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
23888 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
23889 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
23890 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
23891 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
23892 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
23893 	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
23894 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_nofootprint_total);
23895 }
23896 
23897 /*
23898  * vm_map_copy_ledger:
23899  *	copy a single ledger from "old_task" to "new_task"
23900  */
23901 void
vm_map_copy_ledger(task_t old_task,task_t new_task,int ledger_entry)23902 vm_map_copy_ledger(
23903 	task_t  old_task,
23904 	task_t  new_task,
23905 	int     ledger_entry)
23906 {
23907 	ledger_amount_t old_balance, new_balance, delta;
23908 
23909 	assert(new_task->map->has_corpse_footprint);
23910 	if (!new_task->map->has_corpse_footprint) {
23911 		return;
23912 	}
23913 
23914 	/* turn off sanity checks for the ledger we're about to mess with */
23915 	ledger_disable_panic_on_negative(new_task->ledger,
23916 	    ledger_entry);
23917 
23918 	/* adjust "new_task" to match "old_task" */
23919 	ledger_get_balance(old_task->ledger,
23920 	    ledger_entry,
23921 	    &old_balance);
23922 	ledger_get_balance(new_task->ledger,
23923 	    ledger_entry,
23924 	    &new_balance);
23925 	if (new_balance == old_balance) {
23926 		/* new == old: done */
23927 	} else if (new_balance > old_balance) {
23928 		/* new > old ==> new -= new - old */
23929 		delta = new_balance - old_balance;
23930 		ledger_debit(new_task->ledger,
23931 		    ledger_entry,
23932 		    delta);
23933 	} else {
23934 		/* new < old ==> new += old - new */
23935 		delta = old_balance - new_balance;
23936 		ledger_credit(new_task->ledger,
23937 		    ledger_entry,
23938 		    delta);
23939 	}
23940 }
23941 
23942 /*
23943  * vm_map_get_pmap:
23944  * returns the pmap associated with the vm_map
23945  */
23946 pmap_t
vm_map_get_pmap(vm_map_t map)23947 vm_map_get_pmap(vm_map_t map)
23948 {
23949 	return vm_map_pmap(map);
23950 }
23951 
23952 ppnum_t
vm_map_get_phys_page(vm_map_t map,vm_offset_t addr)23953 vm_map_get_phys_page(
23954 	vm_map_t                map,
23955 	vm_offset_t             addr)
23956 {
23957 	vm_object_offset_t      offset;
23958 	vm_object_t             object;
23959 	vm_map_offset_t         map_offset;
23960 	vm_map_entry_t          entry;
23961 	ppnum_t                 phys_page = 0;
23962 
23963 	map_offset = vm_map_trunc_page(addr, PAGE_MASK);
23964 
23965 	vm_map_lock(map);
23966 	while (vm_map_lookup_entry(map, map_offset, &entry)) {
23967 		if (entry->is_sub_map) {
23968 			vm_map_t        old_map;
23969 			vm_map_lock(VME_SUBMAP(entry));
23970 			old_map = map;
23971 			map = VME_SUBMAP(entry);
23972 			map_offset = (VME_OFFSET(entry) +
23973 			    (map_offset - entry->vme_start));
23974 			vm_map_unlock(old_map);
23975 			continue;
23976 		}
23977 		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
23978 			vm_map_unlock(map);
23979 			return (ppnum_t) 0;
23980 		}
23981 		if (VME_OBJECT(entry)->phys_contiguous) {
23982 			/* These are  not standard pageable memory mappings */
23983 			/* If they are not present in the object they will  */
23984 			/* have to be picked up from the pager through the  */
23985 			/* fault mechanism.  */
23986 			if (VME_OBJECT(entry)->vo_shadow_offset == 0) {
23987 				/* need to call vm_fault */
23988 				vm_map_unlock(map);
23989 				vm_fault(map, map_offset, VM_PROT_NONE,
23990 				    FALSE /* change_wiring */, VM_KERN_MEMORY_NONE,
23991 				    THREAD_UNINT, NULL, 0);
23992 				vm_map_lock(map);
23993 				continue;
23994 			}
23995 			offset = (VME_OFFSET(entry) +
23996 			    (map_offset - entry->vme_start));
23997 			phys_page = (ppnum_t)
23998 			    ((VME_OBJECT(entry)->vo_shadow_offset
23999 			    + offset) >> PAGE_SHIFT);
24000 			break;
24001 		}
24002 		offset = (VME_OFFSET(entry) + (map_offset - entry->vme_start));
24003 		object = VME_OBJECT(entry);
24004 		vm_object_lock(object);
24005 		while (TRUE) {
24006 			vm_page_t dst_page = vm_page_lookup(object, offset);
24007 			if (dst_page == VM_PAGE_NULL) {
24008 				if (object->shadow) {
24009 					vm_object_t old_object;
24010 					vm_object_lock(object->shadow);
24011 					old_object = object;
24012 					offset = offset + object->vo_shadow_offset;
24013 					object = object->shadow;
24014 					vm_object_unlock(old_object);
24015 				} else {
24016 					vm_object_unlock(object);
24017 					break;
24018 				}
24019 			} else {
24020 				phys_page = (ppnum_t)(VM_PAGE_GET_PHYS_PAGE(dst_page));
24021 				vm_object_unlock(object);
24022 				break;
24023 			}
24024 		}
24025 		break;
24026 	}
24027 
24028 	vm_map_unlock(map);
24029 	return phys_page;
24030 }
24031 
24032 #if CONFIG_MAP_RANGES
24033 static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
24034 static bitmap_t vm_map_user_range_large_file_map[BITMAP_LEN(VM_MEMORY_COUNT)];
24035 
24036 static_assert((int)UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
24037 static_assert((int)UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
24038 
24039 /*
24040  * vm_map_range_map_init:
24041  *  initializes the VM range ID map to enable index lookup
24042  *  of user VM ranges based on VM tag from userspace.
24043  */
24044 static void
vm_map_range_map_init(void)24045 vm_map_range_map_init(void)
24046 {
24047 	/*
24048 	 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
24049 	 * - the former is malloc metadata which should be kept separate
24050 	 * - the latter has its own ranges
24051 	 */
24052 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
24053 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
24054 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
24055 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
24056 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
24057 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
24058 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
24059 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
24060 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
24061 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
24062 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
24063 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
24064 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
24065 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
24066 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
24067 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
24068 	bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOACCELERATOR);
24069 	bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOSURFACE);
24070 }
24071 
24072 static struct mach_vm_range
vm_map_range_random_uniform(vm_map_size_t req_size,vm_map_offset_t min_addr,vm_map_offset_t max_addr,vm_map_offset_t offmask)24073 vm_map_range_random_uniform(
24074 	vm_map_size_t           req_size,
24075 	vm_map_offset_t         min_addr,
24076 	vm_map_offset_t         max_addr,
24077 	vm_map_offset_t         offmask)
24078 {
24079 	vm_map_offset_t random_addr;
24080 	struct mach_vm_range alloc;
24081 
24082 	req_size = (req_size + offmask) & ~offmask;
24083 	min_addr = (min_addr + offmask) & ~offmask;
24084 	max_addr = max_addr & ~offmask;
24085 
24086 	read_random(&random_addr, sizeof(random_addr));
24087 	random_addr %= (max_addr - req_size - min_addr);
24088 	random_addr &= ~offmask;
24089 
24090 	alloc.min_address = min_addr + random_addr;
24091 	alloc.max_address = min_addr + random_addr + req_size;
24092 	return alloc;
24093 }
24094 
24095 static vm_map_offset_t
vm_map_range_offmask(void)24096 vm_map_range_offmask(void)
24097 {
24098 	uint32_t pte_depth;
24099 
24100 	/*
24101 	 * PTE optimizations
24102 	 *
24103 	 *
24104 	 * 16k pages systems
24105 	 * ~~~~~~~~~~~~~~~~~
24106 	 *
24107 	 * A single L1 (sub-)page covers the address space.
24108 	 * - L2 pages cover 64G,
24109 	 * - L3 pages cover 32M.
24110 	 *
24111 	 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
24112 	 * As a result, we really only need to align the ranges to 32M to avoid
24113 	 * partial L3 pages.
24114 	 *
24115 	 * On macOS, the usage of L2 pages will increase, so as a result we will
24116 	 * want to align ranges to 64G in order to utilize them fully.
24117 	 *
24118 	 *
24119 	 * 4k pages systems
24120 	 * ~~~~~~~~~~~~~~~~
24121 	 *
24122 	 * A single L0 (sub-)page covers the address space.
24123 	 * - L1 pages cover 512G,
24124 	 * - L2 pages cover 1G,
24125 	 * - L3 pages cover 2M.
24126 	 *
24127 	 * The long tail of processes on a system will tend to have a VA usage
24128 	 * (ignoring the shared regions) in the 100s of MB order of magnitnude.
24129 	 * This is achievable with a single L1 and a few L2s without
24130 	 * randomization.
24131 	 *
24132 	 * However once randomization is introduced, the system will immediately
24133 	 * need several L1s and many more L2s. As a result:
24134 	 *
24135 	 * - on embedded devices, the cost of these extra pages isn't
24136 	 *   sustainable, and we just disable the feature entirely,
24137 	 *
24138 	 * - on macOS we align ranges to a 512G boundary so that the extra L1
24139 	 *   pages can be used to their full potential.
24140 	 */
24141 
24142 	/*
24143 	 * note, this function assumes _non exotic mappings_
24144 	 * which is why it uses the native kernel's PAGE_SHIFT.
24145 	 */
24146 #if XNU_PLATFORM_MacOSX
24147 	pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
24148 #else /* !XNU_PLATFORM_MacOSX */
24149 	pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
24150 #endif /* !XNU_PLATFORM_MacOSX */
24151 
24152 	if (pte_depth == 0) {
24153 		return 0;
24154 	}
24155 
24156 	return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
24157 }
24158 
24159 /*
24160  * vm_map_range_configure:
24161  *	configures the user vm_map ranges by increasing the maximum VA range of
24162  *  the map and carving out a range at the end of VA space (searching backwards
24163  *  in the newly expanded map).
24164  */
24165 kern_return_t
vm_map_range_configure(vm_map_t map,__unused bool needs_extra_jumbo_va)24166 vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va)
24167 {
24168 	const vm_map_offset_t offmask = vm_map_range_offmask();
24169 	struct mach_vm_range data_range;
24170 	vm_map_offset_t default_end;
24171 	kern_return_t kr;
24172 
24173 	if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
24174 		/*
24175 		 * No point doing vm ranges in a 32bit address space.
24176 		 */
24177 		return KERN_NOT_SUPPORTED;
24178 	}
24179 
24180 	/* Should not be applying ranges to kernel map or kernel map submaps */
24181 	assert(vm_map_pmap(map) != kernel_pmap);
24182 
24183 #if XNU_PLATFORM_MacOSX
24184 
24185 	/*
24186 	 * on macOS, the address space is a massive 47 bits (128T),
24187 	 * with several carve outs that processes can't use:
24188 	 * - the shared region
24189 	 * - the commpage region
24190 	 * - the GPU carve out (if applicable)
24191 	 *
24192 	 * and when nano-malloc is in use it desires memory at the 96T mark.
24193 	 *
24194 	 * However, their location is architecture dependent:
24195 	 * - On intel, the shared region and commpage are
24196 	 *   at the very end of the usable address space (above +127T),
24197 	 *   and there is no GPU carve out, and pthread wants to place
24198 	 *   threads at the 112T mark (0x70T).
24199 	 *
24200 	 * - On arm64, these are in the same spot as on embedded devices:
24201 	 *   o shared region:   [ 6G,  10G)  [ will likely grow over time ]
24202 	 *   o commpage region: [63G,  64G)
24203 	 *   o GPU carve out:   [64G, 448G)
24204 	 *
24205 	 * This is conveninent because the mappings at the end of the address
24206 	 * space (when they exist) are made by the kernel.
24207 	 *
24208 	 * The policy is to allocate a random 1T for the data heap
24209 	 * in the end of the address-space in the:
24210 	 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
24211 	 * - [0x61, 0x7f) range on ASM (to leave space for Nano malloc).
24212 	 */
24213 
24214 	/* see NANOZONE_SIGNATURE in libmalloc */
24215 #if __x86_64__
24216 	default_end = 0x71ull << 40;
24217 #else
24218 	default_end = 0x61ull << 40;
24219 #endif
24220 	data_range  = vm_map_range_random_uniform(1ull << 40,
24221 	        default_end, 0x7full << 40, offmask);
24222 
24223 #else /* !XNU_PLATFORM_MacOSX */
24224 
24225 	/*
24226 	 * Embedded devices:
24227 	 *
24228 	 *   The default VA Size scales with the device physical memory.
24229 	 *
24230 	 *   Out of that:
24231 	 *   - the "zero" page typically uses 4G + some slide
24232 	 *   - the shared region uses SHARED_REGION_SIZE bytes (4G)
24233 	 *
24234 	 *   Without the use of jumbo or any adjustment to the address space,
24235 	 *   a default VM map typically looks like this:
24236 	 *
24237 	 *       0G -->╒════════════╕
24238 	 *             │  pagezero  │
24239 	 *             │  + slide   │
24240 	 *      ~4G -->╞════════════╡<-- vm_map_min(map)
24241 	 *             │            │
24242 	 *       6G -->├────────────┤
24243 	 *             │   shared   │
24244 	 *             │   region   │
24245 	 *      10G -->├────────────┤
24246 	 *             │            │
24247 	 *   max_va -->├────────────┤<-- vm_map_max(map)
24248 	 *             │            │
24249 	 *             ╎   jumbo    ╎
24250 	 *             ╎            ╎
24251 	 *             │            │
24252 	 *      63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
24253 	 *             │  commpage  │
24254 	 *      64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
24255 	 *             │            │
24256 	 *             ╎    GPU     ╎
24257 	 *             ╎  carveout  ╎
24258 	 *             │            │
24259 	 *     448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
24260 	 *             │            │
24261 	 *             ╎            ╎
24262 	 *             ╎            ╎
24263 	 *             │            │
24264 	 *     512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
24265 	 *
24266 	 *   When this drawing was made, "max_va" was smaller than
24267 	 *   ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
24268 	 *   12G of address space for the zero-page, slide, files,
24269 	 *   binaries, heap ...
24270 	 *
24271 	 *   We will want to make a "heap/data" carve out inside
24272 	 *   the jumbo range of half of that usable space, assuming
24273 	 *   that this is less than a forth of the jumbo range.
24274 	 *
24275 	 *   The assert below intends to catch when max_va grows
24276 	 *   too large for this heuristic.
24277 	 */
24278 
24279 	vm_map_lock_read(map);
24280 	default_end = vm_map_max(map);
24281 	vm_map_unlock_read(map);
24282 
24283 	/*
24284 	 * Check that we're not already jumbo'd,
24285 	 * or our address space was somehow modified.
24286 	 *
24287 	 * If so we cannot guarantee that we can set up the ranges
24288 	 * safely without interfering with the existing map.
24289 	 */
24290 	if (default_end > vm_compute_max_offset(true)) {
24291 		return KERN_NO_SPACE;
24292 	}
24293 
24294 	if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
24295 		/*
24296 		 * an override boot-arg was set, disable user-ranges
24297 		 *
24298 		 * XXX: this is problematic because it means these boot-args
24299 		 *      no longer test the behavior changing the value
24300 		 *      of ARM64_MAX_OFFSET_DEVICE_* would have.
24301 		 */
24302 		return KERN_NOT_SUPPORTED;
24303 	}
24304 
24305 	/* expand the default VM space to 64GB */
24306 	vm_map_set_jumbo(map);
24307 
24308 	assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end);
24309 	data_range = vm_map_range_random_uniform(GiB(10),
24310 	    default_end + PAGE_SIZE, vm_map_max(map), offmask);
24311 
24312 #endif /* !XNU_PLATFORM_MacOSX */
24313 
24314 	/*
24315 	 * Poke holes so that ASAN or people listing regions
24316 	 * do not think this space is free.
24317 	 */
24318 
24319 	if (default_end != data_range.min_address) {
24320 		kr = vm_map_enter(map, &default_end,
24321 		    data_range.min_address - default_end,
24322 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24323 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24324 		assert(kr == KERN_SUCCESS);
24325 	}
24326 
24327 	if (data_range.max_address != vm_map_max(map)) {
24328 		vm_map_entry_t entry;
24329 		vm_size_t size;
24330 
24331 		/*
24332 		 * Extend the end of the hole to the next VM entry or the end of the map,
24333 		 * whichever comes first.
24334 		 */
24335 		vm_map_lock_read(map);
24336 		vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
24337 		if (entry == vm_map_to_entry(map) || entry->vme_start > vm_map_max(map)) {
24338 			size = vm_map_max(map) - data_range.max_address;
24339 		} else {
24340 			size = entry->vme_start - data_range.max_address;
24341 		}
24342 		vm_map_unlock_read(map);
24343 
24344 		kr = vm_map_enter(map, &data_range.max_address, size,
24345 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24346 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24347 		assert(kr == KERN_SUCCESS);
24348 	}
24349 
24350 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24351 	if (needs_extra_jumbo_va) {
24352 		/* This will grow the address space to MACH_VM_MAX_ADDRESS */
24353 		vm_map_set_extra_jumbo(map);
24354 	}
24355 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24356 
24357 	vm_map_lock(map);
24358 	map->default_range.min_address = vm_map_min(map);
24359 	map->default_range.max_address = default_end;
24360 	map->data_range = data_range;
24361 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24362 	/* If process has "extra jumbo" entitlement, enable large file range */
24363 	if (needs_extra_jumbo_va) {
24364 		map->large_file_range = vm_map_range_random_uniform(TiB(1),
24365 		    MACH_VM_JUMBO_ADDRESS, MACH_VM_MAX_ADDRESS, offmask);
24366 	}
24367 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24368 	map->uses_user_ranges = true;
24369 	vm_map_unlock(map);
24370 
24371 	return KERN_SUCCESS;
24372 }
24373 
24374 /*
24375  * vm_map_range_fork:
24376  *	clones the array of ranges from old_map to new_map in support
24377  *  of a VM map fork.
24378  */
24379 void
vm_map_range_fork(vm_map_t new_map,vm_map_t old_map)24380 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
24381 {
24382 	if (!old_map->uses_user_ranges) {
24383 		/* nothing to do */
24384 		return;
24385 	}
24386 
24387 	new_map->default_range = old_map->default_range;
24388 	new_map->data_range = old_map->data_range;
24389 
24390 	if (old_map->extra_ranges_count) {
24391 		vm_map_user_range_t otable, ntable;
24392 		uint16_t count;
24393 
24394 		otable = old_map->extra_ranges;
24395 		count  = old_map->extra_ranges_count;
24396 		ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
24397 		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
24398 		memcpy(ntable, otable,
24399 		    count * sizeof(struct vm_map_user_range));
24400 
24401 		new_map->extra_ranges_count = count;
24402 		new_map->extra_ranges = ntable;
24403 	}
24404 
24405 	new_map->uses_user_ranges = true;
24406 }
24407 
24408 /*
24409  * vm_map_get_user_range:
24410  *	copy the VM user range for the given VM map and range ID.
24411  */
24412 kern_return_t
vm_map_get_user_range(vm_map_t map,vm_map_range_id_t range_id,mach_vm_range_t range)24413 vm_map_get_user_range(
24414 	vm_map_t                map,
24415 	vm_map_range_id_t       range_id,
24416 	mach_vm_range_t         range)
24417 {
24418 	if (map == NULL || !map->uses_user_ranges || range == NULL) {
24419 		return KERN_INVALID_ARGUMENT;
24420 	}
24421 
24422 	switch (range_id) {
24423 	case UMEM_RANGE_ID_DEFAULT:
24424 		*range = map->default_range;
24425 		return KERN_SUCCESS;
24426 
24427 	case UMEM_RANGE_ID_HEAP:
24428 		*range = map->data_range;
24429 		return KERN_SUCCESS;
24430 
24431 	case UMEM_RANGE_ID_LARGE_FILE:
24432 		/*
24433 		 * Because this function tells a user-space process about the user
24434 		 * ranges in its VM map, this case communicates whether the large file
24435 		 * range is in use. Note that this is different from how the large file
24436 		 * range ID is handled in `vm_map_get_range()`: there, we "resolve" the
24437 		 * VA policy and return either the large file range or data range,
24438 		 * depending on whether the large file range is enabled.
24439 		 */
24440 		if (map->large_file_range.min_address != map->large_file_range.max_address) {
24441 			/* large file range is configured and should be used */
24442 			*range = map->large_file_range;
24443 		} else {
24444 			return KERN_INVALID_ARGUMENT;
24445 		}
24446 		return KERN_SUCCESS;
24447 
24448 	default:
24449 		return KERN_INVALID_ARGUMENT;
24450 	}
24451 }
24452 
24453 static vm_map_range_id_t
vm_map_user_range_resolve(vm_map_t map,mach_vm_address_t addr,mach_vm_size_t size,mach_vm_range_t range)24454 vm_map_user_range_resolve(
24455 	vm_map_t                map,
24456 	mach_vm_address_t       addr,
24457 	mach_vm_size_t          size,
24458 	mach_vm_range_t         range)
24459 {
24460 	struct mach_vm_range tmp;
24461 
24462 	vm_map_lock_assert_held(map);
24463 
24464 	static_assert((int)UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
24465 	static_assert((int)UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
24466 
24467 	if (mach_vm_range_contains(&map->default_range, addr, size)) {
24468 		if (range) {
24469 			*range = map->default_range;
24470 		}
24471 		return UMEM_RANGE_ID_DEFAULT;
24472 	}
24473 
24474 	if (mach_vm_range_contains(&map->data_range, addr, size)) {
24475 		if (range) {
24476 			*range = map->data_range;
24477 		}
24478 		return UMEM_RANGE_ID_HEAP;
24479 	}
24480 
24481 	if (mach_vm_range_contains(&map->large_file_range, addr, size)) {
24482 		if (range) {
24483 			*range = map->large_file_range;
24484 		}
24485 		return UMEM_RANGE_ID_LARGE_FILE;
24486 	}
24487 
24488 	for (size_t i = 0; i < map->extra_ranges_count; i++) {
24489 		vm_map_user_range_t r = &map->extra_ranges[i];
24490 
24491 		tmp.min_address = r->vmur_min_address;
24492 		tmp.max_address = r->vmur_max_address;
24493 
24494 		if (mach_vm_range_contains(&tmp, addr, size)) {
24495 			if (range) {
24496 				*range = tmp;
24497 			}
24498 			return r->vmur_range_id;
24499 		}
24500 	}
24501 
24502 	if (range) {
24503 		range->min_address = range->max_address = 0;
24504 	}
24505 	return UMEM_RANGE_ID_DEFAULT;
24506 }
24507 #endif /* CONFIG_MAP_RANGES */
24508 
24509 void
vm_map_kernel_flags_update_range_id(vm_map_kernel_flags_t * vmkf,vm_map_t map,__unused vm_map_size_t size)24510 vm_map_kernel_flags_update_range_id(
24511 	vm_map_kernel_flags_t *vmkf,
24512 	vm_map_t map,
24513 	__unused vm_map_size_t size)
24514 {
24515 	if (map == kernel_map) {
24516 		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
24517 			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
24518 		}
24519 #if CONFIG_MAP_RANGES
24520 	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
24521 	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT) {
24522 		if (bitmap_test(vm_map_user_range_large_file_map, vmkf->vm_tag)
24523 		    || size >= VM_LARGE_FILE_THRESHOLD) {
24524 			/*
24525 			 * if the map doesn't have the large file range configured,
24526 			 * the range will get resolved to the heap range in `vm_map_get_range`
24527 			 */
24528 			vmkf->vmkf_range_id = UMEM_RANGE_ID_LARGE_FILE;
24529 		} else if (bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
24530 			vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
24531 		}
24532 #endif /* CONFIG_MAP_RANGES */
24533 	}
24534 }
24535 
24536 /*
24537  * vm_map_entry_has_device_pager:
24538  * Check if the vm map entry specified by the virtual address has a device pager.
24539  * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
24540  */
24541 boolean_t
vm_map_entry_has_device_pager(vm_map_t map,vm_map_offset_t vaddr)24542 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
24543 {
24544 	vm_map_entry_t entry;
24545 	vm_object_t object;
24546 	boolean_t result;
24547 
24548 	if (map == NULL) {
24549 		return FALSE;
24550 	}
24551 
24552 	vm_map_lock(map);
24553 	while (TRUE) {
24554 		if (!vm_map_lookup_entry(map, vaddr, &entry)) {
24555 			result = FALSE;
24556 			break;
24557 		}
24558 		if (entry->is_sub_map) {
24559 			// Check the submap
24560 			vm_map_t submap = VME_SUBMAP(entry);
24561 			assert(submap != NULL);
24562 			vm_map_lock(submap);
24563 			vm_map_unlock(map);
24564 			map = submap;
24565 			continue;
24566 		}
24567 		object = VME_OBJECT(entry);
24568 		if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
24569 			result = TRUE;
24570 			break;
24571 		}
24572 		result = FALSE;
24573 		break;
24574 	}
24575 
24576 	vm_map_unlock(map);
24577 	return result;
24578 }
24579 
24580 #if MACH_ASSERT
24581 
24582 extern int pmap_ledgers_panic;
24583 extern int pmap_ledgers_panic_leeway;
24584 
24585 #define LEDGER_DRIFT(__LEDGER)                    \
24586 	int             __LEDGER##_over;          \
24587 	ledger_amount_t __LEDGER##_over_total;    \
24588 	ledger_amount_t __LEDGER##_over_max;      \
24589 	int             __LEDGER##_under;         \
24590 	ledger_amount_t __LEDGER##_under_total;   \
24591 	ledger_amount_t __LEDGER##_under_max
24592 
24593 struct {
24594 	uint64_t        num_pmaps_checked;
24595 
24596 	LEDGER_DRIFT(phys_footprint);
24597 	LEDGER_DRIFT(internal);
24598 	LEDGER_DRIFT(internal_compressed);
24599 	LEDGER_DRIFT(external);
24600 	LEDGER_DRIFT(reusable);
24601 	LEDGER_DRIFT(iokit_mapped);
24602 	LEDGER_DRIFT(alternate_accounting);
24603 	LEDGER_DRIFT(alternate_accounting_compressed);
24604 	LEDGER_DRIFT(page_table);
24605 	LEDGER_DRIFT(purgeable_volatile);
24606 	LEDGER_DRIFT(purgeable_nonvolatile);
24607 	LEDGER_DRIFT(purgeable_volatile_compressed);
24608 	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
24609 	LEDGER_DRIFT(tagged_nofootprint);
24610 	LEDGER_DRIFT(tagged_footprint);
24611 	LEDGER_DRIFT(tagged_nofootprint_compressed);
24612 	LEDGER_DRIFT(tagged_footprint_compressed);
24613 	LEDGER_DRIFT(network_volatile);
24614 	LEDGER_DRIFT(network_nonvolatile);
24615 	LEDGER_DRIFT(network_volatile_compressed);
24616 	LEDGER_DRIFT(network_nonvolatile_compressed);
24617 	LEDGER_DRIFT(media_nofootprint);
24618 	LEDGER_DRIFT(media_footprint);
24619 	LEDGER_DRIFT(media_nofootprint_compressed);
24620 	LEDGER_DRIFT(media_footprint_compressed);
24621 	LEDGER_DRIFT(graphics_nofootprint);
24622 	LEDGER_DRIFT(graphics_footprint);
24623 	LEDGER_DRIFT(graphics_nofootprint_compressed);
24624 	LEDGER_DRIFT(graphics_footprint_compressed);
24625 	LEDGER_DRIFT(neural_nofootprint);
24626 	LEDGER_DRIFT(neural_footprint);
24627 	LEDGER_DRIFT(neural_nofootprint_compressed);
24628 	LEDGER_DRIFT(neural_footprint_compressed);
24629 	LEDGER_DRIFT(neural_nofootprint_total);
24630 } pmap_ledgers_drift;
24631 
24632 void
vm_map_pmap_check_ledgers(pmap_t pmap,ledger_t ledger,int pid,char * procname)24633 vm_map_pmap_check_ledgers(
24634 	pmap_t          pmap,
24635 	ledger_t        ledger,
24636 	int             pid,
24637 	char            *procname)
24638 {
24639 	ledger_amount_t bal;
24640 	boolean_t       do_panic;
24641 
24642 	do_panic = FALSE;
24643 
24644 	pmap_ledgers_drift.num_pmaps_checked++;
24645 
24646 #define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
24647 MACRO_BEGIN                                                             \
24648 	int panic_on_negative = TRUE;                                   \
24649 	ledger_get_balance(ledger,                                      \
24650 	                   task_ledgers.__LEDGER,                       \
24651 	                   &bal);                                       \
24652 	ledger_get_panic_on_negative(ledger,                            \
24653 	                             task_ledgers.__LEDGER,             \
24654 	                             &panic_on_negative);               \
24655 	if (bal != 0) {                                                 \
24656 	        if (panic_on_negative ||                                \
24657 	            (pmap_ledgers_panic &&                              \
24658 	             pmap_ledgers_panic_leeway > 0 &&                   \
24659 	             (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
24660 	              bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
24661 	                do_panic = TRUE;                                \
24662 	        }                                                       \
24663 	        printf("LEDGER BALANCE proc %d (%s) "                   \
24664 	               "\"%s\" = %lld\n",                               \
24665 	               pid, procname, #__LEDGER, bal);                  \
24666 	        if (bal > 0) {                                          \
24667 	                pmap_ledgers_drift.__LEDGER##_over++;           \
24668 	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
24669 	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
24670 	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
24671 	                }                                               \
24672 	        } else if (bal < 0) {                                   \
24673 	                pmap_ledgers_drift.__LEDGER##_under++;          \
24674 	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
24675 	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
24676 	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
24677 	                }                                               \
24678 	        }                                                       \
24679 	}                                                               \
24680 MACRO_END
24681 
24682 	LEDGER_CHECK_BALANCE(phys_footprint);
24683 	LEDGER_CHECK_BALANCE(internal);
24684 	LEDGER_CHECK_BALANCE(internal_compressed);
24685 	LEDGER_CHECK_BALANCE(external);
24686 	LEDGER_CHECK_BALANCE(reusable);
24687 	LEDGER_CHECK_BALANCE(iokit_mapped);
24688 	LEDGER_CHECK_BALANCE(alternate_accounting);
24689 	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
24690 	LEDGER_CHECK_BALANCE(page_table);
24691 	LEDGER_CHECK_BALANCE(purgeable_volatile);
24692 	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
24693 	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
24694 	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
24695 	LEDGER_CHECK_BALANCE(tagged_nofootprint);
24696 	LEDGER_CHECK_BALANCE(tagged_footprint);
24697 	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
24698 	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
24699 	LEDGER_CHECK_BALANCE(network_volatile);
24700 	LEDGER_CHECK_BALANCE(network_nonvolatile);
24701 	LEDGER_CHECK_BALANCE(network_volatile_compressed);
24702 	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
24703 	LEDGER_CHECK_BALANCE(media_nofootprint);
24704 	LEDGER_CHECK_BALANCE(media_footprint);
24705 	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
24706 	LEDGER_CHECK_BALANCE(media_footprint_compressed);
24707 	LEDGER_CHECK_BALANCE(graphics_nofootprint);
24708 	LEDGER_CHECK_BALANCE(graphics_footprint);
24709 	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
24710 	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
24711 	LEDGER_CHECK_BALANCE(neural_nofootprint);
24712 	LEDGER_CHECK_BALANCE(neural_footprint);
24713 	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
24714 	LEDGER_CHECK_BALANCE(neural_footprint_compressed);
24715 	LEDGER_CHECK_BALANCE(neural_nofootprint_total);
24716 
24717 	if (do_panic) {
24718 		if (pmap_ledgers_panic) {
24719 			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
24720 			    pmap, pid, procname);
24721 		} else {
24722 			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
24723 			    pmap, pid, procname);
24724 		}
24725 	}
24726 }
24727 
24728 void
vm_map_pmap_set_process(vm_map_t map,int pid,char * procname)24729 vm_map_pmap_set_process(
24730 	vm_map_t map,
24731 	int pid,
24732 	char *procname)
24733 {
24734 	pmap_set_process(vm_map_pmap(map), pid, procname);
24735 }
24736 
24737 #endif /* MACH_ASSERT */
24738 
24739 /**
24740  * Check if a given given map operation size is valid for the given map, taking
24741  * in to account whether or not the map operation has overridden the soft limit.
24742  *
24743  * This function is meant to be inlined wherever possible as it can, in some
24744  * modes, generates telemetry events which capture shallow backtraces. To
24745  * maximize the usefulness of this backtrace, we want to minize the depth at
24746  * which the backtrace is taken.
24747  */
24748 __attribute__((always_inline))
24749 bool
vm_map_is_map_size_valid(vm_map_t target_map,vm_size_t size,bool no_soft_limit)24750 vm_map_is_map_size_valid(
24751 	vm_map_t target_map,
24752 	vm_size_t size,
24753 	bool no_soft_limit)
24754 {
24755 #ifdef __x86_64__
24756 	// Do not enforce any additional limits on x64
24757 	(void)target_map;
24758 	(void)size;
24759 	(void)no_soft_limit;
24760 	return true;
24761 #else
24762 	if (__probable(target_map->pmap != kernel_pmap ||
24763 	    size < VM_KERNEL_SIMPLE_MAX_SIZE || no_soft_limit)) {
24764 		// Allocation size matches policy
24765 		return true;
24766 	}
24767 
24768 	switch (vm_map_kernel_alloc_limit_mode) {
24769 	default:
24770 	case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS:
24771 		return true;
24772 	case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP:
24773 		trap_telemetry_report_kernel_soft_error(
24774 			TRAP_TELEMETRY_KERNEL_SOFT_ERROR_VM_KERNEL_MAX_ALLOC_SIZE,
24775 			/* report_once_per_site */ false);
24776 		return true;
24777 	case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_REJECT:
24778 		return false;
24779 	case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_PANIC:
24780 		panic("1,000,000K ought to be enough for anybody "
24781 		    "(requested %lu bytes)", size);
24782 	}
24783 #endif /* __x86_64__ */
24784 }
24785 
24786 vm_map_serial_t
vm_map_maybe_serial_id(vm_map_t maybe_vm_map)24787 vm_map_maybe_serial_id(vm_map_t maybe_vm_map)
24788 {
24789 	return maybe_vm_map != NULL ? maybe_vm_map->serial_id : VM_MAP_SERIAL_NONE;
24790 }
24791