xref: /xnu-11417.101.15/osfmk/vm/vm_map.c (revision e3723e1f17661b24996789d8afc084c0c3303b26)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_map.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Virtual memory mapping module.
64  */
65 
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68 
69 #include <vm/vm_options.h>
70 
71 #include <libkern/OSAtomic.h>
72 
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm_server.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83 
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 #include <kern/telemetry.h>
91 #include <kern/trap_telemetry.h>
92 
93 #include <vm/cpm_internal.h>
94 #include <vm/memory_types.h>
95 #include <vm/vm_compressor_xnu.h>
96 #include <vm/vm_compressor_pager_internal.h>
97 #include <vm/vm_init_xnu.h>
98 #include <vm/vm_fault_internal.h>
99 #include <vm/vm_map_internal.h>
100 #include <vm/vm_object_internal.h>
101 #include <vm/vm_page_internal.h>
102 #include <vm/vm_pageout.h>
103 #include <vm/pmap.h>
104 #include <vm/vm_kern_internal.h>
105 #include <ipc/ipc_port.h>
106 #include <kern/sched_prim.h>
107 #include <kern/misc_protos.h>
108 
109 #include <mach/vm_map_server.h>
110 #include <mach/mach_host_server.h>
111 #include <vm/vm_memtag.h>
112 #include <vm/vm_protos_internal.h>
113 #include <vm/vm_purgeable_internal.h>
114 
115 #include <vm/vm_iokit.h>
116 #include <vm/vm_shared_region_internal.h>
117 #include <vm/vm_map_store_internal.h>
118 #include <vm/vm_memory_entry_xnu.h>
119 #include <vm/memory_object_internal.h>
120 #include <vm/vm_memory_entry.h>
121 #include <vm/vm_sanitize_internal.h>
122 #include <vm/vm_reclaim_xnu.h>
123 #if DEVELOPMENT || DEBUG
124 #include <vm/vm_compressor_info.h>
125 #endif /* DEVELOPMENT || DEBUG */
126 #include <san/kasan.h>
127 
128 #include <sys/resource.h>
129 #include <sys/random.h>
130 #include <sys/codesign.h>
131 #include <sys/code_signing.h>
132 #include <sys/mman.h>
133 #include <sys/reboot.h>
134 #include <sys/kdebug_triage.h>
135 #include <sys/reason.h>
136 
137 #include <os/log.h>
138 
139 #include <libkern/section_keywords.h>
140 
141 #include <os/hash.h>
142 
143 #if DEVELOPMENT || DEBUG
144 extern int proc_selfcsflags(void);
145 int vm_log_xnu_user_debug = 0;
146 int panic_on_unsigned_execute = 0;
147 int panic_on_mlock_failure = 0;
148 #endif /* DEVELOPMENT || DEBUG */
149 
150 #if DEVELOPMENT || DEBUG
151 int debug4k_filter = 0;
152 char debug4k_proc_name[1024] = "";
153 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
154 int debug4k_panic_on_misaligned_sharing = 0;
155 const char *debug4k_category_name[] = {
156 	"error",        /* 0 */
157 	"life",         /* 1 */
158 	"load",         /* 2 */
159 	"fault",        /* 3 */
160 	"copy",         /* 4 */
161 	"share",        /* 5 */
162 	"adjust",       /* 6 */
163 	"pmap",         /* 7 */
164 	"mementry",     /* 8 */
165 	"iokit",        /* 9 */
166 	"upl",          /* 10 */
167 	"exc",          /* 11 */
168 	"vfs"           /* 12 */
169 };
170 #endif /* DEVELOPMENT || DEBUG */
171 int debug4k_no_cow_copyin = 0;
172 
173 
174 #if __arm64__
175 extern const int fourk_binary_compatibility_unsafe;
176 #endif /* __arm64__ */
177 extern int proc_selfpid(void);
178 extern char *proc_name_address(void *p);
179 extern const char *proc_best_name(struct proc *p);
180 
181 #if VM_MAP_DEBUG_APPLE_PROTECT
182 int vm_map_debug_apple_protect = 0;
183 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
184 #if VM_MAP_DEBUG_FOURK
185 int vm_map_debug_fourk = 0;
186 #endif /* VM_MAP_DEBUG_FOURK */
187 
188 #if DEBUG || DEVELOPMENT
189 static TUNABLE(bool, vm_map_executable_immutable,
190     "vm_map_executable_immutable", true);
191 #else
192 #define vm_map_executable_immutable true
193 #endif
194 
195 /** Do not enforce the kernel allocation size limit */
196 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS (0)
197 /** Enforce the kernel allocation limit by refusing too large requests */
198 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_REJECT (1)
199 /** Enforce the kernel allocation limit by panicking on any too large request */
200 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_PANIC (2)
201 /** Do not enforce the kernel allocation limit but generate a telemetry trap */
202 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP (3)
203 
204 #if DEVELOPMENT || DEBUG
205 static TUNABLE(int, vm_map_kernel_alloc_limit_mode,
206     "vm_map_kernel_alloc_limit_mode", VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP);
207 #else
208 #define vm_map_kernel_alloc_limit_mode VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS
209 #endif /* DEVELOPMENT || DEBUG */
210 
211 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
212 
213 extern u_int32_t random(void);  /* from <libkern/libkern.h> */
214 /* Internal prototypes
215  */
216 
217 typedef struct vm_map_zap {
218 	vm_map_entry_t          vmz_head;
219 	vm_map_entry_t         *vmz_tail;
220 } *vm_map_zap_t;
221 
222 #define VM_MAP_ZAP_DECLARE(zap) \
223 	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
224 
225 extern kern_return_t vm_map_wire_external(
226 	vm_map_t                map,
227 	vm_map_offset_ut        start_u,
228 	vm_map_offset_ut        end_u,
229 	vm_prot_ut              prot_u,
230 	boolean_t               user_wire) __exported;
231 
232 #if XNU_PLATFORM_MacOSX
233 extern /* exported via Private.<arch>.MacOSX.exports on macOS */
234 #else
235 static
236 #endif
237 kern_return_t vm_map_copyin_common(
238 	vm_map_t                src_map,
239 	vm_map_address_ut       src_addr,
240 	vm_map_size_ut          len,
241 	boolean_t               src_destroy,
242 	boolean_t               src_volatile,
243 	vm_map_copy_t          *copy_result,                           /* OUT */
244 	boolean_t               use_maxprot);
245 
246 static vm_map_entry_t   vm_map_entry_insert(
247 	vm_map_t                map,
248 	vm_map_entry_t          insp_entry,
249 	vm_map_offset_t         start,
250 	vm_map_offset_t         end,
251 	vm_object_t             object,
252 	vm_object_offset_t      offset,
253 	vm_map_kernel_flags_t   vmk_flags,
254 	boolean_t               needs_copy,
255 	vm_prot_t               cur_protection,
256 	vm_prot_t               max_protection,
257 	vm_inherit_t            inheritance,
258 	boolean_t               clear_map_aligned);
259 
260 static void vm_map_simplify_range(
261 	vm_map_t        map,
262 	vm_map_offset_t start,
263 	vm_map_offset_t end);   /* forward */
264 
265 static boolean_t        vm_map_range_check(
266 	vm_map_t        map,
267 	vm_map_offset_t start,
268 	vm_map_offset_t end,
269 	vm_map_entry_t  *entry);
270 
271 static void vm_map_submap_pmap_clean(
272 	vm_map_t        map,
273 	vm_map_offset_t start,
274 	vm_map_offset_t end,
275 	vm_map_t        sub_map,
276 	vm_map_offset_t offset);
277 
278 static void             vm_map_pmap_enter(
279 	vm_map_t                map,
280 	vm_map_offset_t         addr,
281 	vm_map_offset_t         end_addr,
282 	vm_object_t             object,
283 	vm_object_offset_t      offset,
284 	vm_prot_t               protection);
285 
286 static void             _vm_map_clip_end(
287 	struct vm_map_header    *map_header,
288 	vm_map_entry_t          entry,
289 	vm_map_offset_t         end);
290 
291 static void             _vm_map_clip_start(
292 	struct vm_map_header    *map_header,
293 	vm_map_entry_t          entry,
294 	vm_map_offset_t         start);
295 
296 static kmem_return_t vm_map_delete(
297 	vm_map_t        map,
298 	vm_map_offset_t start,
299 	vm_map_offset_t end,
300 	vmr_flags_t     flags,
301 	kmem_guard_t    guard,
302 	vm_map_zap_t    zap);
303 
304 static void             vm_map_copy_insert(
305 	vm_map_t        map,
306 	vm_map_entry_t  after_where,
307 	vm_map_copy_t   copy);
308 
309 static kern_return_t    vm_map_copy_overwrite_unaligned(
310 	vm_map_t        dst_map,
311 	vm_map_entry_t  entry,
312 	vm_map_copy_t   copy,
313 	vm_map_address_t start,
314 	boolean_t       discard_on_success);
315 
316 static kern_return_t    vm_map_copy_overwrite_aligned(
317 	vm_map_t        dst_map,
318 	vm_map_entry_t  tmp_entry,
319 	vm_map_copy_t   copy,
320 	vm_map_offset_t start,
321 	pmap_t          pmap);
322 
323 static kern_return_t    vm_map_copyin_kernel_buffer(
324 	vm_map_t        src_map,
325 	vm_map_address_t src_addr,
326 	vm_map_size_t   len,
327 	boolean_t       src_destroy,
328 	vm_map_copy_t   *copy_result);  /* OUT */
329 
330 static kern_return_t    vm_map_copyout_kernel_buffer(
331 	vm_map_t        map,
332 	vm_map_address_t *addr, /* IN/OUT */
333 	vm_map_copy_t   copy,
334 	vm_map_size_t   copy_size,
335 	boolean_t       overwrite,
336 	boolean_t       consume_on_success);
337 
338 static void             vm_map_fork_share(
339 	vm_map_t        old_map,
340 	vm_map_entry_t  old_entry,
341 	vm_map_t        new_map);
342 
343 static boolean_t        vm_map_fork_copy(
344 	vm_map_t        old_map,
345 	vm_map_entry_t  *old_entry_p,
346 	vm_map_t        new_map,
347 	int             vm_map_copyin_flags);
348 
349 static kern_return_t    vm_map_wire_nested(
350 	vm_map_t                   map,
351 	vm_map_offset_t            start,
352 	vm_map_offset_t            end,
353 	vm_prot_t                  caller_prot,
354 	vm_tag_t                   tag,
355 	boolean_t                  user_wire,
356 	pmap_t                     map_pmap,
357 	vm_map_offset_t            pmap_addr,
358 	ppnum_t                   *physpage_p);
359 
360 static kern_return_t    vm_map_unwire_nested(
361 	vm_map_t                   map,
362 	vm_map_offset_t            start,
363 	vm_map_offset_t            end,
364 	boolean_t                  user_wire,
365 	pmap_t                     map_pmap,
366 	vm_map_offset_t            pmap_addr);
367 
368 static kern_return_t    vm_map_overwrite_submap_recurse(
369 	vm_map_t                   dst_map,
370 	vm_map_offset_t            dst_addr,
371 	vm_map_size_t              dst_size);
372 
373 static kern_return_t    vm_map_copy_overwrite_nested(
374 	vm_map_t                   dst_map,
375 	vm_map_offset_t            dst_addr,
376 	vm_map_copy_t              copy,
377 	boolean_t                  interruptible,
378 	pmap_t                     pmap,
379 	boolean_t                  discard_on_success);
380 
381 static kern_return_t    vm_map_remap_extract(
382 	vm_map_t                map,
383 	vm_map_offset_t         addr,
384 	vm_map_size_t           size,
385 	boolean_t               copy,
386 	vm_map_copy_t           map_copy,
387 	vm_prot_t               *cur_protection,
388 	vm_prot_t               *max_protection,
389 	vm_inherit_t            inheritance,
390 	vm_map_kernel_flags_t   vmk_flags);
391 
392 static void             vm_map_region_look_for_page(
393 	vm_map_t                   map,
394 	vm_map_offset_t            va,
395 	vm_object_t                object,
396 	vm_object_offset_t         offset,
397 	int                        max_refcnt,
398 	unsigned short             depth,
399 	vm_region_extended_info_t  extended,
400 	mach_msg_type_number_t count);
401 
402 static boolean_t        vm_map_region_has_obj_ref(
403 	vm_map_entry_t             entry,
404 	vm_object_t                object);
405 
406 
407 static kern_return_t    vm_map_willneed(
408 	vm_map_t        map,
409 	vm_map_offset_t start,
410 	vm_map_offset_t end);
411 
412 static kern_return_t    vm_map_reuse_pages(
413 	vm_map_t        map,
414 	vm_map_offset_t start,
415 	vm_map_offset_t end);
416 
417 static kern_return_t    vm_map_reusable_pages(
418 	vm_map_t        map,
419 	vm_map_offset_t start,
420 	vm_map_offset_t end);
421 
422 static kern_return_t    vm_map_can_reuse(
423 	vm_map_t        map,
424 	vm_map_offset_t start,
425 	vm_map_offset_t end);
426 
427 static kern_return_t    vm_map_zero(
428 	vm_map_t        map,
429 	vm_map_offset_t start,
430 	vm_map_offset_t end);
431 
432 static kern_return_t    vm_map_random_address_for_size(
433 	vm_map_t                map,
434 	vm_map_offset_t        *address,
435 	vm_map_size_t           size,
436 	vm_map_kernel_flags_t   vmk_flags);
437 
438 
439 #if CONFIG_MAP_RANGES
440 
441 static vm_map_range_id_t vm_map_user_range_resolve(
442 	vm_map_t                map,
443 	mach_vm_address_t       addr,
444 	mach_vm_address_t       size,
445 	mach_vm_range_t         range);
446 
447 #endif /* CONFIG_MAP_RANGES */
448 #if MACH_ASSERT
449 static kern_return_t    vm_map_pageout(
450 	vm_map_t        map,
451 	vm_map_offset_t start,
452 	vm_map_offset_t end);
453 #endif /* MACH_ASSERT */
454 
455 kern_return_t vm_map_corpse_footprint_collect(
456 	vm_map_t        old_map,
457 	vm_map_entry_t  old_entry,
458 	vm_map_t        new_map);
459 void vm_map_corpse_footprint_collect_done(
460 	vm_map_t        new_map);
461 void vm_map_corpse_footprint_destroy(
462 	vm_map_t        map);
463 kern_return_t vm_map_corpse_footprint_query_page_info(
464 	vm_map_t        map,
465 	vm_map_offset_t va,
466 	int             *disposition_p);
467 void vm_map_footprint_query_page_info(
468 	vm_map_t        map,
469 	vm_map_entry_t  map_entry,
470 	vm_map_offset_t curr_s_offset,
471 	int             *disposition_p);
472 
473 #if CONFIG_MAP_RANGES
474 static void vm_map_range_map_init(void);
475 #endif /* CONFIG_MAP_RANGES */
476 
477 pid_t find_largest_process_vm_map_entries(void);
478 
479 __attribute__((always_inline))
480 int
vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)481 vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)
482 {
483 	int flags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
484 
485 	/* in vmk flags the meaning of fixed/anywhere is inverted */
486 	return flags ^ (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
487 }
488 
489 __attribute__((always_inline, overloadable))
490 void
vm_map_kernel_flags_set_vmflags(vm_map_kernel_flags_t * vmk_flags,int vm_flags,vm_tag_t vm_tag)491 vm_map_kernel_flags_set_vmflags(
492 	vm_map_kernel_flags_t  *vmk_flags,
493 	int                     vm_flags,
494 	vm_tag_t                vm_tag)
495 {
496 	vm_flags ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
497 	vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
498 	vmk_flags->__vm_flags |= (vm_flags & VM_FLAGS_ANY_MASK);
499 	vmk_flags->vm_tag = vm_tag;
500 }
501 
502 __attribute__((always_inline, overloadable))
503 void
vm_map_kernel_flags_set_vmflags(vm_map_kernel_flags_t * vmk_flags,int vm_flags_and_tag)504 vm_map_kernel_flags_set_vmflags(
505 	vm_map_kernel_flags_t  *vmk_flags,
506 	int                     vm_flags_and_tag)
507 {
508 	vm_flags_and_tag ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
509 	vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
510 	vmk_flags->__vm_flags |= (vm_flags_and_tag & VM_FLAGS_ANY_MASK);
511 	VM_GET_FLAGS_ALIAS(vm_flags_and_tag, vmk_flags->vm_tag);
512 }
513 
514 __attribute__((always_inline))
515 void
vm_map_kernel_flags_and_vmflags(vm_map_kernel_flags_t * vmk_flags,int vm_flags_mask)516 vm_map_kernel_flags_and_vmflags(
517 	vm_map_kernel_flags_t  *vmk_flags,
518 	int                     vm_flags_mask)
519 {
520 	/* this function doesn't handle the inverted FIXED/ANYWHERE */
521 	assert(vm_flags_mask & VM_FLAGS_ANYWHERE);
522 	vmk_flags->__vm_flags &= vm_flags_mask;
523 }
524 
525 __attribute__((always_inline))
526 bool
vm_map_kernel_flags_check_vm_and_kflags(vm_map_kernel_flags_t vmk_flags,int vm_flags_mask)527 vm_map_kernel_flags_check_vm_and_kflags(
528 	vm_map_kernel_flags_t   vmk_flags,
529 	int                     vm_flags_mask)
530 {
531 	return (vmk_flags.__vm_flags & ~vm_flags_mask) == 0;
532 }
533 
534 bool
vm_map_kernel_flags_check_vmflags(vm_map_kernel_flags_t vmk_flags,int vm_flags_mask)535 vm_map_kernel_flags_check_vmflags(
536 	vm_map_kernel_flags_t   vmk_flags,
537 	int                     vm_flags_mask)
538 {
539 	int vmflags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
540 
541 	/* Note: up to 16 still has good calling conventions */
542 	static_assert(sizeof(vm_map_kernel_flags_t) == 16);
543 
544 #if DEBUG || DEVELOPMENT
545 	/*
546 	 * All of this compiles to nothing if all checks pass.
547 	 */
548 #define check(field, value)  ({ \
549 	vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE; \
550 	fl.__vm_flags = (value); \
551 	fl.field = 0; \
552 	assert(fl.__vm_flags == 0); \
553 })
554 
555 	/* bits 0-7 */
556 	check(vmf_fixed, VM_FLAGS_ANYWHERE); // kind of a lie this is inverted
557 	check(vmf_purgeable, VM_FLAGS_PURGABLE);
558 	check(vmf_4gb_chunk, VM_FLAGS_4GB_CHUNK);
559 	check(vmf_random_addr, VM_FLAGS_RANDOM_ADDR);
560 	check(vmf_no_cache, VM_FLAGS_NO_CACHE);
561 	check(vmf_resilient_codesign, VM_FLAGS_RESILIENT_CODESIGN);
562 	check(vmf_resilient_media, VM_FLAGS_RESILIENT_MEDIA);
563 	check(vmf_permanent, VM_FLAGS_PERMANENT);
564 
565 	/* bits 8-15 */
566 	check(vmf_tpro, VM_FLAGS_TPRO);
567 	check(vmf_overwrite, VM_FLAGS_OVERWRITE);
568 
569 	/* bits 16-23 */
570 	check(vmf_superpage_size, VM_FLAGS_SUPERPAGE_MASK);
571 	check(vmf_return_data_addr, VM_FLAGS_RETURN_DATA_ADDR);
572 	check(vmf_return_4k_data_addr, VM_FLAGS_RETURN_4K_DATA_ADDR);
573 
574 	{
575 		vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE;
576 
577 		/* check user tags will never clip */
578 		fl.vm_tag = VM_MEMORY_COUNT - 1;
579 		assert(fl.vm_tag == VM_MEMORY_COUNT - 1);
580 
581 		/* check kernel tags will never clip */
582 		fl.vm_tag = VM_MAX_TAG_VALUE - 1;
583 		assert(fl.vm_tag == VM_MAX_TAG_VALUE - 1);
584 	}
585 
586 
587 #undef check
588 #endif /* DEBUG || DEVELOPMENT */
589 
590 	return (vmflags & ~vm_flags_mask) == 0;
591 }
592 
593 /*
594  * Macros to copy a vm_map_entry. We must be careful to correctly
595  * manage the wired page count. vm_map_entry_copy() creates a new
596  * map entry to the same memory - the wired count in the new entry
597  * must be set to zero. vm_map_entry_copy_full() creates a new
598  * entry that is identical to the old entry.  This preserves the
599  * wire count; it's used for map splitting and zone changing in
600  * vm_map_copyout.
601  */
602 
603 static inline void
vm_map_entry_copy_csm_assoc(vm_map_t map __unused,vm_map_entry_t new __unused,vm_map_entry_t old __unused)604 vm_map_entry_copy_csm_assoc(
605 	vm_map_t map __unused,
606 	vm_map_entry_t new __unused,
607 	vm_map_entry_t old __unused)
608 {
609 #if CODE_SIGNING_MONITOR
610 	/* when code signing monitor is enabled, we want to reset on copy */
611 	new->csm_associated = FALSE;
612 #else
613 	/* when code signing monitor is not enabled, assert as a sanity check */
614 	assert(new->csm_associated == FALSE);
615 #endif
616 #if DEVELOPMENT || DEBUG
617 	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
618 		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug\n",
619 		    proc_selfpid(),
620 		    (get_bsdtask_info(current_task())
621 		    ? proc_name_address(get_bsdtask_info(current_task()))
622 		    : "?"),
623 		    __FUNCTION__, __LINE__,
624 		    map, new, new->vme_start, new->vme_end);
625 	}
626 #endif /* DEVELOPMENT || DEBUG */
627 #if XNU_TARGET_OS_OSX
628 	/*
629 	 * On macOS, entries with "vme_xnu_user_debug" can be copied during fork()
630 	 * and we want the child's entry to keep its "vme_xnu_user_debug" to avoid
631 	 * trigggering CSM assertions when the child accesses its mapping.
632 	 */
633 #else /* XNU_TARGET_OS_OSX */
634 	new->vme_xnu_user_debug = FALSE;
635 #endif /* XNU_TARGET_OS_OSX */
636 }
637 
638 /*
639  * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
640  * But for security reasons on some platforms, we don't want the
641  * new mapping to be "used for jit", so we reset the flag here.
642  */
643 static inline void
vm_map_entry_copy_code_signing(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old __unused)644 vm_map_entry_copy_code_signing(
645 	vm_map_t map,
646 	vm_map_entry_t new,
647 	vm_map_entry_t old __unused)
648 {
649 	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
650 		assert(new->used_for_jit == old->used_for_jit);
651 	} else {
652 		if (old->used_for_jit) {
653 			DTRACE_VM3(cs_wx,
654 			    uint64_t, new->vme_start,
655 			    uint64_t, new->vme_end,
656 			    vm_prot_t, new->protection);
657 			printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
658 			    proc_selfpid(),
659 			    (get_bsdtask_info(current_task())
660 			    ? proc_name_address(get_bsdtask_info(current_task()))
661 			    : "?"),
662 			    __FUNCTION__,
663 			    "removing execute access");
664 			new->protection &= ~VM_PROT_EXECUTE;
665 			new->max_protection &= ~VM_PROT_EXECUTE;
666 		}
667 		new->used_for_jit = FALSE;
668 	}
669 }
670 
671 static inline void
vm_map_entry_copy_full(vm_map_entry_t new,vm_map_entry_t old)672 vm_map_entry_copy_full(
673 	vm_map_entry_t new,
674 	vm_map_entry_t old)
675 {
676 #if MAP_ENTRY_CREATION_DEBUG
677 	btref_put(new->vme_creation_bt);
678 	btref_retain(old->vme_creation_bt);
679 #endif
680 #if MAP_ENTRY_INSERTION_DEBUG
681 	btref_put(new->vme_insertion_bt);
682 	btref_retain(old->vme_insertion_bt);
683 #endif
684 #if VM_BTLOG_TAGS
685 	/* Discard the btref that might be in the new entry */
686 	if (new->vme_kernel_object) {
687 		btref_put(new->vme_tag_btref);
688 	}
689 	/* Retain the btref in the old entry to account for its copy */
690 	if (old->vme_kernel_object) {
691 		btref_retain(old->vme_tag_btref);
692 	}
693 #endif /* VM_BTLOG_TAGS */
694 	*new = *old;
695 }
696 
697 static inline void
vm_map_entry_copy(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old)698 vm_map_entry_copy(
699 	vm_map_t map,
700 	vm_map_entry_t new,
701 	vm_map_entry_t old)
702 {
703 	vm_map_entry_copy_full(new, old);
704 
705 	new->is_shared = FALSE;
706 	new->needs_wakeup = FALSE;
707 	new->in_transition = FALSE;
708 	new->wired_count = 0;
709 	new->user_wired_count = 0;
710 	new->vme_permanent = FALSE;
711 	vm_map_entry_copy_code_signing(map, new, old);
712 	vm_map_entry_copy_csm_assoc(map, new, old);
713 	if (new->iokit_acct) {
714 		assertf(!new->use_pmap, "old %p new %p\n", old, new);
715 		new->iokit_acct = FALSE;
716 		new->use_pmap = TRUE;
717 	}
718 	new->vme_resilient_codesign = FALSE;
719 	new->vme_resilient_media = FALSE;
720 	new->vme_atomic = FALSE;
721 	new->vme_no_copy_on_read = FALSE;
722 }
723 
724 /*
725  * Normal lock_read_to_write() returns FALSE/0 on failure.
726  * These functions evaluate to zero on success and non-zero value on failure.
727  */
728 __attribute__((always_inline))
729 int
vm_map_lock_read_to_write(vm_map_t map)730 vm_map_lock_read_to_write(vm_map_t map)
731 {
732 	if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
733 		DTRACE_VM(vm_map_lock_upgrade);
734 		return 0;
735 	}
736 	return 1;
737 }
738 
739 __attribute__((always_inline))
740 boolean_t
vm_map_try_lock(vm_map_t map)741 vm_map_try_lock(vm_map_t map)
742 {
743 	if (lck_rw_try_lock_exclusive(&(map)->lock)) {
744 		DTRACE_VM(vm_map_lock_w);
745 		return TRUE;
746 	}
747 	return FALSE;
748 }
749 
750 __attribute__((always_inline))
751 boolean_t
vm_map_try_lock_read(vm_map_t map)752 vm_map_try_lock_read(vm_map_t map)
753 {
754 	if (lck_rw_try_lock_shared(&(map)->lock)) {
755 		DTRACE_VM(vm_map_lock_r);
756 		return TRUE;
757 	}
758 	return FALSE;
759 }
760 
761 /*!
762  * @function kdp_vm_map_is_acquired_exclusive
763  *
764  * @abstract
765  * Checks if vm map is acquired exclusive.
766  *
767  * @discussion
768  * NOT SAFE: To be used only by kernel debugger.
769  *
770  * @param map map to check
771  *
772  * @returns TRUE if the map is acquired exclusively.
773  */
774 boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)775 kdp_vm_map_is_acquired_exclusive(vm_map_t map)
776 {
777 	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
778 }
779 
780 /*
781  * Routines to get the page size the caller should
782  * use while inspecting the target address space.
783  * Use the "_safely" variant if the caller is dealing with a user-provided
784  * array whose size depends on the page size, to avoid any overflow or
785  * underflow of a user-allocated buffer.
786  */
787 int
vm_self_region_page_shift_safely(vm_map_t target_map)788 vm_self_region_page_shift_safely(
789 	vm_map_t target_map)
790 {
791 	int effective_page_shift = 0;
792 
793 	if (PAGE_SIZE == (4096)) {
794 		/* x86_64 and 4k watches: always use 4k */
795 		return PAGE_SHIFT;
796 	}
797 	/* did caller provide an explicit page size for this thread to use? */
798 	effective_page_shift = thread_self_region_page_shift();
799 	if (effective_page_shift) {
800 		/* use the explicitly-provided page size */
801 		return effective_page_shift;
802 	}
803 	/* no explicit page size: use the caller's page size... */
804 	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
805 	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
806 		/* page size match: safe to use */
807 		return effective_page_shift;
808 	}
809 	/* page size mismatch */
810 	return -1;
811 }
812 int
vm_self_region_page_shift(vm_map_t target_map)813 vm_self_region_page_shift(
814 	vm_map_t target_map)
815 {
816 	int effective_page_shift;
817 
818 	effective_page_shift = vm_self_region_page_shift_safely(target_map);
819 	if (effective_page_shift == -1) {
820 		/* no safe value but OK to guess for caller */
821 		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
822 		    VM_MAP_PAGE_SHIFT(target_map));
823 	}
824 	return effective_page_shift;
825 }
826 
827 
828 /*
829  *	Decide if we want to allow processes to execute from their data or stack areas.
830  *	override_nx() returns true if we do.  Data/stack execution can be enabled independently
831  *	for 32 and 64 bit processes.  Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
832  *	or allow_stack_exec to enable data execution for that type of data area for that particular
833  *	ABI (or both by or'ing the flags together).  These are initialized in the architecture
834  *	specific pmap files since the default behavior varies according to architecture.  The
835  *	main reason it varies is because of the need to provide binary compatibility with old
836  *	applications that were written before these restrictions came into being.  In the old
837  *	days, an app could execute anything it could read, but this has slowly been tightened
838  *	up over time.  The default behavior is:
839  *
840  *	32-bit PPC apps		may execute from both stack and data areas
841  *	32-bit Intel apps	may exeucte from data areas but not stack
842  *	64-bit PPC/Intel apps	may not execute from either data or stack
843  *
844  *	An application on any architecture may override these defaults by explicitly
845  *	adding PROT_EXEC permission to the page in question with the mprotect(2)
846  *	system call.  This code here just determines what happens when an app tries to
847  *      execute from a page that lacks execute permission.
848  *
849  *	Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
850  *	default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
851  *	a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
852  *	execution from data areas for a particular binary even if the arch normally permits it. As
853  *	a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
854  *	to support some complicated use cases, notably browsers with out-of-process plugins that
855  *	are not all NX-safe.
856  */
857 
858 extern int allow_data_exec, allow_stack_exec;
859 
860 int
override_nx(vm_map_t map,uint32_t user_tag)861 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
862 {
863 	int current_abi;
864 
865 	if (map->pmap == kernel_pmap) {
866 		return FALSE;
867 	}
868 
869 	/*
870 	 * Determine if the app is running in 32 or 64 bit mode.
871 	 */
872 
873 	if (vm_map_is_64bit(map)) {
874 		current_abi = VM_ABI_64;
875 	} else {
876 		current_abi = VM_ABI_32;
877 	}
878 
879 	/*
880 	 * Determine if we should allow the execution based on whether it's a
881 	 * stack or data area and the current architecture.
882 	 */
883 
884 	if (user_tag == VM_MEMORY_STACK) {
885 		return allow_stack_exec & current_abi;
886 	}
887 
888 	return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
889 }
890 
891 
892 /*
893  *	Virtual memory maps provide for the mapping, protection,
894  *	and sharing of virtual memory objects.  In addition,
895  *	this module provides for an efficient virtual copy of
896  *	memory from one map to another.
897  *
898  *	Synchronization is required prior to most operations.
899  *
900  *	Maps consist of an ordered doubly-linked list of simple
901  *	entries; a single hint is used to speed up lookups.
902  *
903  *	Sharing maps have been deleted from this version of Mach.
904  *	All shared objects are now mapped directly into the respective
905  *	maps.  This requires a change in the copy on write strategy;
906  *	the asymmetric (delayed) strategy is used for shared temporary
907  *	objects instead of the symmetric (shadow) strategy.  All maps
908  *	are now "top level" maps (either task map, kernel map or submap
909  *	of the kernel map).
910  *
911  *	Since portions of maps are specified by start/end addreses,
912  *	which may not align with existing map entries, all
913  *	routines merely "clip" entries to these start/end values.
914  *	[That is, an entry is split into two, bordering at a
915  *	start or end value.]  Note that these clippings may not
916  *	always be necessary (as the two resulting entries are then
917  *	not changed); however, the clipping is done for convenience.
918  *	No attempt is currently made to "glue back together" two
919  *	abutting entries.
920  *
921  *	The symmetric (shadow) copy strategy implements virtual copy
922  *	by copying VM object references from one map to
923  *	another, and then marking both regions as copy-on-write.
924  *	It is important to note that only one writeable reference
925  *	to a VM object region exists in any map when this strategy
926  *	is used -- this means that shadow object creation can be
927  *	delayed until a write operation occurs.  The symmetric (delayed)
928  *	strategy allows multiple maps to have writeable references to
929  *	the same region of a vm object, and hence cannot delay creating
930  *	its copy objects.  See vm_object_copy_quickly() in vm_object.c.
931  *	Copying of permanent objects is completely different; see
932  *	vm_object_copy_strategically() in vm_object.c.
933  */
934 
935 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
936 
937 #define VM_MAP_ZONE_NAME        "maps"
938 #define VM_MAP_ZFLAGS           (ZC_NOENCRYPT | ZC_VM)
939 
940 #define VM_MAP_ENTRY_ZONE_NAME  "VM map entries"
941 #define VM_MAP_ENTRY_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
942 
943 #define VM_MAP_HOLES_ZONE_NAME  "VM map holes"
944 #define VM_MAP_HOLES_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
945 
946 /*
947  * Asserts that a vm_map_copy object is coming from the
948  * vm_map_copy_zone to ensure that it isn't a fake constructed
949  * anywhere else.
950  */
951 void
vm_map_copy_require(struct vm_map_copy * copy)952 vm_map_copy_require(struct vm_map_copy *copy)
953 {
954 	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
955 }
956 
957 /*
958  *	vm_map_require:
959  *
960  *	Ensures that the argument is memory allocated from the genuine
961  *	vm map zone. (See zone_id_require_allow_foreign).
962  */
963 void
vm_map_require(vm_map_t map)964 vm_map_require(vm_map_t map)
965 {
966 	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
967 }
968 
969 #define VM_MAP_EARLY_COUNT_MAX         16
970 static __startup_data vm_offset_t      map_data;
971 static __startup_data vm_size_t        map_data_size;
972 static __startup_data vm_offset_t      kentry_data;
973 static __startup_data vm_size_t        kentry_data_size;
974 static __startup_data vm_offset_t      map_holes_data;
975 static __startup_data vm_size_t        map_holes_data_size;
976 static __startup_data vm_map_t        *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
977 static __startup_data uint32_t         early_map_count;
978 
979 #if XNU_TARGET_OS_OSX
980 #define         NO_COALESCE_LIMIT  ((1024 * 128) - 1)
981 #else /* XNU_TARGET_OS_OSX */
982 #define         NO_COALESCE_LIMIT  0
983 #endif /* XNU_TARGET_OS_OSX */
984 
985 /* Skip acquiring locks if we're in the midst of a kernel core dump */
986 unsigned int not_in_kdp = 1;
987 
988 unsigned int vm_map_set_cache_attr_count = 0;
989 
990 kern_return_t
vm_map_set_cache_attr(vm_map_t map,vm_map_offset_t va)991 vm_map_set_cache_attr(
992 	vm_map_t        map,
993 	vm_map_offset_t va)
994 {
995 	vm_map_entry_t  map_entry;
996 	vm_object_t     object;
997 	kern_return_t   kr = KERN_SUCCESS;
998 
999 	vm_map_lock_read(map);
1000 
1001 	if (!vm_map_lookup_entry(map, va, &map_entry) ||
1002 	    map_entry->is_sub_map) {
1003 		/*
1004 		 * that memory is not properly mapped
1005 		 */
1006 		kr = KERN_INVALID_ARGUMENT;
1007 		goto done;
1008 	}
1009 	object = VME_OBJECT(map_entry);
1010 
1011 	if (object == VM_OBJECT_NULL) {
1012 		/*
1013 		 * there should be a VM object here at this point
1014 		 */
1015 		kr = KERN_INVALID_ARGUMENT;
1016 		goto done;
1017 	}
1018 	vm_object_lock(object);
1019 	object->set_cache_attr = TRUE;
1020 	vm_object_unlock(object);
1021 
1022 	vm_map_set_cache_attr_count++;
1023 done:
1024 	vm_map_unlock_read(map);
1025 
1026 	return kr;
1027 }
1028 
1029 
1030 #if CONFIG_CODE_DECRYPTION
1031 /*
1032  * vm_map_apple_protected:
1033  * This remaps the requested part of the object with an object backed by
1034  * the decrypting pager.
1035  * crypt_info contains entry points and session data for the crypt module.
1036  * The crypt_info block will be copied by vm_map_apple_protected. The data structures
1037  * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
1038  */
1039 kern_return_t
vm_map_apple_protected(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_object_offset_t crypto_backing_offset,struct pager_crypt_info * crypt_info,uint32_t cryptid)1040 vm_map_apple_protected(
1041 	vm_map_t                map,
1042 	vm_map_offset_t         start,
1043 	vm_map_offset_t         end,
1044 	vm_object_offset_t      crypto_backing_offset,
1045 	struct pager_crypt_info *crypt_info,
1046 	uint32_t                cryptid)
1047 {
1048 	boolean_t       map_locked;
1049 	kern_return_t   kr;
1050 	vm_map_entry_t  map_entry;
1051 	struct vm_map_entry tmp_entry;
1052 	memory_object_t unprotected_mem_obj;
1053 	vm_object_t     protected_object;
1054 	vm_map_offset_t map_addr;
1055 	vm_map_offset_t start_aligned, end_aligned;
1056 	vm_object_offset_t      crypto_start, crypto_end;
1057 	boolean_t       cache_pager;
1058 
1059 	map_locked = FALSE;
1060 	unprotected_mem_obj = MEMORY_OBJECT_NULL;
1061 
1062 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
1063 		return KERN_INVALID_ADDRESS;
1064 	}
1065 	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
1066 	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
1067 	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
1068 	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
1069 
1070 #if __arm64__
1071 	/*
1072 	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
1073 	 * so we might have to loop and establish up to 3 mappings:
1074 	 *
1075 	 * + the first 16K-page, which might overlap with the previous
1076 	 *   4K-aligned mapping,
1077 	 * + the center,
1078 	 * + the last 16K-page, which might overlap with the next
1079 	 *   4K-aligned mapping.
1080 	 * Each of these mapping might be backed by a vnode pager (if
1081 	 * properly page-aligned) or a "fourk_pager", itself backed by a
1082 	 * vnode pager (if 4K-aligned but not page-aligned).
1083 	 */
1084 #endif /* __arm64__ */
1085 
1086 	map_addr = start_aligned;
1087 	for (map_addr = start_aligned;
1088 	    map_addr < end;
1089 	    map_addr = tmp_entry.vme_end) {
1090 		vm_map_lock(map);
1091 		map_locked = TRUE;
1092 
1093 		/* lookup the protected VM object */
1094 		if (!vm_map_lookup_entry(map,
1095 		    map_addr,
1096 		    &map_entry) ||
1097 		    map_entry->is_sub_map ||
1098 		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
1099 			/* that memory is not properly mapped */
1100 			kr = KERN_INVALID_ARGUMENT;
1101 			goto done;
1102 		}
1103 
1104 		/* ensure mapped memory is mapped as executable except
1105 		 *  except for model decryption flow */
1106 		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
1107 		    !(map_entry->protection & VM_PROT_EXECUTE)) {
1108 			kr = KERN_INVALID_ARGUMENT;
1109 			goto done;
1110 		}
1111 
1112 		/* get the protected object to be decrypted */
1113 		protected_object = VME_OBJECT(map_entry);
1114 		if (protected_object == VM_OBJECT_NULL) {
1115 			/* there should be a VM object here at this point */
1116 			kr = KERN_INVALID_ARGUMENT;
1117 			goto done;
1118 		}
1119 		/* ensure protected object stays alive while map is unlocked */
1120 		vm_object_reference(protected_object);
1121 
1122 		/* limit the map entry to the area we want to cover */
1123 		vm_map_clip_start(map, map_entry, start_aligned);
1124 		vm_map_clip_end(map, map_entry, end_aligned);
1125 
1126 		tmp_entry = *map_entry;
1127 		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
1128 		vm_map_unlock(map);
1129 		map_locked = FALSE;
1130 
1131 		/*
1132 		 * This map entry might be only partially encrypted
1133 		 * (if not fully "page-aligned").
1134 		 */
1135 		crypto_start = 0;
1136 		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
1137 		if (tmp_entry.vme_start < start) {
1138 			if (tmp_entry.vme_start != start_aligned) {
1139 				kr = KERN_INVALID_ADDRESS;
1140 				vm_object_deallocate(protected_object);
1141 				goto done;
1142 			}
1143 			crypto_start += (start - tmp_entry.vme_start);
1144 		}
1145 		if (tmp_entry.vme_end > end) {
1146 			if (tmp_entry.vme_end != end_aligned) {
1147 				kr = KERN_INVALID_ADDRESS;
1148 				vm_object_deallocate(protected_object);
1149 				goto done;
1150 			}
1151 			crypto_end -= (tmp_entry.vme_end - end);
1152 		}
1153 
1154 		/*
1155 		 * This "extra backing offset" is needed to get the decryption
1156 		 * routine to use the right key.  It adjusts for the possibly
1157 		 * relative offset of an interposed "4K" pager...
1158 		 */
1159 		if (crypto_backing_offset == (vm_object_offset_t) -1) {
1160 			crypto_backing_offset = VME_OFFSET(&tmp_entry);
1161 		}
1162 
1163 		cache_pager = TRUE;
1164 #if XNU_TARGET_OS_OSX
1165 		if (vm_map_is_alien(map)) {
1166 			cache_pager = FALSE;
1167 		}
1168 #endif /* XNU_TARGET_OS_OSX */
1169 
1170 		/*
1171 		 * Lookup (and create if necessary) the protected memory object
1172 		 * matching that VM object.
1173 		 * If successful, this also grabs a reference on the memory object,
1174 		 * to guarantee that it doesn't go away before we get a chance to map
1175 		 * it.
1176 		 */
1177 		unprotected_mem_obj = apple_protect_pager_setup(
1178 			protected_object,
1179 			VME_OFFSET(&tmp_entry),
1180 			crypto_backing_offset,
1181 			crypt_info,
1182 			crypto_start,
1183 			crypto_end,
1184 			cache_pager);
1185 
1186 		/* release extra ref on protected object */
1187 		vm_object_deallocate(protected_object);
1188 
1189 		if (unprotected_mem_obj == NULL) {
1190 			kr = KERN_FAILURE;
1191 			goto done;
1192 		}
1193 
1194 		/* can overwrite an immutable mapping */
1195 		vm_map_kernel_flags_t vmk_flags = {
1196 			.vmf_fixed = true,
1197 			.vmf_overwrite = true,
1198 			.vmkf_overwrite_immutable = true,
1199 		};
1200 		/* make the new mapping as "permanent" as the one it replaces */
1201 		vmk_flags.vmf_permanent = tmp_entry.vme_permanent;
1202 
1203 		/* map this memory object in place of the current one */
1204 		map_addr = tmp_entry.vme_start;
1205 		kr = mach_vm_map_kernel(map,
1206 		    vm_sanitize_wrap_addr_ref(&map_addr),
1207 		    (tmp_entry.vme_end -
1208 		    tmp_entry.vme_start),
1209 		    (mach_vm_offset_t) 0,
1210 		    vmk_flags,
1211 		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1212 		    0,
1213 		    TRUE,
1214 		    tmp_entry.protection,
1215 		    tmp_entry.max_protection,
1216 		    tmp_entry.inheritance);
1217 		assertf(kr == KERN_SUCCESS,
1218 		    "kr = 0x%x\n", kr);
1219 		assertf(map_addr == tmp_entry.vme_start,
1220 		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1221 		    (uint64_t)map_addr,
1222 		    (uint64_t) tmp_entry.vme_start,
1223 		    &tmp_entry);
1224 
1225 #if VM_MAP_DEBUG_APPLE_PROTECT
1226 		if (vm_map_debug_apple_protect) {
1227 			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1228 			    " backing:[object:%p,offset:0x%llx,"
1229 			    "crypto_backing_offset:0x%llx,"
1230 			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1231 			    map,
1232 			    (uint64_t) map_addr,
1233 			    (uint64_t) (map_addr + (tmp_entry.vme_end -
1234 			    tmp_entry.vme_start)),
1235 			    unprotected_mem_obj,
1236 			    protected_object,
1237 			    VME_OFFSET(&tmp_entry),
1238 			    crypto_backing_offset,
1239 			    crypto_start,
1240 			    crypto_end);
1241 		}
1242 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1243 
1244 		/*
1245 		 * Release the reference obtained by
1246 		 * apple_protect_pager_setup().
1247 		 * The mapping (if it succeeded) is now holding a reference on
1248 		 * the memory object.
1249 		 */
1250 		memory_object_deallocate(unprotected_mem_obj);
1251 		unprotected_mem_obj = MEMORY_OBJECT_NULL;
1252 
1253 		/* continue with next map entry */
1254 		crypto_backing_offset += (tmp_entry.vme_end -
1255 		    tmp_entry.vme_start);
1256 		crypto_backing_offset -= crypto_start;
1257 	}
1258 	kr = KERN_SUCCESS;
1259 
1260 done:
1261 	if (map_locked) {
1262 		vm_map_unlock(map);
1263 	}
1264 	return kr;
1265 }
1266 #endif  /* CONFIG_CODE_DECRYPTION */
1267 
1268 
1269 LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
1270 LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
1271 LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);
1272 
1273 #if XNU_TARGET_OS_OSX
1274 #define MALLOC_NO_COW_DEFAULT 1
1275 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
1276 #else /* XNU_TARGET_OS_OSX */
1277 #define MALLOC_NO_COW_DEFAULT 1
1278 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
1279 #endif /* XNU_TARGET_OS_OSX */
1280 TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
1281 TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
1282 uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
1283 #if DEBUG
1284 int vm_check_map_sanity = 0;
1285 #endif
1286 
1287 /*
1288  *	vm_map_init:
1289  *
1290  *	Initialize the vm_map module.  Must be called before
1291  *	any other vm_map routines.
1292  *
1293  *	Map and entry structures are allocated from zones -- we must
1294  *	initialize those zones.
1295  *
1296  *	There are three zones of interest:
1297  *
1298  *	vm_map_zone:		used to allocate maps.
1299  *	vm_map_entry_zone:	used to allocate map entries.
1300  *
1301  *	LP32:
1302  *	vm_map_entry_reserved_zone:     fallback zone for kernel map entries
1303  *
1304  *	The kernel allocates map entries from a special zone that is initially
1305  *	"crammed" with memory.  It would be difficult (perhaps impossible) for
1306  *	the kernel to allocate more memory to a entry zone when it became
1307  *	empty since the very act of allocating memory implies the creation
1308  *	of a new entry.
1309  */
1310 __startup_func
1311 void
vm_map_init(void)1312 vm_map_init(void)
1313 {
1314 
1315 #if MACH_ASSERT
1316 	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1317 	    sizeof(debug4k_filter));
1318 #endif /* MACH_ASSERT */
1319 
1320 	zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1321 	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1322 
1323 	/*
1324 	 * Don't quarantine because we always need elements available
1325 	 * Disallow GC on this zone... to aid the GC.
1326 	 */
1327 	zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1328 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1329 	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1330 		z->z_elems_rsv = (uint16_t)(32 *
1331 		(ml_early_cpu_max_number() + 1));
1332 	});
1333 
1334 	zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1335 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1336 	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1337 		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1338 	});
1339 
1340 	zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1341 	    ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1342 
1343 	/*
1344 	 * Add the stolen memory to zones, adjust zone size and stolen counts.
1345 	 */
1346 	zone_cram_early(vm_map_zone, map_data, map_data_size);
1347 	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1348 	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1349 	printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1350 	    zone_count_free(vm_map_zone),
1351 	    zone_count_free(vm_map_entry_zone),
1352 	    zone_count_free(vm_map_holes_zone));
1353 
1354 	/*
1355 	 * Since these are covered by zones, remove them from stolen page accounting.
1356 	 */
1357 	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1358 
1359 #if VM_MAP_DEBUG_APPLE_PROTECT
1360 	PE_parse_boot_argn("vm_map_debug_apple_protect",
1361 	    &vm_map_debug_apple_protect,
1362 	    sizeof(vm_map_debug_apple_protect));
1363 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1364 #if VM_MAP_DEBUG_APPLE_FOURK
1365 	PE_parse_boot_argn("vm_map_debug_fourk",
1366 	    &vm_map_debug_fourk,
1367 	    sizeof(vm_map_debug_fourk));
1368 #endif /* VM_MAP_DEBUG_FOURK */
1369 
1370 	if (malloc_no_cow) {
1371 		vm_memory_malloc_no_cow_mask = 0ULL;
1372 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1373 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1374 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1375 #if XNU_TARGET_OS_OSX
1376 		/*
1377 		 * On macOS, keep copy-on-write for MALLOC_LARGE because
1378 		 * realloc() may use vm_copy() to transfer the old contents
1379 		 * to the new location.
1380 		 */
1381 #else /* XNU_TARGET_OS_OSX */
1382 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1383 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1384 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1385 #endif /* XNU_TARGET_OS_OSX */
1386 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1387 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1388 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1389 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1390 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1391 		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1392 		    &vm_memory_malloc_no_cow_mask,
1393 		    sizeof(vm_memory_malloc_no_cow_mask));
1394 	}
1395 
1396 #if CONFIG_MAP_RANGES
1397 	vm_map_range_map_init();
1398 #endif /* CONFIG_MAP_RANGES */
1399 
1400 #if DEBUG
1401 	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1402 	if (vm_check_map_sanity) {
1403 		kprintf("VM sanity checking enabled\n");
1404 	} else {
1405 		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1406 	}
1407 #endif /* DEBUG */
1408 
1409 #if DEVELOPMENT || DEBUG
1410 	PE_parse_boot_argn("panic_on_unsigned_execute",
1411 	    &panic_on_unsigned_execute,
1412 	    sizeof(panic_on_unsigned_execute));
1413 	PE_parse_boot_argn("panic_on_mlock_failure",
1414 	    &panic_on_mlock_failure,
1415 	    sizeof(panic_on_mlock_failure));
1416 #endif /* DEVELOPMENT || DEBUG */
1417 }
1418 
1419 __startup_func
1420 static void
vm_map_steal_memory(void)1421 vm_map_steal_memory(void)
1422 {
1423 
1424 	/*
1425 	 * We need to reserve enough memory to support boostraping VM maps
1426 	 * and the zone subsystem.
1427 	 *
1428 	 * The VM Maps that need to function before zones can support them
1429 	 * are the ones registered with vm_map_will_allocate_early_map(),
1430 	 * which are:
1431 	 * - the kernel map
1432 	 * - the various submaps used by zones (pgz, meta, ...)
1433 	 *
1434 	 * We also need enough entries and holes to support them
1435 	 * until zone_metadata_init() is called, which is when
1436 	 * the zone allocator becomes capable of expanding dynamically.
1437 	 *
1438 	 * We need:
1439 	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
1440 	 * - To allow for 3-4 entries per map, but the kernel map
1441 	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
1442 	 *   to describe the submaps, so double it (and make it 8x too)
1443 	 * - To allow for holes between entries,
1444 	 *   hence needs the same budget as entries
1445 	 */
1446 	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
1447 	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
1448 	    VM_MAP_EARLY_COUNT_MAX);
1449 
1450 	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
1451 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1452 	    8 * VM_MAP_EARLY_COUNT_MAX);
1453 
1454 	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
1455 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1456 	    8 * VM_MAP_EARLY_COUNT_MAX);
1457 
1458 	/*
1459 	 * Steal a contiguous range of memory so that a simple range check
1460 	 * can validate early addresses being freed/crammed to these
1461 	 * zones
1462 	 */
1463 	map_data       = zone_early_mem_init(map_data_size + kentry_data_size +
1464 	    map_holes_data_size);
1465 	kentry_data    = map_data + map_data_size;
1466 	map_holes_data = kentry_data + kentry_data_size;
1467 }
1468 STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1469 
1470 __startup_func
1471 static void
vm_kernel_boostraped(void)1472 vm_kernel_boostraped(void)
1473 {
1474 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1475 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1476 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1477 
1478 	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1479 	    zone_count_free(vm_map_zone),
1480 	    zone_count_free(vm_map_entry_zone),
1481 	    zone_count_free(vm_map_holes_zone));
1482 }
1483 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1484 
1485 void
vm_map_disable_hole_optimization(vm_map_t map)1486 vm_map_disable_hole_optimization(vm_map_t map)
1487 {
1488 	vm_map_entry_t  head_entry, hole_entry, next_hole_entry;
1489 
1490 	if (map->holelistenabled) {
1491 		head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1492 
1493 		while (hole_entry != NULL) {
1494 			next_hole_entry = hole_entry->vme_next;
1495 
1496 			hole_entry->vme_next = NULL;
1497 			hole_entry->vme_prev = NULL;
1498 			zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1499 
1500 			if (next_hole_entry == head_entry) {
1501 				hole_entry = NULL;
1502 			} else {
1503 				hole_entry = next_hole_entry;
1504 			}
1505 		}
1506 
1507 		map->holes_list = NULL;
1508 		map->holelistenabled = FALSE;
1509 
1510 		map->first_free = vm_map_first_entry(map);
1511 		SAVE_HINT_HOLE_WRITE(map, NULL);
1512 	}
1513 }
1514 
1515 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1516 vm_kernel_map_is_kernel(vm_map_t map)
1517 {
1518 	return map->pmap == kernel_pmap;
1519 }
1520 
1521 /*
1522  *	vm_map_create:
1523  *
1524  *	Creates and returns a new empty VM map with
1525  *	the given physical map structure, and having
1526  *	the given lower and upper address bounds.
1527  */
1528 
1529 extern vm_map_t vm_map_create_external(
1530 	pmap_t                  pmap,
1531 	vm_map_offset_t         min_off,
1532 	vm_map_offset_t         max_off,
1533 	boolean_t               pageable);
1534 
1535 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1536 vm_map_create_external(
1537 	pmap_t                  pmap,
1538 	vm_map_offset_t         min,
1539 	vm_map_offset_t         max,
1540 	boolean_t               pageable)
1541 {
1542 	vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1543 
1544 	if (pageable) {
1545 		options |= VM_MAP_CREATE_PAGEABLE;
1546 	}
1547 	return vm_map_create_options(pmap, min, max, options);
1548 }
1549 
1550 __startup_func
1551 void
vm_map_will_allocate_early_map(vm_map_t * owner)1552 vm_map_will_allocate_early_map(vm_map_t *owner)
1553 {
1554 	if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1555 		panic("VM_MAP_EARLY_COUNT_MAX is too low");
1556 	}
1557 
1558 	early_map_owners[early_map_count++] = owner;
1559 }
1560 
1561 __startup_func
1562 void
vm_map_relocate_early_maps(vm_offset_t delta)1563 vm_map_relocate_early_maps(vm_offset_t delta)
1564 {
1565 	for (uint32_t i = 0; i < early_map_count; i++) {
1566 		vm_address_t addr = (vm_address_t)*early_map_owners[i];
1567 
1568 		*early_map_owners[i] = (vm_map_t)(addr + delta);
1569 	}
1570 
1571 	early_map_count = ~0u;
1572 }
1573 
1574 /*
1575  *	Routine:	vm_map_relocate_early_elem
1576  *
1577  *	Purpose:
1578  *		Early zone elements are allocated in a temporary part
1579  *		of the address space.
1580  *
1581  *		Once the zones live in their final place, the early
1582  *		VM maps, map entries and map holes need to be relocated.
1583  *
1584  *		It involves rewriting any vm_map_t, vm_map_entry_t or
1585  *		pointers to vm_map_links. Other pointers to other types
1586  *		are fine.
1587  *
1588  *		Fortunately, pointers to those types are self-contained
1589  *		in those zones, _except_ for pointers to VM maps,
1590  *		which are tracked during early boot and fixed with
1591  *		vm_map_relocate_early_maps().
1592  */
1593 __startup_func
1594 void
vm_map_relocate_early_elem(uint32_t zone_id,vm_offset_t new_addr,vm_offset_t delta)1595 vm_map_relocate_early_elem(
1596 	uint32_t                zone_id,
1597 	vm_offset_t             new_addr,
1598 	vm_offset_t             delta)
1599 {
1600 #define relocate(type_t, field)  ({ \
1601 	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field;   \
1602 	if (*__field) {                                                        \
1603 	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta);  \
1604 	}                                                                      \
1605 })
1606 
1607 	switch (zone_id) {
1608 	case ZONE_ID_VM_MAP:
1609 	case ZONE_ID_VM_MAP_ENTRY:
1610 	case ZONE_ID_VM_MAP_HOLES:
1611 		break;
1612 
1613 	default:
1614 		panic("Unexpected zone ID %d", zone_id);
1615 	}
1616 
1617 	if (zone_id == ZONE_ID_VM_MAP) {
1618 		relocate(vm_map_t, hdr.links.prev);
1619 		relocate(vm_map_t, hdr.links.next);
1620 		((vm_map_t)new_addr)->pmap = kernel_pmap;
1621 #ifdef VM_MAP_STORE_USE_RB
1622 		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
1623 #endif /* VM_MAP_STORE_USE_RB */
1624 		relocate(vm_map_t, hint);
1625 		relocate(vm_map_t, hole_hint);
1626 		relocate(vm_map_t, first_free);
1627 		return;
1628 	}
1629 
1630 	relocate(struct vm_map_links *, prev);
1631 	relocate(struct vm_map_links *, next);
1632 
1633 	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
1634 #ifdef VM_MAP_STORE_USE_RB
1635 		relocate(vm_map_entry_t, store.entry.rbe_left);
1636 		relocate(vm_map_entry_t, store.entry.rbe_right);
1637 		relocate(vm_map_entry_t, store.entry.rbe_parent);
1638 #endif /* VM_MAP_STORE_USE_RB */
1639 		if (((vm_map_entry_t)new_addr)->is_sub_map) {
1640 			/* no object to relocate because we haven't made any */
1641 			((vm_map_entry_t)new_addr)->vme_submap +=
1642 			    delta >> VME_SUBMAP_SHIFT;
1643 		}
1644 #if MAP_ENTRY_CREATION_DEBUG
1645 		relocate(vm_map_entry_t, vme_creation_maphdr);
1646 #endif /* MAP_ENTRY_CREATION_DEBUG */
1647 	}
1648 
1649 #undef relocate
1650 }
1651 
1652 vm_map_t
vm_map_create_options(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,vm_map_create_options_t options)1653 vm_map_create_options(
1654 	pmap_t                  pmap,
1655 	vm_map_offset_t         min,
1656 	vm_map_offset_t         max,
1657 	vm_map_create_options_t options)
1658 {
1659 	vm_map_t result;
1660 
1661 #if DEBUG || DEVELOPMENT
1662 	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
1663 		if (early_map_count != ~0u && early_map_count !=
1664 		    zone_count_allocated(vm_map_zone) + 1) {
1665 			panic("allocating %dth early map, owner not known",
1666 			    zone_count_allocated(vm_map_zone) + 1);
1667 		}
1668 		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
1669 			panic("allocating %dth early map for non kernel pmap",
1670 			    early_map_count);
1671 		}
1672 	}
1673 #endif /* DEBUG || DEVELOPMENT */
1674 
1675 	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);
1676 
1677 	vm_map_store_init(&result->hdr);
1678 	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
1679 	vm_map_set_page_shift(result, PAGE_SHIFT);
1680 
1681 	result->size_limit      = RLIM_INFINITY;        /* default unlimited */
1682 	result->data_limit      = RLIM_INFINITY;        /* default unlimited */
1683 	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
1684 	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
1685 	result->pmap = pmap;
1686 	result->min_offset = min;
1687 	result->max_offset = max;
1688 	result->first_free = vm_map_to_entry(result);
1689 	result->hint = vm_map_to_entry(result);
1690 
1691 	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
1692 		assert(pmap == kernel_pmap);
1693 		result->never_faults = true;
1694 	}
1695 
1696 	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
1697 	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
1698 		result->has_corpse_footprint = true;
1699 	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
1700 		struct vm_map_links *hole_entry;
1701 
1702 		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
1703 		hole_entry->start = min;
1704 		/*
1705 		 * Holes can be used to track ranges all the way up to
1706 		 * MACH_VM_MAX_ADDRESS or more (e.g. kernel map).
1707 		 */
1708 		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1709 		result->holes_list = result->hole_hint = hole_entry;
1710 		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
1711 		result->holelistenabled = true;
1712 	}
1713 
1714 	vm_map_lock_init(result);
1715 
1716 	return result;
1717 }
1718 
1719 /*
1720  * Adjusts a submap that was made by kmem_suballoc()
1721  * before it knew where it would be mapped,
1722  * so that it has the right min/max offsets.
1723  *
1724  * We do not need to hold any locks:
1725  * only the caller knows about this map,
1726  * and it is not published on any entry yet.
1727  */
1728 static void
vm_map_adjust_offsets(vm_map_t map,vm_map_offset_t min_off,vm_map_offset_t max_off)1729 vm_map_adjust_offsets(
1730 	vm_map_t                map,
1731 	vm_map_offset_t         min_off,
1732 	vm_map_offset_t         max_off)
1733 {
1734 	assert(map->min_offset == 0);
1735 	assert(map->max_offset == max_off - min_off);
1736 	assert(map->hdr.nentries == 0);
1737 	assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1738 
1739 	map->min_offset = min_off;
1740 	map->max_offset = max_off;
1741 
1742 	if (map->holelistenabled) {
1743 		struct vm_map_links *hole = map->holes_list;
1744 
1745 		hole->start = min_off;
1746 #if defined(__arm64__)
1747 		hole->end = max_off;
1748 #else
1749 		hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1750 #endif
1751 	}
1752 }
1753 
1754 
1755 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1756 vm_map_adjusted_size(vm_map_t map)
1757 {
1758 	const struct vm_reserved_region *regions = NULL;
1759 	size_t num_regions = 0;
1760 	mach_vm_size_t  reserved_size = 0, map_size = 0;
1761 
1762 	if (map == NULL || (map->size == 0)) {
1763 		return 0;
1764 	}
1765 
1766 	map_size = map->size;
1767 
1768 	if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1769 		/*
1770 		 * No special reserved regions or not an exotic map or the task
1771 		 * is terminating and these special regions might have already
1772 		 * been deallocated.
1773 		 */
1774 		return map_size;
1775 	}
1776 
1777 	num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1778 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1779 
1780 	while (num_regions) {
1781 		reserved_size += regions[--num_regions].vmrr_size;
1782 	}
1783 
1784 	/*
1785 	 * There are a few places where the map is being switched out due to
1786 	 * 'termination' without that bit being set (e.g. exec and corpse purging).
1787 	 * In those cases, we could have the map's regions being deallocated on
1788 	 * a core while some accounting process is trying to get the map's size.
1789 	 * So this assert can't be enabled till all those places are uniform in
1790 	 * their use of the 'map->terminated' bit.
1791 	 *
1792 	 * assert(map_size >= reserved_size);
1793 	 */
1794 
1795 	return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1796 }
1797 
1798 /*
1799  *	vm_map_entry_create:	[ internal use only ]
1800  *
1801  *	Allocates a VM map entry for insertion in the
1802  *	given map (or map copy).  No fields are filled.
1803  *
1804  *	The VM entry will be zero initialized, except for:
1805  *	- behavior set to VM_BEHAVIOR_DEFAULT
1806  *	- inheritance set to VM_INHERIT_DEFAULT
1807  */
1808 #define vm_map_entry_create(map)    _vm_map_entry_create(&(map)->hdr)
1809 
1810 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1811 
1812 static vm_map_entry_t
_vm_map_entry_create(struct vm_map_header * map_header __unused)1813 _vm_map_entry_create(
1814 	struct vm_map_header    *map_header __unused)
1815 {
1816 	vm_map_entry_t entry = NULL;
1817 
1818 	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);
1819 
1820 	/*
1821 	 * Help the compiler with what we know to be true,
1822 	 * so that the further bitfields inits have good codegen.
1823 	 *
1824 	 * See rdar://87041299
1825 	 */
1826 	__builtin_assume(entry->vme_object_value == 0);
1827 	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
1828 	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);
1829 
1830 	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
1831 	    "VME_ALIAS_MASK covers tags");
1832 
1833 	static_assert(VM_BEHAVIOR_DEFAULT == 0,
1834 	    "can skip zeroing of the behavior field");
1835 	entry->inheritance = VM_INHERIT_DEFAULT;
1836 
1837 #if MAP_ENTRY_CREATION_DEBUG
1838 	entry->vme_creation_maphdr = map_header;
1839 	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
1840 	    BTREF_GET_NOWAIT);
1841 #endif
1842 	return entry;
1843 }
1844 
1845 /*
1846  *	vm_map_entry_dispose:	[ internal use only ]
1847  *
1848  *	Inverse of vm_map_entry_create.
1849  *
1850  *      write map lock held so no need to
1851  *	do anything special to insure correctness
1852  *      of the stores
1853  */
1854 static void
vm_map_entry_dispose(vm_map_entry_t entry)1855 vm_map_entry_dispose(
1856 	vm_map_entry_t          entry)
1857 {
1858 #if VM_BTLOG_TAGS
1859 	if (entry->vme_kernel_object) {
1860 		btref_put(entry->vme_tag_btref);
1861 	}
1862 #endif /* VM_BTLOG_TAGS */
1863 #if MAP_ENTRY_CREATION_DEBUG
1864 	btref_put(entry->vme_creation_bt);
1865 #endif
1866 #if MAP_ENTRY_INSERTION_DEBUG
1867 	btref_put(entry->vme_insertion_bt);
1868 #endif
1869 	zfree(vm_map_entry_zone, entry);
1870 }
1871 
1872 #define vm_map_copy_entry_dispose(copy_entry) \
1873 	vm_map_entry_dispose(copy_entry)
1874 
1875 static vm_map_entry_t
vm_map_zap_first_entry(vm_map_zap_t list)1876 vm_map_zap_first_entry(
1877 	vm_map_zap_t            list)
1878 {
1879 	return list->vmz_head;
1880 }
1881 
1882 static vm_map_entry_t
vm_map_zap_last_entry(vm_map_zap_t list)1883 vm_map_zap_last_entry(
1884 	vm_map_zap_t            list)
1885 {
1886 	assert(vm_map_zap_first_entry(list));
1887 	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1888 }
1889 
1890 static void
vm_map_zap_append(vm_map_zap_t list,vm_map_entry_t entry)1891 vm_map_zap_append(
1892 	vm_map_zap_t            list,
1893 	vm_map_entry_t          entry)
1894 {
1895 	entry->vme_next = VM_MAP_ENTRY_NULL;
1896 	*list->vmz_tail = entry;
1897 	list->vmz_tail = &entry->vme_next;
1898 }
1899 
1900 static vm_map_entry_t
vm_map_zap_pop(vm_map_zap_t list)1901 vm_map_zap_pop(
1902 	vm_map_zap_t            list)
1903 {
1904 	vm_map_entry_t head = list->vmz_head;
1905 
1906 	if (head != VM_MAP_ENTRY_NULL &&
1907 	    (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1908 		list->vmz_tail = &list->vmz_head;
1909 	}
1910 
1911 	return head;
1912 }
1913 
1914 static void
vm_map_zap_dispose(vm_map_zap_t list)1915 vm_map_zap_dispose(
1916 	vm_map_zap_t            list)
1917 {
1918 	vm_map_entry_t          entry;
1919 
1920 	while ((entry = vm_map_zap_pop(list))) {
1921 		if (entry->is_sub_map) {
1922 			vm_map_deallocate(VME_SUBMAP(entry));
1923 		} else {
1924 			vm_object_deallocate(VME_OBJECT(entry));
1925 		}
1926 
1927 		vm_map_entry_dispose(entry);
1928 	}
1929 }
1930 
1931 #if MACH_ASSERT
1932 static boolean_t first_free_check = FALSE;
1933 boolean_t
first_free_is_valid(vm_map_t map)1934 first_free_is_valid(
1935 	vm_map_t        map)
1936 {
1937 	if (!first_free_check) {
1938 		return TRUE;
1939 	}
1940 
1941 	return first_free_is_valid_store( map );
1942 }
1943 #endif /* MACH_ASSERT */
1944 
1945 
1946 #define vm_map_copy_entry_link(copy, after_where, entry)                \
1947 	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))
1948 
1949 #define vm_map_copy_entry_unlink(copy, entry)                           \
1950 	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1951 
1952 /*
1953  *	vm_map_destroy:
1954  *
1955  *	Actually destroy a map.
1956  */
1957 void
vm_map_destroy(vm_map_t map)1958 vm_map_destroy(
1959 	vm_map_t        map)
1960 {
1961 	/* final cleanup: this is not allowed to fail */
1962 	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
1963 
1964 	VM_MAP_ZAP_DECLARE(zap);
1965 
1966 	vm_map_lock(map);
1967 
1968 	map->terminated = true;
1969 	/* clean up regular map entries */
1970 	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
1971 	    KMEM_GUARD_NONE, &zap);
1972 	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
1973 	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
1974 	    KMEM_GUARD_NONE, &zap);
1975 
1976 	vm_map_disable_hole_optimization(map);
1977 	vm_map_corpse_footprint_destroy(map);
1978 
1979 	vm_map_unlock(map);
1980 
1981 	vm_map_zap_dispose(&zap);
1982 
1983 	assert(map->hdr.nentries == 0);
1984 
1985 	if (map->pmap) {
1986 		pmap_destroy(map->pmap);
1987 	}
1988 
1989 	lck_rw_destroy(&map->lock, &vm_map_lck_grp);
1990 
1991 #if CONFIG_MAP_RANGES
1992 	kfree_data(map->extra_ranges,
1993 	    map->extra_ranges_count * sizeof(struct vm_map_user_range));
1994 #endif
1995 
1996 	zfree_id(ZONE_ID_VM_MAP, map);
1997 }
1998 
1999 /*
2000  * Returns pid of the task with the largest number of VM map entries.
2001  * Used in the zone-map-exhaustion jetsam path.
2002  */
2003 pid_t
find_largest_process_vm_map_entries(void)2004 find_largest_process_vm_map_entries(void)
2005 {
2006 	pid_t victim_pid = -1;
2007 	int max_vm_map_entries = 0;
2008 	task_t task = TASK_NULL;
2009 	queue_head_t *task_list = &tasks;
2010 
2011 	lck_mtx_lock(&tasks_threads_lock);
2012 	queue_iterate(task_list, task, task_t, tasks) {
2013 		if (task == kernel_task || !task->active) {
2014 			continue;
2015 		}
2016 
2017 		vm_map_t task_map = task->map;
2018 		if (task_map != VM_MAP_NULL) {
2019 			int task_vm_map_entries = task_map->hdr.nentries;
2020 			if (task_vm_map_entries > max_vm_map_entries) {
2021 				max_vm_map_entries = task_vm_map_entries;
2022 				victim_pid = pid_from_task(task);
2023 			}
2024 		}
2025 	}
2026 	lck_mtx_unlock(&tasks_threads_lock);
2027 
2028 	printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
2029 	return victim_pid;
2030 }
2031 
2032 
2033 /*
2034  *	vm_map_lookup_entry:	[ internal use only ]
2035  *
2036  *	Calls into the vm map store layer to find the map
2037  *	entry containing (or immediately preceding) the
2038  *	specified address in the given map; the entry is returned
2039  *	in the "entry" parameter.  The boolean
2040  *	result indicates whether the address is
2041  *	actually contained in the map.
2042  */
2043 boolean_t
vm_map_lookup_entry(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)2044 vm_map_lookup_entry(
2045 	vm_map_t        map,
2046 	vm_map_offset_t address,
2047 	vm_map_entry_t  *entry)         /* OUT */
2048 {
2049 	bool result = false;
2050 
2051 #if CONFIG_KERNEL_TAGGING
2052 	if (VM_KERNEL_ADDRESS(address)) {
2053 		address = vm_memtag_canonicalize_kernel(address);
2054 	}
2055 #endif /* CONFIG_KERNEL_TAGGING */
2056 
2057 #if CONFIG_PROB_GZALLOC
2058 	if (map->pmap == kernel_pmap) {
2059 		assertf(!pgz_owned(address),
2060 		    "it is the responsibility of callers to unguard PGZ addresses");
2061 	}
2062 #endif /* CONFIG_PROB_GZALLOC */
2063 	result = vm_map_store_lookup_entry( map, address, entry );
2064 
2065 	return result;
2066 }
2067 
2068 boolean_t
vm_map_lookup_entry_or_next(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)2069 vm_map_lookup_entry_or_next(
2070 	vm_map_t        map,
2071 	vm_map_offset_t address,
2072 	vm_map_entry_t  *entry)         /* OUT */
2073 {
2074 	if (vm_map_lookup_entry(map, address, entry)) {
2075 		return true;
2076 	}
2077 
2078 	*entry = (*entry)->vme_next;
2079 	return false;
2080 }
2081 
2082 #if CONFIG_PROB_GZALLOC
2083 boolean_t
vm_map_lookup_entry_allow_pgz(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)2084 vm_map_lookup_entry_allow_pgz(
2085 	vm_map_t        map,
2086 	vm_map_offset_t address,
2087 	vm_map_entry_t  *entry)         /* OUT */
2088 {
2089 #if CONFIG_KERNEL_TAGGING
2090 	if (VM_KERNEL_ADDRESS(address)) {
2091 		address = vm_memtag_canonicalize_kernel(address);
2092 	}
2093 #endif /* CONFIG_KERNEL_TAGGING */
2094 
2095 	return vm_map_store_lookup_entry( map, address, entry );
2096 }
2097 #endif /* CONFIG_PROB_GZALLOC */
2098 
2099 /*
2100  *	Routine:	vm_map_range_invalid_panic
2101  *	Purpose:
2102  *			Panic on detection of an invalid range id.
2103  */
2104 __abortlike
2105 static void
vm_map_range_invalid_panic(vm_map_t map,vm_map_range_id_t range_id)2106 vm_map_range_invalid_panic(
2107 	vm_map_t                map,
2108 	vm_map_range_id_t       range_id)
2109 {
2110 	panic("invalid range ID (%u) for map %p", range_id, map);
2111 }
2112 
2113 /*
2114  *	Routine:	vm_map_get_range
2115  *	Purpose:
2116  *			Adjust bounds based on security policy.
2117  */
2118 static struct mach_vm_range
vm_map_get_range(vm_map_t map,vm_map_address_t * address,vm_map_kernel_flags_t * vmk_flags,vm_map_size_t size,bool * is_ptr)2119 vm_map_get_range(
2120 	vm_map_t                map,
2121 	vm_map_address_t       *address,
2122 	vm_map_kernel_flags_t  *vmk_flags,
2123 	vm_map_size_t           size,
2124 	bool                   *is_ptr)
2125 {
2126 	struct mach_vm_range effective_range = {};
2127 	vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
2128 
2129 	if (map == kernel_map) {
2130 		effective_range = kmem_ranges[range_id];
2131 
2132 		if (startup_phase >= STARTUP_SUB_KMEM) {
2133 			/*
2134 			 * Hint provided by caller is zeroed as the range is restricted to a
2135 			 * subset of the entire kernel_map VA, which could put the hint outside
2136 			 * the range, causing vm_map_store_find_space to fail.
2137 			 */
2138 			*address = 0ull;
2139 			/*
2140 			 * Ensure that range_id passed in by the caller is within meaningful
2141 			 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
2142 			 * to fail as the corresponding range is invalid. Range id larger than
2143 			 * KMEM_RANGE_ID_MAX will lead to an OOB access.
2144 			 */
2145 			if ((range_id == KMEM_RANGE_ID_NONE) ||
2146 			    (range_id > KMEM_RANGE_ID_MAX)) {
2147 				vm_map_range_invalid_panic(map, range_id);
2148 			}
2149 
2150 			/*
2151 			 * Pointer ranges use kmem_locate_space to do allocations.
2152 			 *
2153 			 * Non pointer fronts look like [ Small | Large | Permanent ]
2154 			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
2155 			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
2156 			 * use the entire range.
2157 			 */
2158 			if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
2159 				*is_ptr = true;
2160 			} else if (size >= KMEM_SMALLMAP_THRESHOLD) {
2161 				effective_range = kmem_large_ranges[range_id];
2162 			}
2163 		}
2164 #if CONFIG_MAP_RANGES
2165 	} else if (map->uses_user_ranges) {
2166 		switch (range_id) {
2167 		case UMEM_RANGE_ID_DEFAULT:
2168 			effective_range = map->default_range;
2169 			break;
2170 		case UMEM_RANGE_ID_HEAP:
2171 			effective_range = map->data_range;
2172 			break;
2173 		case UMEM_RANGE_ID_LARGE_FILE:
2174 			if (map->large_file_range.min_address != map->large_file_range.max_address) {
2175 				/* large file range is configured and should be used */
2176 				effective_range = map->large_file_range;
2177 			} else {
2178 				/*
2179 				 * the user asking for this user range might not have the
2180 				 * permissions to use the large file range (i.e., it doesn't
2181 				 * hold the correct entitlement), so we give it the data range
2182 				 * instead
2183 				 */
2184 				effective_range = map->data_range;
2185 			}
2186 			break;
2187 		case UMEM_RANGE_ID_FIXED:
2188 			/*
2189 			 * anywhere allocations with an address in "FIXED"
2190 			 * makes no sense, leave the range empty
2191 			 */
2192 			break;
2193 
2194 		default:
2195 			vm_map_range_invalid_panic(map, range_id);
2196 		}
2197 #endif /* CONFIG_MAP_RANGES */
2198 	} else {
2199 		/*
2200 		 * If minimum is 0, bump it up by PAGE_SIZE.  We want to limit
2201 		 * allocations of PAGEZERO to explicit requests since its
2202 		 * normal use is to catch dereferences of NULL and many
2203 		 * applications also treat pointers with a value of 0 as
2204 		 * special and suddenly having address 0 contain useable
2205 		 * memory would tend to confuse those applications.
2206 		 */
2207 		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
2208 		effective_range.max_address = map->max_offset;
2209 	}
2210 
2211 	return effective_range;
2212 }
2213 
2214 kern_return_t
vm_map_locate_space_anywhere(vm_map_t map,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_offset_t * start_inout,vm_map_entry_t * entry_out)2215 vm_map_locate_space_anywhere(
2216 	vm_map_t                map,
2217 	vm_map_size_t           size,
2218 	vm_map_offset_t         mask,
2219 	vm_map_kernel_flags_t   vmk_flags,
2220 	vm_map_offset_t        *start_inout,
2221 	vm_map_entry_t         *entry_out)
2222 {
2223 	struct mach_vm_range effective_range = {};
2224 	vm_map_size_t   guard_offset;
2225 	vm_map_offset_t hint, limit;
2226 	vm_map_entry_t  entry;
2227 	bool            is_kmem_ptr_range = false;
2228 
2229 	/*
2230 	 * Only supported by vm_map_enter() with a fixed address.
2231 	 */
2232 	assert(!vmk_flags.vmf_fixed);
2233 	assert(!vmk_flags.vmkf_beyond_max);
2234 
2235 	if (__improbable(map->wait_for_space)) {
2236 		/*
2237 		 * support for "wait_for_space" is minimal,
2238 		 * its only consumer is the ipc_kernel_copy_map.
2239 		 */
2240 		assert(!map->holelistenabled &&
2241 		    !vmk_flags.vmkf_last_free &&
2242 		    !vmk_flags.vmkf_keep_map_locked &&
2243 		    !vmk_flags.vmkf_map_jit &&
2244 		    !vmk_flags.vmf_random_addr &&
2245 		    *start_inout <= map->min_offset);
2246 	} else if (vmk_flags.vmkf_last_free) {
2247 		assert(!vmk_flags.vmkf_map_jit &&
2248 		    !vmk_flags.vmf_random_addr);
2249 	}
2250 
2251 	if (vmk_flags.vmkf_guard_before) {
2252 		guard_offset = VM_MAP_PAGE_SIZE(map);
2253 		assert(size > guard_offset);
2254 		size -= guard_offset;
2255 	} else {
2256 		assert(size != 0);
2257 		guard_offset = 0;
2258 	}
2259 
2260 	if (__improbable(!vm_map_is_map_size_valid(
2261 		    map, size, vmk_flags.vmkf_no_soft_limit))) {
2262 		return KERN_NO_SPACE;
2263 	}
2264 
2265 	/*
2266 	 * Validate range_id from flags and get associated range
2267 	 */
2268 	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
2269 	    &is_kmem_ptr_range);
2270 
2271 	if (is_kmem_ptr_range) {
2272 		return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
2273 		           vmk_flags.vmkf_last_free, start_inout, entry_out);
2274 	}
2275 
2276 #if XNU_TARGET_OS_OSX
2277 	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2278 		assert(map != kernel_map);
2279 		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2280 	}
2281 #endif /* XNU_TARGET_OS_OSX */
2282 
2283 again:
2284 	if (vmk_flags.vmkf_last_free) {
2285 		hint = *start_inout;
2286 
2287 		if (hint == 0 || hint > effective_range.max_address) {
2288 			hint = effective_range.max_address;
2289 		}
2290 		if (hint <= effective_range.min_address) {
2291 			return KERN_NO_SPACE;
2292 		}
2293 		limit = effective_range.min_address;
2294 	} else {
2295 		hint = *start_inout;
2296 
2297 		if (vmk_flags.vmkf_map_jit) {
2298 			if (map->jit_entry_exists &&
2299 			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2300 				return KERN_INVALID_ARGUMENT;
2301 			}
2302 			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2303 				vmk_flags.vmf_random_addr = true;
2304 			}
2305 		}
2306 
2307 		if (vmk_flags.vmf_random_addr) {
2308 			kern_return_t kr;
2309 
2310 			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2311 			if (kr != KERN_SUCCESS) {
2312 				return kr;
2313 			}
2314 		}
2315 #if __x86_64__
2316 		else if ((hint == 0 || hint == vm_map_min(map)) &&
2317 		    !map->disable_vmentry_reuse &&
2318 		    map->vmmap_high_start != 0) {
2319 			hint = map->vmmap_high_start;
2320 		}
2321 #endif /* __x86_64__ */
2322 
2323 		if (hint < effective_range.min_address) {
2324 			hint = effective_range.min_address;
2325 		}
2326 		if (effective_range.max_address <= hint) {
2327 			return KERN_NO_SPACE;
2328 		}
2329 
2330 		limit = effective_range.max_address;
2331 	}
2332 	entry = vm_map_store_find_space(map,
2333 	    hint, limit, vmk_flags.vmkf_last_free,
2334 	    guard_offset, size, mask,
2335 	    start_inout);
2336 
2337 	if (__improbable(entry == NULL)) {
2338 		if (map->wait_for_space &&
2339 		    guard_offset + size <=
2340 		    effective_range.max_address - effective_range.min_address) {
2341 			assert_wait((event_t)map, THREAD_ABORTSAFE);
2342 			vm_map_unlock(map);
2343 			thread_block(THREAD_CONTINUE_NULL);
2344 			vm_map_lock(map);
2345 			goto again;
2346 		}
2347 		return KERN_NO_SPACE;
2348 	}
2349 
2350 	if (entry_out) {
2351 		*entry_out = entry;
2352 	}
2353 	return KERN_SUCCESS;
2354 }
2355 
2356 /*!
2357  * @function vm_map_locate_space_fixed()
2358  *
2359  * @brief
2360  * Locate (no reservation) a range in the specified VM map at a fixed address.
2361  *
2362  * @param map           the map to scan for memory, must be locked.
2363  * @param start         the fixed address trying to be reserved
2364  * @param size          the size of the allocation to make.
2365  * @param mask          an alignment mask the allocation must respect,
2366  * @param vmk_flags     the vm map kernel flags to influence this call.
2367  *                      vmk_flags.vmf_anywhere must not be set.
2368  * @param entry_out     the entry right before the hole.
2369  * @param zap_list      a zap list of entries to clean up after the call.
2370  *
2371  * @returns
2372  * - KERN_SUCCESS in case of success and no conflicting entry is found,
2373  *   in which case entry_out is set to the entry before the hole.
2374  *
2375  * - KERN_MEMORY_PRESENT if a conflicting entry is found,
2376  *   in which case entry_out is set the conflicting entry,
2377  *   the callers MUST handle this error explicitly.
2378  *
2379  * - KERN_INVALID_ADDRESS if the specified @c start or @c size
2380  *   would result in a mapping outside of the map.
2381  *
2382  * - KERN_NO_SPACE for various cases of unrecoverable failures.
2383  */
2384 static kern_return_t
vm_map_locate_space_fixed(vm_map_t map,vm_map_offset_t start,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_entry_t * entry_out,vm_map_zap_t zap_list)2385 vm_map_locate_space_fixed(
2386 	vm_map_t                map,
2387 	vm_map_offset_t         start,
2388 	vm_map_size_t           size,
2389 	vm_map_offset_t         mask,
2390 	vm_map_kernel_flags_t   vmk_flags,
2391 	vm_map_entry_t         *entry_out,
2392 	vm_map_zap_t            zap_list)
2393 {
2394 	vm_map_offset_t effective_min_offset, effective_max_offset;
2395 	vm_map_entry_t  entry;
2396 	vm_map_offset_t end;
2397 
2398 	assert(vmk_flags.vmf_fixed);
2399 
2400 	effective_min_offset = map->min_offset;
2401 	effective_max_offset = map->max_offset;
2402 
2403 	if (vmk_flags.vmkf_beyond_max) {
2404 		/*
2405 		 * Allow an insertion beyond the map's max offset.
2406 		 */
2407 		effective_max_offset = 0x00000000FFFFF000ULL;
2408 		if (vm_map_is_64bit(map)) {
2409 			effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2410 		}
2411 #if XNU_TARGET_OS_OSX
2412 	} else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2413 		effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2414 #endif /* XNU_TARGET_OS_OSX */
2415 	}
2416 
2417 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2418 	    !vmk_flags.vmf_overwrite &&
2419 	    map->pmap == kernel_pmap &&
2420 	    vmk_flags.vm_tag == VM_MEMORY_REALLOC) {
2421 		/*
2422 		 * Force realloc() to switch to a new allocation,
2423 		 * to prevent 4k-fragmented virtual ranges.
2424 		 */
2425 //		DEBUG4K_ERROR("no realloc in place");
2426 		return KERN_NO_SPACE;
2427 	}
2428 
2429 	/*
2430 	 *	Verify that:
2431 	 *		the address doesn't itself violate
2432 	 *		the mask requirement.
2433 	 */
2434 
2435 	if ((start & mask) != 0) {
2436 		return KERN_NO_SPACE;
2437 	}
2438 
2439 	if (__improbable(!vm_map_is_map_size_valid(
2440 		    map, size, vmk_flags.vmkf_no_soft_limit))) {
2441 		return KERN_NO_SPACE;
2442 	}
2443 
2444 #if CONFIG_MAP_RANGES
2445 	if (map->uses_user_ranges) {
2446 		struct mach_vm_range r;
2447 
2448 		vm_map_user_range_resolve(map, start, 1, &r);
2449 		if (r.max_address == 0) {
2450 			return KERN_INVALID_ADDRESS;
2451 		}
2452 		effective_min_offset = r.min_address;
2453 		effective_max_offset = r.max_address;
2454 	}
2455 #endif /* CONFIG_MAP_RANGES */
2456 
2457 	if ((startup_phase >= STARTUP_SUB_KMEM) && !vmk_flags.vmkf_submap &&
2458 	    (map == kernel_map)) {
2459 		mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2460 		effective_min_offset = r->min_address;
2461 		effective_max_offset = r->max_address;
2462 	}
2463 
2464 	/*
2465 	 *	...	the address is within bounds
2466 	 */
2467 
2468 	end = start + size;
2469 
2470 	if ((start < effective_min_offset) ||
2471 	    (end > effective_max_offset) ||
2472 	    (start >= end)) {
2473 		return KERN_INVALID_ADDRESS;
2474 	}
2475 
2476 	if (vmk_flags.vmf_overwrite) {
2477 		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
2478 		kern_return_t remove_kr;
2479 
2480 		/*
2481 		 * Fixed mapping and "overwrite" flag: attempt to
2482 		 * remove all existing mappings in the specified
2483 		 * address range, saving them in our "zap_list".
2484 		 *
2485 		 * This avoids releasing the VM map lock in
2486 		 * vm_map_entry_delete() and allows atomicity
2487 		 * when we want to replace some mappings with a new one.
2488 		 * It also allows us to restore the old VM mappings if the
2489 		 * new mapping fails.
2490 		 */
2491 		remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2492 
2493 		if (vmk_flags.vmkf_overwrite_immutable) {
2494 			/* we can overwrite immutable mappings */
2495 			remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2496 		}
2497 		if (vmk_flags.vmkf_remap_prot_copy) {
2498 			remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2499 		}
2500 		remove_kr = vm_map_delete(map, start, end, remove_flags,
2501 		    KMEM_GUARD_NONE, zap_list).kmr_return;
2502 		if (remove_kr) {
2503 			/* XXX FBDP restore zap_list? */
2504 			return remove_kr;
2505 		}
2506 	}
2507 
2508 	/*
2509 	 *	...	the starting address isn't allocated
2510 	 */
2511 
2512 	if (vm_map_lookup_entry(map, start, &entry)) {
2513 		*entry_out = entry;
2514 		return KERN_MEMORY_PRESENT;
2515 	}
2516 
2517 	/*
2518 	 *	...	the next region doesn't overlap the
2519 	 *		end point.
2520 	 */
2521 
2522 	if ((entry->vme_next != vm_map_to_entry(map)) &&
2523 	    (entry->vme_next->vme_start < end)) {
2524 		return KERN_NO_SPACE;
2525 	}
2526 
2527 	*entry_out = entry;
2528 	return KERN_SUCCESS;
2529 }
2530 
2531 /*
2532  *	Routine:	vm_map_find_space
2533  *	Purpose:
2534  *		Allocate a range in the specified virtual address map,
2535  *		returning the entry allocated for that range.
2536  *		Used by kmem_alloc, etc.
2537  *
2538  *		The map must be NOT be locked. It will be returned locked
2539  *		on KERN_SUCCESS, unlocked on failure.
2540  *
2541  *		If an entry is allocated, the object/offset fields
2542  *		are initialized to zero.
2543  */
2544 kern_return_t
vm_map_find_space(vm_map_t map,vm_map_offset_t hint_address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_entry_t * o_entry)2545 vm_map_find_space(
2546 	vm_map_t                map,
2547 	vm_map_offset_t         hint_address,
2548 	vm_map_size_t           size,
2549 	vm_map_offset_t         mask,
2550 	vm_map_kernel_flags_t   vmk_flags,
2551 	vm_map_entry_t          *o_entry)       /* OUT */
2552 {
2553 	vm_map_entry_t          new_entry, entry;
2554 	kern_return_t           kr;
2555 
2556 	if (size == 0) {
2557 		return KERN_INVALID_ARGUMENT;
2558 	}
2559 
2560 	new_entry = vm_map_entry_create(map);
2561 	new_entry->use_pmap = true;
2562 	new_entry->protection = VM_PROT_DEFAULT;
2563 	new_entry->max_protection = VM_PROT_ALL;
2564 
2565 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2566 		new_entry->map_aligned = true;
2567 	}
2568 	if (vmk_flags.vmf_permanent) {
2569 		new_entry->vme_permanent = true;
2570 	}
2571 
2572 	vm_map_lock(map);
2573 
2574 	kr = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
2575 	    &hint_address, &entry);
2576 	if (kr != KERN_SUCCESS) {
2577 		vm_map_unlock(map);
2578 		vm_map_entry_dispose(new_entry);
2579 		return kr;
2580 	}
2581 	new_entry->vme_start = hint_address;
2582 	new_entry->vme_end = hint_address + size;
2583 
2584 	/*
2585 	 *	At this point,
2586 	 *
2587 	 *	- new_entry's "vme_start" and "vme_end" should define
2588 	 *	  the endpoints of the available new range,
2589 	 *
2590 	 *	- and "entry" should refer to the region before
2591 	 *	  the new range,
2592 	 *
2593 	 *	- and the map should still be locked.
2594 	 */
2595 
2596 	assert(page_aligned(new_entry->vme_start));
2597 	assert(page_aligned(new_entry->vme_end));
2598 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2599 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2600 
2601 
2602 	/*
2603 	 *	Insert the new entry into the list
2604 	 */
2605 
2606 	vm_map_store_entry_link(map, entry, new_entry,
2607 	    VM_MAP_KERNEL_FLAGS_NONE);
2608 	map->size += size;
2609 
2610 	/*
2611 	 *	Update the lookup hint
2612 	 */
2613 	SAVE_HINT_MAP_WRITE(map, new_entry);
2614 
2615 	*o_entry = new_entry;
2616 	return KERN_SUCCESS;
2617 }
2618 
2619 int vm_map_pmap_enter_print = FALSE;
2620 int vm_map_pmap_enter_enable = FALSE;
2621 
2622 /*
2623  *	Routine:	vm_map_pmap_enter [internal only]
2624  *
2625  *	Description:
2626  *		Force pages from the specified object to be entered into
2627  *		the pmap at the specified address if they are present.
2628  *		As soon as a page not found in the object the scan ends.
2629  *
2630  *	Returns:
2631  *		Nothing.
2632  *
2633  *	In/out conditions:
2634  *		The source map should not be locked on entry.
2635  */
2636 __unused static void
vm_map_pmap_enter(vm_map_t map,vm_map_offset_t addr,vm_map_offset_t end_addr,vm_object_t object,vm_object_offset_t offset,vm_prot_t protection)2637 vm_map_pmap_enter(
2638 	vm_map_t                map,
2639 	vm_map_offset_t         addr,
2640 	vm_map_offset_t         end_addr,
2641 	vm_object_t             object,
2642 	vm_object_offset_t      offset,
2643 	vm_prot_t               protection)
2644 {
2645 	int                     type_of_fault;
2646 	kern_return_t           kr;
2647 	uint8_t                 object_lock_type = 0;
2648 	struct vm_object_fault_info fault_info = {
2649 		.interruptible = THREAD_UNINT,
2650 	};
2651 
2652 	if (map->pmap == 0) {
2653 		return;
2654 	}
2655 
2656 	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2657 
2658 	while (addr < end_addr) {
2659 		vm_page_t       m;
2660 
2661 
2662 		/*
2663 		 * TODO:
2664 		 * From vm_map_enter(), we come into this function without the map
2665 		 * lock held or the object lock held.
2666 		 * We haven't taken a reference on the object either.
2667 		 * We should do a proper lookup on the map to make sure
2668 		 * that things are sane before we go locking objects that
2669 		 * could have been deallocated from under us.
2670 		 */
2671 
2672 		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2673 		vm_object_lock(object);
2674 
2675 		m = vm_page_lookup(object, offset);
2676 
2677 		if (m == VM_PAGE_NULL || m->vmp_busy || vm_page_is_fictitious(m) ||
2678 		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
2679 			vm_object_unlock(object);
2680 			return;
2681 		}
2682 
2683 		if (vm_map_pmap_enter_print) {
2684 			printf("vm_map_pmap_enter:");
2685 			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2686 			    map, (unsigned long long)addr, object, (unsigned long long)offset);
2687 		}
2688 		type_of_fault = DBG_CACHE_HIT_FAULT;
2689 		kr = vm_fault_enter(m, map->pmap,
2690 		    addr,
2691 		    PAGE_SIZE, 0,
2692 		    protection, protection,
2693 		    VM_PAGE_WIRED(m),
2694 		    VM_KERN_MEMORY_NONE,                 /* tag - not wiring */
2695 		    &fault_info,
2696 		    NULL,                  /* need_retry */
2697 		    &type_of_fault,
2698 		    &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/
2699 
2700 		vm_object_unlock(object);
2701 
2702 		offset += PAGE_SIZE_64;
2703 		addr += PAGE_SIZE;
2704 	}
2705 }
2706 
2707 #define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
2708 static kern_return_t
vm_map_random_address_for_size(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_kernel_flags_t vmk_flags)2709 vm_map_random_address_for_size(
2710 	vm_map_t                map,
2711 	vm_map_offset_t        *address,
2712 	vm_map_size_t           size,
2713 	vm_map_kernel_flags_t   vmk_flags)
2714 {
2715 	kern_return_t   kr = KERN_SUCCESS;
2716 	int             tries = 0;
2717 	vm_map_offset_t random_addr = 0;
2718 	vm_map_offset_t hole_end;
2719 
2720 	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
2721 	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
2722 	vm_map_size_t   vm_hole_size = 0;
2723 	vm_map_size_t   addr_space_size;
2724 	bool            is_kmem_ptr;
2725 	struct mach_vm_range effective_range;
2726 
2727 	effective_range = vm_map_get_range(map, address, &vmk_flags, size,
2728 	    &is_kmem_ptr);
2729 
2730 	addr_space_size = effective_range.max_address - effective_range.min_address;
2731 	if (size >= addr_space_size) {
2732 		return KERN_NO_SPACE;
2733 	}
2734 	addr_space_size -= size;
2735 
2736 	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));
2737 
2738 	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2739 		if (startup_phase < STARTUP_SUB_ZALLOC) {
2740 			random_addr = (vm_map_offset_t)early_random();
2741 		} else {
2742 			random_addr = (vm_map_offset_t)random();
2743 		}
2744 		random_addr <<= VM_MAP_PAGE_SHIFT(map);
2745 		random_addr = vm_map_trunc_page(
2746 			effective_range.min_address + (random_addr % addr_space_size),
2747 			VM_MAP_PAGE_MASK(map));
2748 
2749 #if CONFIG_PROB_GZALLOC
2750 		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
2751 			continue;
2752 		}
2753 #endif /* CONFIG_PROB_GZALLOC */
2754 
2755 		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
2756 			if (prev_entry == vm_map_to_entry(map)) {
2757 				next_entry = vm_map_first_entry(map);
2758 			} else {
2759 				next_entry = prev_entry->vme_next;
2760 			}
2761 			if (next_entry == vm_map_to_entry(map)) {
2762 				hole_end = vm_map_max(map);
2763 			} else {
2764 				hole_end = next_entry->vme_start;
2765 			}
2766 			vm_hole_size = hole_end - random_addr;
2767 			if (vm_hole_size >= size) {
2768 				*address = random_addr;
2769 				break;
2770 			}
2771 		}
2772 		tries++;
2773 	}
2774 
2775 	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2776 		kr = KERN_NO_SPACE;
2777 	}
2778 	return kr;
2779 }
2780 
2781 static boolean_t
vm_memory_malloc_no_cow(int alias)2782 vm_memory_malloc_no_cow(
2783 	int alias)
2784 {
2785 	uint64_t alias_mask;
2786 
2787 	if (!malloc_no_cow) {
2788 		return FALSE;
2789 	}
2790 	if (alias > 63) {
2791 		return FALSE;
2792 	}
2793 	alias_mask = 1ULL << alias;
2794 	if (alias_mask & vm_memory_malloc_no_cow_mask) {
2795 		return TRUE;
2796 	}
2797 	return FALSE;
2798 }
2799 
2800 uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2801 uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2802 /*
2803  *	Routine:	vm_map_enter
2804  *
2805  *	Description:
2806  *		Allocate a range in the specified virtual address map.
2807  *		The resulting range will refer to memory defined by
2808  *		the given memory object and offset into that object.
2809  *
2810  *		Arguments are as defined in the vm_map call.
2811  */
2812 static unsigned int vm_map_enter_restore_successes = 0;
2813 static unsigned int vm_map_enter_restore_failures = 0;
2814 kern_return_t
vm_map_enter(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)2815 vm_map_enter(
2816 	vm_map_t                map,
2817 	vm_map_offset_t         *address,       /* IN/OUT */
2818 	vm_map_size_t           size,
2819 	vm_map_offset_t         mask,
2820 	vm_map_kernel_flags_t   vmk_flags,
2821 	vm_object_t             object,
2822 	vm_object_offset_t      offset,
2823 	boolean_t               needs_copy,
2824 	vm_prot_t               cur_protection,
2825 	vm_prot_t               max_protection,
2826 	vm_inherit_t            inheritance)
2827 {
2828 	vm_map_entry_t          entry, new_entry;
2829 	vm_map_offset_t         start, tmp_start, tmp_offset;
2830 	vm_map_offset_t         end, tmp_end;
2831 	vm_map_offset_t         tmp2_start, tmp2_end;
2832 	vm_map_offset_t         step;
2833 	kern_return_t           result = KERN_SUCCESS;
2834 	bool                    map_locked = FALSE;
2835 	bool                    pmap_empty = TRUE;
2836 	bool                    new_mapping_established = FALSE;
2837 	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2838 	const bool              anywhere = !vmk_flags.vmf_fixed;
2839 	const bool              purgable = vmk_flags.vmf_purgeable;
2840 	const bool              no_cache = vmk_flags.vmf_no_cache;
2841 	const bool              is_submap = vmk_flags.vmkf_submap;
2842 	const bool              permanent = vmk_flags.vmf_permanent;
2843 	const bool              no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2844 	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
2845 	const bool              iokit_acct = vmk_flags.vmkf_iokit_acct;
2846 	const bool              resilient_codesign = vmk_flags.vmf_resilient_codesign;
2847 	const bool              resilient_media = vmk_flags.vmf_resilient_media;
2848 	const bool              entry_for_tpro = vmk_flags.vmf_tpro;
2849 	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
2850 	const vm_tag_t          alias = vmk_flags.vm_tag;
2851 	vm_tag_t                user_alias;
2852 	kern_return_t           kr;
2853 	bool                    clear_map_aligned = FALSE;
2854 	vm_map_size_t           chunk_size = 0;
2855 	vm_object_t             caller_object;
2856 	VM_MAP_ZAP_DECLARE(zap_old_list);
2857 	VM_MAP_ZAP_DECLARE(zap_new_list);
2858 
2859 	caller_object = object;
2860 
2861 	assertf(vmk_flags.__vmkf_unused2 == 0, "vmk_flags unused2=0x%llx\n", vmk_flags.__vmkf_unused2);
2862 
2863 	if (vmk_flags.vmf_4gb_chunk) {
2864 #if defined(__LP64__)
2865 		chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2866 #else /* __LP64__ */
2867 		chunk_size = ANON_CHUNK_SIZE;
2868 #endif /* __LP64__ */
2869 	} else {
2870 		chunk_size = ANON_CHUNK_SIZE;
2871 	}
2872 
2873 
2874 
2875 	if (superpage_size) {
2876 		if (object != VM_OBJECT_NULL) {
2877 			/* caller can't provide their own VM object */
2878 			return KERN_INVALID_ARGUMENT;
2879 		}
2880 		switch (superpage_size) {
2881 			/*
2882 			 * Note that the current implementation only supports
2883 			 * a single size for superpages, SUPERPAGE_SIZE, per
2884 			 * architecture. As soon as more sizes are supposed
2885 			 * to be supported, SUPERPAGE_SIZE has to be replaced
2886 			 * with a lookup of the size depending on superpage_size.
2887 			 */
2888 #ifdef __x86_64__
2889 		case SUPERPAGE_SIZE_ANY:
2890 			/* handle it like 2 MB and round up to page size */
2891 			size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2892 			OS_FALLTHROUGH;
2893 		case SUPERPAGE_SIZE_2MB:
2894 			break;
2895 #endif
2896 		default:
2897 			return KERN_INVALID_ARGUMENT;
2898 		}
2899 		mask = SUPERPAGE_SIZE - 1;
2900 		if (size & (SUPERPAGE_SIZE - 1)) {
2901 			return KERN_INVALID_ARGUMENT;
2902 		}
2903 		inheritance = VM_INHERIT_NONE;  /* fork() children won't inherit superpages */
2904 	}
2905 
2906 
2907 	if ((cur_protection & VM_PROT_WRITE) &&
2908 	    (cur_protection & VM_PROT_EXECUTE) &&
2909 #if XNU_TARGET_OS_OSX
2910 	    map->pmap != kernel_pmap &&
2911 	    (cs_process_global_enforcement() ||
2912 	    (vmk_flags.vmkf_cs_enforcement_override
2913 	    ? vmk_flags.vmkf_cs_enforcement
2914 	    : (vm_map_cs_enforcement(map)
2915 #if __arm64__
2916 	    || !VM_MAP_IS_EXOTIC(map)
2917 #endif /* __arm64__ */
2918 	    ))) &&
2919 #endif /* XNU_TARGET_OS_OSX */
2920 #if CODE_SIGNING_MONITOR
2921 	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2922 #endif
2923 	    (VM_MAP_POLICY_WX_FAIL(map) ||
2924 	    VM_MAP_POLICY_WX_STRIP_X(map)) &&
2925 	    !entry_for_jit) {
2926 		boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2927 
2928 		DTRACE_VM3(cs_wx,
2929 		    uint64_t, 0,
2930 		    uint64_t, 0,
2931 		    vm_prot_t, cur_protection);
2932 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2933 		    proc_selfpid(),
2934 		    (get_bsdtask_info(current_task())
2935 		    ? proc_name_address(get_bsdtask_info(current_task()))
2936 		    : "?"),
2937 		    __FUNCTION__,
2938 		    (vm_protect_wx_fail ? "failing" : "turning off execute"));
2939 		cur_protection &= ~VM_PROT_EXECUTE;
2940 		if (vm_protect_wx_fail) {
2941 			return KERN_PROTECTION_FAILURE;
2942 		}
2943 	}
2944 
2945 	if (entry_for_jit
2946 	    && cur_protection != VM_PROT_ALL) {
2947 		/*
2948 		 * Native macOS processes and all non-macOS processes are
2949 		 * expected to create JIT regions via mmap(MAP_JIT, RWX) but
2950 		 * the RWX requirement was not enforced, and thus, we must live
2951 		 * with our sins. We are now dealing with a JIT mapping without
2952 		 * RWX.
2953 		 *
2954 		 * We deal with these by letting the MAP_JIT stick in order
2955 		 * to avoid CS violations when these pages are mapped executable
2956 		 * down the line. In order to appease the page table monitor (you
2957 		 * know what I'm talking about), these pages will end up being
2958 		 * marked as XNU_USER_DEBUG, which will be allowed because we
2959 		 * don't enforce the code signing monitor on macOS systems. If
2960 		 * the user-space application ever changes permissions to RWX,
2961 		 * which they are allowed to since the mapping was originally
2962 		 * created with MAP_JIT, then they'll switch over to using the
2963 		 * XNU_USER_JIT type, and won't be allowed to downgrade any
2964 		 * more after that.
2965 		 *
2966 		 * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is
2967 		 * strictly disallowed.
2968 		 */
2969 
2970 #if XNU_TARGET_OS_OSX
2971 		/*
2972 		 * Continue to allow non-RWX JIT
2973 		 */
2974 #else
2975 		/* non-macOS: reject JIT regions without RWX */
2976 		DTRACE_VM3(cs_wx,
2977 		    uint64_t, 0,
2978 		    uint64_t, 0,
2979 		    vm_prot_t, cur_protection);
2980 		printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing. \n",
2981 		    proc_selfpid(),
2982 		    (get_bsdtask_info(current_task())
2983 		    ? proc_name_address(get_bsdtask_info(current_task()))
2984 		    : "?"),
2985 		    __FUNCTION__,
2986 		    cur_protection);
2987 		return KERN_PROTECTION_FAILURE;
2988 #endif
2989 	}
2990 
2991 	/*
2992 	 * If the task has requested executable lockdown,
2993 	 * deny any new executable mapping.
2994 	 */
2995 	if (map->map_disallow_new_exec == TRUE) {
2996 		if (cur_protection & VM_PROT_EXECUTE) {
2997 			return KERN_PROTECTION_FAILURE;
2998 		}
2999 	}
3000 
3001 	if (resilient_codesign) {
3002 		assert(!is_submap);
3003 		int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3004 		if ((cur_protection | max_protection) & reject_prot) {
3005 			return KERN_PROTECTION_FAILURE;
3006 		}
3007 	}
3008 
3009 	if (resilient_media) {
3010 		assert(!is_submap);
3011 //		assert(!needs_copy);
3012 		if (object != VM_OBJECT_NULL &&
3013 		    !object->internal) {
3014 			/*
3015 			 * This mapping is directly backed by an external
3016 			 * memory manager (e.g. a vnode pager for a file):
3017 			 * we would not have any safe place to inject
3018 			 * a zero-filled page if an actual page is not
3019 			 * available, without possibly impacting the actual
3020 			 * contents of the mapped object (e.g. the file),
3021 			 * so we can't provide any media resiliency here.
3022 			 */
3023 			return KERN_INVALID_ARGUMENT;
3024 		}
3025 	}
3026 
3027 	if (entry_for_tpro) {
3028 		/*
3029 		 * TPRO overrides the effective permissions of the region
3030 		 * and explicitly maps as RW. Ensure we have been passed
3031 		 * the expected permissions. We accept `cur_protections`
3032 		 * RO as that will be handled on fault.
3033 		 */
3034 		if (!(max_protection & VM_PROT_READ) ||
3035 		    !(max_protection & VM_PROT_WRITE) ||
3036 		    !(cur_protection & VM_PROT_READ)) {
3037 			return KERN_PROTECTION_FAILURE;
3038 		}
3039 
3040 		/*
3041 		 * We can now downgrade the cur_protection to RO. This is a mild lie
3042 		 * to the VM layer. But TPRO will be responsible for toggling the
3043 		 * protections between RO/RW
3044 		 */
3045 		cur_protection = VM_PROT_READ;
3046 	}
3047 
3048 	if (is_submap) {
3049 		vm_map_t submap;
3050 		if (purgable) {
3051 			/* submaps can not be purgeable */
3052 			return KERN_INVALID_ARGUMENT;
3053 		}
3054 		if (object == VM_OBJECT_NULL) {
3055 			/* submaps can not be created lazily */
3056 			return KERN_INVALID_ARGUMENT;
3057 		}
3058 		submap = (vm_map_t) object;
3059 		if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
3060 			/* page size mismatch */
3061 			return KERN_INVALID_ARGUMENT;
3062 		}
3063 	}
3064 	if (vmk_flags.vmkf_already) {
3065 		/*
3066 		 * VM_FLAGS_ALREADY says that it's OK if the same mapping
3067 		 * is already present.  For it to be meaningul, the requested
3068 		 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
3069 		 * we shouldn't try and remove what was mapped there first
3070 		 * (!VM_FLAGS_OVERWRITE).
3071 		 */
3072 		if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
3073 			return KERN_INVALID_ARGUMENT;
3074 		}
3075 	}
3076 
3077 	if (size == 0 ||
3078 	    (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
3079 		*address = 0;
3080 		return KERN_INVALID_ARGUMENT;
3081 	}
3082 
3083 	if (map->pmap == kernel_pmap) {
3084 		user_alias = VM_KERN_MEMORY_NONE;
3085 	} else {
3086 		user_alias = alias;
3087 	}
3088 
3089 	if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
3090 		chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
3091 	}
3092 
3093 #define RETURN(value)   { result = value; goto BailOut; }
3094 
3095 	assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
3096 	assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
3097 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
3098 		assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
3099 		assertf(page_aligned(size), "0x%llx", (uint64_t)size);
3100 	}
3101 
3102 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3103 	    !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
3104 		/*
3105 		 * In most cases, the caller rounds the size up to the
3106 		 * map's page size.
3107 		 * If we get a size that is explicitly not map-aligned here,
3108 		 * we'll have to respect the caller's wish and mark the
3109 		 * mapping as "not map-aligned" to avoid tripping the
3110 		 * map alignment checks later.
3111 		 */
3112 		clear_map_aligned = TRUE;
3113 	}
3114 	if (!anywhere &&
3115 	    VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3116 	    !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
3117 		/*
3118 		 * We've been asked to map at a fixed address and that
3119 		 * address is not aligned to the map's specific alignment.
3120 		 * The caller should know what it's doing (i.e. most likely
3121 		 * mapping some fragmented copy map, transferring memory from
3122 		 * a VM map with a different alignment), so clear map_aligned
3123 		 * for this new VM map entry and proceed.
3124 		 */
3125 		clear_map_aligned = TRUE;
3126 	}
3127 
3128 	/*
3129 	 * Only zero-fill objects are allowed to be purgable.
3130 	 * LP64todo - limit purgable objects to 32-bits for now
3131 	 */
3132 	if (purgable &&
3133 	    (offset != 0 ||
3134 	    (object != VM_OBJECT_NULL &&
3135 	    (object->vo_size != size ||
3136 	    object->purgable == VM_PURGABLE_DENY))
3137 #if __LP64__
3138 	    || size > ANON_MAX_SIZE
3139 #endif
3140 	    )) {
3141 		return KERN_INVALID_ARGUMENT;
3142 	}
3143 
3144 	if (__improbable(!vm_map_is_map_size_valid(
3145 		    map, size, vmk_flags.vmkf_no_soft_limit))) {
3146 		return KERN_NO_SPACE;
3147 	}
3148 
3149 	vm_map_lock(map);
3150 	map_locked = TRUE;
3151 
3152 	if (anywhere) {
3153 		result = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
3154 		    address, &entry);
3155 		start = *address;
3156 	} else {
3157 		start = *address;
3158 		result = vm_map_locate_space_fixed(map, start, size, mask,
3159 		    vmk_flags, &entry, &zap_old_list);
3160 	}
3161 
3162 	end = start + size;
3163 
3164 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
3165 
3166 	/*
3167 	 * Check if what's already there is what we want.
3168 	 */
3169 	if (result == KERN_MEMORY_PRESENT) {
3170 		assert(!anywhere);
3171 		if (!(vmk_flags.vmkf_already)) {
3172 			RETURN(KERN_NO_SPACE);
3173 		}
3174 		tmp_start = start;
3175 		tmp_offset = offset;
3176 		if (entry->vme_start < start) {
3177 			tmp_start -= start - entry->vme_start;
3178 			tmp_offset -= start - entry->vme_start;
3179 		}
3180 		for (; entry->vme_start < end;
3181 		    entry = entry->vme_next) {
3182 			/*
3183 			 * Check if the mapping's attributes
3184 			 * match the existing map entry.
3185 			 */
3186 			if (entry == vm_map_to_entry(map) ||
3187 			    entry->vme_start != tmp_start ||
3188 			    entry->is_sub_map != is_submap ||
3189 			    VME_OFFSET(entry) != tmp_offset ||
3190 			    entry->needs_copy != needs_copy ||
3191 			    entry->protection != cur_protection ||
3192 			    entry->max_protection != max_protection ||
3193 			    entry->inheritance != inheritance ||
3194 			    entry->iokit_acct != iokit_acct ||
3195 			    VME_ALIAS(entry) != alias) {
3196 				/* not the same mapping ! */
3197 				RETURN(KERN_NO_SPACE);
3198 			}
3199 			/*
3200 			 * Check if the same object is being mapped.
3201 			 */
3202 			if (is_submap) {
3203 				if (VME_SUBMAP(entry) !=
3204 				    (vm_map_t) object) {
3205 					/* not the same submap */
3206 					RETURN(KERN_NO_SPACE);
3207 				}
3208 			} else {
3209 				if (VME_OBJECT(entry) != object) {
3210 					/* not the same VM object... */
3211 					vm_object_t obj2;
3212 
3213 					obj2 = VME_OBJECT(entry);
3214 					if ((obj2 == VM_OBJECT_NULL || obj2->internal) &&
3215 					    (object == VM_OBJECT_NULL || object->internal)) {
3216 						/*
3217 						 * ... but both are
3218 						 * anonymous memory,
3219 						 * so equivalent.
3220 						 */
3221 					} else {
3222 						RETURN(KERN_NO_SPACE);
3223 					}
3224 				}
3225 			}
3226 
3227 			tmp_offset += entry->vme_end - entry->vme_start;
3228 			tmp_start += entry->vme_end - entry->vme_start;
3229 			if (entry->vme_end >= end) {
3230 				/* reached the end of our mapping */
3231 				break;
3232 			}
3233 		}
3234 		/* it all matches:  let's use what's already there ! */
3235 		RETURN(KERN_MEMORY_PRESENT);
3236 	}
3237 
3238 	if (result != KERN_SUCCESS) {
3239 		goto BailOut;
3240 	}
3241 
3242 
3243 	/*
3244 	 *	At this point,
3245 	 *		"start" and "end" should define the endpoints of the
3246 	 *			available new range, and
3247 	 *		"entry" should refer to the region before the new
3248 	 *			range, and
3249 	 *
3250 	 *		the map should be locked.
3251 	 */
3252 
3253 	/*
3254 	 *	See whether we can avoid creating a new entry (and object) by
3255 	 *	extending one of our neighbors.  [So far, we only attempt to
3256 	 *	extend from below.]  Note that we can never extend/join
3257 	 *	purgable objects because they need to remain distinct
3258 	 *	entities in order to implement their "volatile object"
3259 	 *	semantics.
3260 	 */
3261 
3262 	if (purgable ||
3263 	    entry_for_jit ||
3264 	    entry_for_tpro ||
3265 	    vm_memory_malloc_no_cow(user_alias)) {
3266 		if (superpage_size) {
3267 			/*
3268 			 * For "super page" allocations, we will allocate
3269 			 * special physically-contiguous VM objects later on,
3270 			 * so we should not have flags instructing us to create
3271 			 * a differently special VM object here.
3272 			 */
3273 			RETURN(KERN_INVALID_ARGUMENT);
3274 		}
3275 
3276 		if (object == VM_OBJECT_NULL) {
3277 			assert(!superpage_size);
3278 			object = vm_object_allocate(size);
3279 			vm_object_lock(object);
3280 			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3281 			VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
3282 			if (malloc_no_cow_except_fork &&
3283 			    !purgable &&
3284 			    !entry_for_jit &&
3285 			    !entry_for_tpro &&
3286 			    vm_memory_malloc_no_cow(user_alias)) {
3287 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
3288 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
3289 			}
3290 			if (entry_for_jit) {
3291 				object->vo_inherit_copy_none = true;
3292 			}
3293 			if (purgable) {
3294 				task_t owner;
3295 				VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE);
3296 				if (map->pmap == kernel_pmap) {
3297 					/*
3298 					 * Purgeable mappings made in a kernel
3299 					 * map are "owned" by the kernel itself
3300 					 * rather than the current user task
3301 					 * because they're likely to be used by
3302 					 * more than this user task (see
3303 					 * execargs_purgeable_allocate(), for
3304 					 * example).
3305 					 */
3306 					owner = kernel_task;
3307 				} else {
3308 					owner = current_task();
3309 				}
3310 				assert(object->vo_owner == NULL);
3311 				assert(object->resident_page_count == 0);
3312 				assert(object->wired_page_count == 0);
3313 				vm_purgeable_nonvolatile_enqueue(object, owner);
3314 			}
3315 			vm_object_unlock(object);
3316 			offset = (vm_object_offset_t)0;
3317 		}
3318 	} else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
3319 		/* no coalescing if address space uses sub-pages */
3320 	} else if ((is_submap == FALSE) &&
3321 	    (object == VM_OBJECT_NULL) &&
3322 	    (entry != vm_map_to_entry(map)) &&
3323 	    (entry->vme_end == start) &&
3324 	    (!entry->is_shared) &&
3325 	    (!entry->is_sub_map) &&
3326 	    (!entry->in_transition) &&
3327 	    (!entry->needs_wakeup) &&
3328 	    (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3329 	    (entry->protection == cur_protection) &&
3330 	    (entry->max_protection == max_protection) &&
3331 	    (entry->inheritance == inheritance) &&
3332 	    ((user_alias == VM_MEMORY_REALLOC) ||
3333 	    (VME_ALIAS(entry) == alias)) &&
3334 	    (entry->no_cache == no_cache) &&
3335 	    (entry->vme_permanent == permanent) &&
3336 	    /* no coalescing for immutable executable mappings */
3337 	    !((entry->protection & VM_PROT_EXECUTE) &&
3338 	    entry->vme_permanent) &&
3339 	    (!entry->superpage_size && !superpage_size) &&
3340 	    /*
3341 	     * No coalescing if not map-aligned, to avoid propagating
3342 	     * that condition any further than needed:
3343 	     */
3344 	    (!entry->map_aligned || !clear_map_aligned) &&
3345 	    (!entry->zero_wired_pages) &&
3346 	    (!entry->used_for_jit && !entry_for_jit) &&
3347 #if __arm64e__
3348 	    (!entry->used_for_tpro && !entry_for_tpro) &&
3349 #endif
3350 	    (!entry->csm_associated) &&
3351 	    (entry->iokit_acct == iokit_acct) &&
3352 	    (!entry->vme_resilient_codesign) &&
3353 	    (!entry->vme_resilient_media) &&
3354 	    (!entry->vme_atomic) &&
3355 	    (entry->vme_no_copy_on_read == no_copy_on_read) &&
3356 
3357 	    ((entry->vme_end - entry->vme_start) + size <=
3358 	    (user_alias == VM_MEMORY_REALLOC ?
3359 	    ANON_CHUNK_SIZE :
3360 	    NO_COALESCE_LIMIT)) &&
3361 
3362 	    (entry->wired_count == 0)) {        /* implies user_wired_count == 0 */
3363 		if (vm_object_coalesce(VME_OBJECT(entry),
3364 		    VM_OBJECT_NULL,
3365 		    VME_OFFSET(entry),
3366 		    (vm_object_offset_t) 0,
3367 		    (vm_map_size_t)(entry->vme_end - entry->vme_start),
3368 		    (vm_map_size_t)(end - entry->vme_end))) {
3369 			/*
3370 			 *	Coalesced the two objects - can extend
3371 			 *	the previous map entry to include the
3372 			 *	new range.
3373 			 */
3374 			map->size += (end - entry->vme_end);
3375 			assert(entry->vme_start < end);
3376 			assert(VM_MAP_PAGE_ALIGNED(end,
3377 			    VM_MAP_PAGE_MASK(map)));
3378 			if (__improbable(vm_debug_events)) {
3379 				DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3380 			}
3381 			entry->vme_end = end;
3382 			if (map->holelistenabled) {
3383 				vm_map_store_update_first_free(map, entry, TRUE);
3384 			} else {
3385 				vm_map_store_update_first_free(map, map->first_free, TRUE);
3386 			}
3387 			new_mapping_established = TRUE;
3388 			RETURN(KERN_SUCCESS);
3389 		}
3390 	}
3391 
3392 	step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3393 	new_entry = NULL;
3394 
3395 	if (vmk_flags.vmkf_submap_adjust) {
3396 		vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3397 		offset = start;
3398 	}
3399 
3400 	for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3401 		tmp2_end = tmp2_start + step;
3402 		/*
3403 		 *	Create a new entry
3404 		 *
3405 		 * XXX FBDP
3406 		 * The reserved "page zero" in each process's address space can
3407 		 * be arbitrarily large.  Splitting it into separate objects and
3408 		 * therefore different VM map entries serves no purpose and just
3409 		 * slows down operations on the VM map, so let's not split the
3410 		 * allocation into chunks if the max protection is NONE.  That
3411 		 * memory should never be accessible, so it will never get to the
3412 		 * default pager.
3413 		 */
3414 		tmp_start = tmp2_start;
3415 		if (!is_submap &&
3416 		    object == VM_OBJECT_NULL &&
3417 		    size > chunk_size &&
3418 		    max_protection != VM_PROT_NONE &&
3419 		    superpage_size == 0) {
3420 			tmp_end = tmp_start + chunk_size;
3421 		} else {
3422 			tmp_end = tmp2_end;
3423 		}
3424 		do {
3425 			if (!is_submap &&
3426 			    object != VM_OBJECT_NULL &&
3427 			    object->internal &&
3428 			    offset + (tmp_end - tmp_start) > object->vo_size) {
3429 //				printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3430 				DTRACE_VM5(vm_map_enter_overmap,
3431 				    vm_map_t, map,
3432 				    vm_map_address_t, tmp_start,
3433 				    vm_map_address_t, tmp_end,
3434 				    vm_object_offset_t, offset,
3435 				    vm_object_size_t, object->vo_size);
3436 			}
3437 			new_entry = vm_map_entry_insert(map,
3438 			    entry, tmp_start, tmp_end,
3439 			    object, offset, vmk_flags,
3440 			    needs_copy,
3441 			    cur_protection, max_protection,
3442 			    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3443 			    VM_INHERIT_NONE : inheritance),
3444 			    clear_map_aligned);
3445 
3446 			assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3447 
3448 			if (resilient_codesign) {
3449 				int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3450 				if (!((cur_protection | max_protection) & reject_prot)) {
3451 					new_entry->vme_resilient_codesign = TRUE;
3452 				}
3453 			}
3454 
3455 			if (resilient_media &&
3456 			    (object == VM_OBJECT_NULL ||
3457 			    object->internal)) {
3458 				new_entry->vme_resilient_media = TRUE;
3459 			}
3460 
3461 			assert(!new_entry->iokit_acct);
3462 			if (!is_submap &&
3463 			    object != VM_OBJECT_NULL &&
3464 			    object->internal &&
3465 			    (object->purgable != VM_PURGABLE_DENY ||
3466 			    object->vo_ledger_tag)) {
3467 				assert(new_entry->use_pmap);
3468 				assert(!new_entry->iokit_acct);
3469 				/*
3470 				 * Turn off pmap accounting since
3471 				 * purgeable (or tagged) objects have their
3472 				 * own ledgers.
3473 				 */
3474 				new_entry->use_pmap = FALSE;
3475 			} else if (!is_submap &&
3476 			    iokit_acct &&
3477 			    object != VM_OBJECT_NULL &&
3478 			    object->internal) {
3479 				/* alternate accounting */
3480 				assert(!new_entry->iokit_acct);
3481 				assert(new_entry->use_pmap);
3482 				new_entry->iokit_acct = TRUE;
3483 				new_entry->use_pmap = FALSE;
3484 				DTRACE_VM4(
3485 					vm_map_iokit_mapped_region,
3486 					vm_map_t, map,
3487 					vm_map_offset_t, new_entry->vme_start,
3488 					vm_map_offset_t, new_entry->vme_end,
3489 					int, VME_ALIAS(new_entry));
3490 				vm_map_iokit_mapped_region(
3491 					map,
3492 					(new_entry->vme_end -
3493 					new_entry->vme_start));
3494 			} else if (!is_submap) {
3495 				assert(!new_entry->iokit_acct);
3496 				assert(new_entry->use_pmap);
3497 			}
3498 
3499 			if (is_submap) {
3500 				vm_map_t        submap;
3501 				boolean_t       submap_is_64bit;
3502 				boolean_t       use_pmap;
3503 
3504 				assert(new_entry->is_sub_map);
3505 				assert(!new_entry->use_pmap);
3506 				assert(!new_entry->iokit_acct);
3507 				submap = (vm_map_t) object;
3508 				submap_is_64bit = vm_map_is_64bit(submap);
3509 				use_pmap = vmk_flags.vmkf_nested_pmap;
3510 #ifndef NO_NESTED_PMAP
3511 				if (use_pmap && submap->pmap == NULL) {
3512 					ledger_t ledger = map->pmap->ledger;
3513 					/* we need a sub pmap to nest... */
3514 					submap->pmap = pmap_create_options(ledger, 0,
3515 					    submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3516 					if (submap->pmap == NULL) {
3517 						/* let's proceed without nesting... */
3518 					}
3519 #if defined(__arm64__)
3520 					else {
3521 						pmap_set_nested(submap->pmap);
3522 					}
3523 #endif
3524 				}
3525 				if (use_pmap && submap->pmap != NULL) {
3526 					if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3527 						DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3528 						kr = KERN_FAILURE;
3529 					} else {
3530 						kr = pmap_nest(map->pmap,
3531 						    submap->pmap,
3532 						    tmp_start,
3533 						    tmp_end - tmp_start);
3534 					}
3535 					if (kr != KERN_SUCCESS) {
3536 						printf("vm_map_enter: "
3537 						    "pmap_nest(0x%llx,0x%llx) "
3538 						    "error 0x%x\n",
3539 						    (long long)tmp_start,
3540 						    (long long)tmp_end,
3541 						    kr);
3542 					} else {
3543 						/* we're now nested ! */
3544 						new_entry->use_pmap = TRUE;
3545 						pmap_empty = FALSE;
3546 					}
3547 				}
3548 #endif /* NO_NESTED_PMAP */
3549 			}
3550 			entry = new_entry;
3551 
3552 			if (superpage_size) {
3553 				vm_page_t pages, m;
3554 				vm_object_t sp_object;
3555 				vm_object_offset_t sp_offset;
3556 
3557 				assert(object == VM_OBJECT_NULL);
3558 				VME_OFFSET_SET(entry, 0);
3559 
3560 				/* allocate one superpage */
3561 				kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3562 				if (kr != KERN_SUCCESS) {
3563 					/* deallocate whole range... */
3564 					new_mapping_established = TRUE;
3565 					/* ... but only up to "tmp_end" */
3566 					size -= end - tmp_end;
3567 					RETURN(kr);
3568 				}
3569 
3570 				/* create one vm_object per superpage */
3571 				sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3572 				vm_object_lock(sp_object);
3573 				sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3574 				VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE);
3575 				sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3576 				VME_OBJECT_SET(entry, sp_object, false, 0);
3577 				assert(entry->use_pmap);
3578 
3579 				/* enter the base pages into the object */
3580 				for (sp_offset = 0;
3581 				    sp_offset < SUPERPAGE_SIZE;
3582 				    sp_offset += PAGE_SIZE) {
3583 					m = pages;
3584 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3585 					pages = NEXT_PAGE(m);
3586 					*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3587 					vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3588 				}
3589 				vm_object_unlock(sp_object);
3590 			}
3591 		} while (tmp_end != tmp2_end &&
3592 		    (tmp_start = tmp_end) &&
3593 		    (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3594 		    tmp_end + chunk_size : tmp2_end));
3595 	}
3596 
3597 	new_mapping_established = TRUE;
3598 
3599 
3600 BailOut:
3601 	assert(map_locked == TRUE);
3602 
3603 	/*
3604 	 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3605 	 * If we have identified and possibly established the new mapping(s),
3606 	 * make sure we did not go beyond the address space limit.
3607 	 */
3608 	if (result == KERN_SUCCESS) {
3609 		if (map->size_limit != RLIM_INFINITY &&
3610 		    map->size > map->size_limit) {
3611 			/*
3612 			 * Establishing the requested mappings would exceed
3613 			 * the process's RLIMIT_AS limit: fail with
3614 			 * KERN_NO_SPACE.
3615 			 */
3616 			result = KERN_NO_SPACE;
3617 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3618 			    proc_selfpid(),
3619 			    (get_bsdtask_info(current_task())
3620 			    ? proc_name_address(get_bsdtask_info(current_task()))
3621 			    : "?"),
3622 			    __FUNCTION__,
3623 			    (uint64_t) map->size,
3624 			    (uint64_t) map->size_limit);
3625 			DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3626 			    vm_map_size_t, map->size,
3627 			    uint64_t, map->size_limit);
3628 			vm_map_enter_RLIMIT_AS_count++;
3629 		} else if (map->data_limit != RLIM_INFINITY &&
3630 		    map->size > map->data_limit) {
3631 			/*
3632 			 * Establishing the requested mappings would exceed
3633 			 * the process's RLIMIT_DATA limit: fail with
3634 			 * KERN_NO_SPACE.
3635 			 */
3636 			result = KERN_NO_SPACE;
3637 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3638 			    proc_selfpid(),
3639 			    (get_bsdtask_info(current_task())
3640 			    ? proc_name_address(get_bsdtask_info(current_task()))
3641 			    : "?"),
3642 			    __FUNCTION__,
3643 			    (uint64_t) map->size,
3644 			    (uint64_t) map->data_limit);
3645 			DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3646 			    vm_map_size_t, map->size,
3647 			    uint64_t, map->data_limit);
3648 			vm_map_enter_RLIMIT_DATA_count++;
3649 		}
3650 	}
3651 
3652 	if (result == KERN_SUCCESS) {
3653 		vm_prot_t pager_prot;
3654 		memory_object_t pager;
3655 
3656 #if DEBUG
3657 		if (pmap_empty &&
3658 		    !(vmk_flags.vmkf_no_pmap_check)) {
3659 			assert(pmap_is_empty(map->pmap,
3660 			    *address,
3661 			    *address + size));
3662 		}
3663 #endif /* DEBUG */
3664 
3665 		/*
3666 		 * For "named" VM objects, let the pager know that the
3667 		 * memory object is being mapped.  Some pagers need to keep
3668 		 * track of this, to know when they can reclaim the memory
3669 		 * object, for example.
3670 		 * VM calls memory_object_map() for each mapping (specifying
3671 		 * the protection of each mapping) and calls
3672 		 * memory_object_last_unmap() when all the mappings are gone.
3673 		 */
3674 		pager_prot = max_protection;
3675 		if (needs_copy) {
3676 			/*
3677 			 * Copy-On-Write mapping: won't modify
3678 			 * the memory object.
3679 			 */
3680 			pager_prot &= ~VM_PROT_WRITE;
3681 		}
3682 		if (!is_submap &&
3683 		    object != VM_OBJECT_NULL &&
3684 		    object->named &&
3685 		    object->pager != MEMORY_OBJECT_NULL) {
3686 			vm_object_lock(object);
3687 			pager = object->pager;
3688 			if (object->named &&
3689 			    pager != MEMORY_OBJECT_NULL) {
3690 				assert(object->pager_ready);
3691 				vm_object_mapping_wait(object, THREAD_UNINT);
3692 				/* object might have lost its pager while waiting */
3693 				pager = object->pager;
3694 				if (object->named && pager != MEMORY_OBJECT_NULL) {
3695 					vm_object_mapping_begin(object);
3696 					vm_object_unlock(object);
3697 
3698 					kr = memory_object_map(pager, pager_prot);
3699 					assert(kr == KERN_SUCCESS);
3700 
3701 					vm_object_lock(object);
3702 					vm_object_mapping_end(object);
3703 				}
3704 			}
3705 			vm_object_unlock(object);
3706 		}
3707 	}
3708 
3709 	assert(map_locked == TRUE);
3710 
3711 	if (new_mapping_established) {
3712 		/*
3713 		 * If we release the map lock for any reason below,
3714 		 * another thread could deallocate our new mapping,
3715 		 * releasing the caller's reference on "caller_object",
3716 		 * which was transferred to the mapping.
3717 		 * If this was the only reference, the object could be
3718 		 * destroyed.
3719 		 *
3720 		 * We need to take an extra reference on "caller_object"
3721 		 * to keep it alive if we need to return the caller's
3722 		 * reference to the caller in case of failure.
3723 		 */
3724 		if (is_submap) {
3725 			vm_map_reference((vm_map_t)caller_object);
3726 		} else {
3727 			vm_object_reference(caller_object);
3728 		}
3729 	}
3730 
3731 	if (!keep_map_locked) {
3732 		vm_map_unlock(map);
3733 		map_locked = FALSE;
3734 		entry = VM_MAP_ENTRY_NULL;
3735 		new_entry = VM_MAP_ENTRY_NULL;
3736 	}
3737 
3738 	/*
3739 	 * We can't hold the map lock if we enter this block.
3740 	 */
3741 
3742 	if (result == KERN_SUCCESS) {
3743 		/*	Wire down the new entry if the user
3744 		 *	requested all new map entries be wired.
3745 		 */
3746 		if ((map->wiring_required) || (superpage_size)) {
3747 			assert(!keep_map_locked);
3748 			pmap_empty = FALSE; /* pmap won't be empty */
3749 			kr = vm_map_wire_nested(map, start, end,
3750 			    cur_protection, VM_KERN_MEMORY_MLOCK,
3751 			    TRUE, PMAP_NULL, 0, NULL);
3752 			result = kr;
3753 		}
3754 
3755 	}
3756 
3757 	if (result != KERN_SUCCESS) {
3758 		if (new_mapping_established) {
3759 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3760 
3761 			/*
3762 			 * We have to get rid of the new mappings since we
3763 			 * won't make them available to the user.
3764 			 * Try and do that atomically, to minimize the risk
3765 			 * that someone else create new mappings that range.
3766 			 */
3767 			if (!map_locked) {
3768 				vm_map_lock(map);
3769 				map_locked = TRUE;
3770 			}
3771 			remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3772 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3773 			if (permanent) {
3774 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3775 			}
3776 			(void) vm_map_delete(map,
3777 			    *address, *address + size,
3778 			    remove_flags,
3779 			    KMEM_GUARD_NONE, &zap_new_list);
3780 		}
3781 
3782 		if (vm_map_zap_first_entry(&zap_old_list)) {
3783 			vm_map_entry_t entry1, entry2;
3784 
3785 			/*
3786 			 * The new mapping failed.  Attempt to restore
3787 			 * the old mappings, saved in the "zap_old_map".
3788 			 */
3789 			if (!map_locked) {
3790 				vm_map_lock(map);
3791 				map_locked = TRUE;
3792 			}
3793 
3794 			/* first check if the coast is still clear */
3795 			start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3796 			end   = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3797 
3798 			if (vm_map_lookup_entry(map, start, &entry1) ||
3799 			    vm_map_lookup_entry(map, end, &entry2) ||
3800 			    entry1 != entry2) {
3801 				/*
3802 				 * Part of that range has already been
3803 				 * re-mapped:  we can't restore the old
3804 				 * mappings...
3805 				 */
3806 				vm_map_enter_restore_failures++;
3807 			} else {
3808 				/*
3809 				 * Transfer the saved map entries from
3810 				 * "zap_old_map" to the original "map",
3811 				 * inserting them all after "entry1".
3812 				 */
3813 				while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3814 					vm_map_size_t entry_size;
3815 
3816 					entry_size = (entry2->vme_end -
3817 					    entry2->vme_start);
3818 					vm_map_store_entry_link(map, entry1, entry2,
3819 					    VM_MAP_KERNEL_FLAGS_NONE);
3820 					map->size += entry_size;
3821 					entry1 = entry2;
3822 				}
3823 				if (map->wiring_required) {
3824 					/*
3825 					 * XXX TODO: we should rewire the
3826 					 * old pages here...
3827 					 */
3828 				}
3829 				vm_map_enter_restore_successes++;
3830 			}
3831 		}
3832 	}
3833 
3834 	/*
3835 	 * The caller is responsible for releasing the lock if it requested to
3836 	 * keep the map locked.
3837 	 */
3838 	if (map_locked && !keep_map_locked) {
3839 		vm_map_unlock(map);
3840 	}
3841 
3842 	vm_map_zap_dispose(&zap_old_list);
3843 	vm_map_zap_dispose(&zap_new_list);
3844 
3845 	if (new_mapping_established) {
3846 		/*
3847 		 * The caller had a reference on "caller_object" and we
3848 		 * transferred that reference to the mapping.
3849 		 * We also took an extra reference on "caller_object" to keep
3850 		 * it alive while the map was unlocked.
3851 		 */
3852 		if (result == KERN_SUCCESS) {
3853 			/*
3854 			 * On success, the caller's reference on the object gets
3855 			 * tranferred to the mapping.
3856 			 * Release our extra reference.
3857 			 */
3858 			if (is_submap) {
3859 				vm_map_deallocate((vm_map_t)caller_object);
3860 			} else {
3861 				vm_object_deallocate(caller_object);
3862 			}
3863 		} else {
3864 			/*
3865 			 * On error, the caller expects to still have a
3866 			 * reference on the object it gave us.
3867 			 * Let's use our extra reference for that.
3868 			 */
3869 		}
3870 	}
3871 
3872 	return result;
3873 
3874 #undef  RETURN
3875 }
3876 
3877 /*
3878  * Counters for the prefault optimization.
3879  */
3880 int64_t vm_prefault_nb_pages = 0;
3881 int64_t vm_prefault_nb_bailout = 0;
3882 
3883 static kern_return_t
vm_map_enter_adjust_offset(vm_object_offset_t * obj_offs,vm_object_offset_t * obj_end,vm_object_offset_t quantity)3884 vm_map_enter_adjust_offset(
3885 	vm_object_offset_t *obj_offs,
3886 	vm_object_offset_t *obj_end,
3887 	vm_object_offset_t  quantity)
3888 {
3889 	if (os_add_overflow(*obj_offs, quantity, obj_offs) ||
3890 	    os_add_overflow(*obj_end, quantity, obj_end) ||
3891 	    vm_map_round_page_mask(*obj_end, PAGE_MASK) == 0) {
3892 		return KERN_INVALID_ARGUMENT;
3893 	}
3894 
3895 	return KERN_SUCCESS;
3896 }
3897 
3898 static __attribute__((always_inline, warn_unused_result))
3899 kern_return_t
vm_map_enter_mem_object_sanitize(vm_map_t target_map,vm_map_offset_ut address_u,vm_map_size_ut initial_size_u,vm_map_offset_ut mask_u,vm_object_offset_ut offset_u,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_map_address_t * map_addr,vm_map_size_t * map_size,vm_map_offset_t * mask,vm_object_offset_t * obj_offs,vm_object_offset_t * obj_end,vm_object_size_t * obj_size,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t * inheritance)3900 vm_map_enter_mem_object_sanitize(
3901 	vm_map_t                target_map,
3902 	vm_map_offset_ut        address_u,
3903 	vm_map_size_ut          initial_size_u,
3904 	vm_map_offset_ut        mask_u,
3905 	vm_object_offset_ut     offset_u,
3906 	vm_prot_ut              cur_protection_u,
3907 	vm_prot_ut              max_protection_u,
3908 	vm_inherit_ut           inheritance_u,
3909 	vm_map_kernel_flags_t   vmk_flags,
3910 	ipc_port_t              port,
3911 	vm_map_address_t       *map_addr,
3912 	vm_map_size_t          *map_size,
3913 	vm_map_offset_t        *mask,
3914 	vm_object_offset_t     *obj_offs,
3915 	vm_object_offset_t     *obj_end,
3916 	vm_object_size_t       *obj_size,
3917 	vm_prot_t              *cur_protection,
3918 	vm_prot_t              *max_protection,
3919 	vm_inherit_t           *inheritance)
3920 {
3921 	kern_return_t           result;
3922 
3923 	result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
3924 	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3925 	    VM_PROT_IS_MASK, cur_protection,
3926 	    max_protection);
3927 	if (__improbable(result != KERN_SUCCESS)) {
3928 		return result;
3929 	}
3930 
3931 	result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3932 	    inheritance);
3933 	if (__improbable(result != KERN_SUCCESS)) {
3934 		return result;
3935 	}
3936 
3937 	result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ, mask);
3938 	if (__improbable(result != KERN_SUCCESS)) {
3939 		return result;
3940 	}
3941 
3942 	if (vmk_flags.vmf_fixed) {
3943 		vm_map_address_t        map_end;
3944 
3945 		result = vm_sanitize_addr_size(address_u, initial_size_u,
3946 		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3947 		    target_map,
3948 		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_REALIGN_START,
3949 		    map_addr, &map_end, map_size);
3950 		if (__improbable(result != KERN_SUCCESS)) {
3951 			return result;
3952 		}
3953 	} else {
3954 		*map_addr = vm_sanitize_addr(target_map, address_u);
3955 		result = vm_sanitize_size(0, initial_size_u,
3956 		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3957 		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
3958 		if (__improbable(result != KERN_SUCCESS)) {
3959 			return result;
3960 		}
3961 	}
3962 
3963 	*obj_size = vm_object_round_page(*map_size);
3964 	if (__improbable(*obj_size == 0)) {
3965 		return KERN_INVALID_ARGUMENT;
3966 	}
3967 
3968 	if (IP_VALID(port)) {
3969 		result = vm_sanitize_addr_size(offset_u, *obj_size,
3970 		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3971 		    PAGE_MASK,
3972 		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
3973 		    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
3974 		    obj_offs, obj_end, obj_size);
3975 		if (__improbable(result != KERN_SUCCESS)) {
3976 			return result;
3977 		}
3978 	} else {
3979 		*obj_offs = 0;
3980 		*obj_end  = *obj_size;
3981 	}
3982 
3983 	return KERN_SUCCESS;
3984 }
3985 
3986 kern_return_t
vm_map_enter_mem_object(vm_map_t target_map,vm_map_offset_ut * address_u,vm_map_size_ut initial_size_u,vm_map_offset_ut mask_u,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_ut offset_u,boolean_t copy,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,upl_page_list_ptr_t page_list,unsigned int page_list_count)3987 vm_map_enter_mem_object(
3988 	vm_map_t                target_map,
3989 	vm_map_offset_ut       *address_u,
3990 	vm_map_size_ut          initial_size_u,
3991 	vm_map_offset_ut        mask_u,
3992 	vm_map_kernel_flags_t   vmk_flags,
3993 	ipc_port_t              port,
3994 	vm_object_offset_ut     offset_u,
3995 	boolean_t               copy,
3996 	vm_prot_ut              cur_protection_u,
3997 	vm_prot_ut              max_protection_u,
3998 	vm_inherit_ut           inheritance_u,
3999 	upl_page_list_ptr_t     page_list,
4000 	unsigned int            page_list_count)
4001 {
4002 	vm_map_offset_t         mask;
4003 	vm_prot_t               cur_protection;
4004 	vm_prot_t               max_protection;
4005 	vm_inherit_t            inheritance;
4006 	vm_map_address_t        map_addr, map_mask;
4007 	vm_map_size_t           map_size;
4008 	vm_object_t             object = VM_OBJECT_NULL;
4009 	vm_object_offset_t      obj_offs, obj_end;
4010 	vm_object_size_t        obj_size;
4011 	kern_return_t           result;
4012 	boolean_t               mask_cur_protection, mask_max_protection;
4013 	boolean_t               kernel_prefault, try_prefault = (page_list_count != 0);
4014 	vm_map_offset_t         offset_in_mapping = 0;
4015 
4016 	if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4017 		/* XXX TODO4K prefaulting depends on page size... */
4018 		try_prefault = FALSE;
4019 	}
4020 
4021 	/*
4022 	 * Check arguments for validity
4023 	 */
4024 	if ((target_map == VM_MAP_NULL) ||
4025 	    (try_prefault && (copy || !page_list))) {
4026 		return KERN_INVALID_ARGUMENT;
4027 	}
4028 
4029 	map_mask = vm_map_page_mask(target_map);
4030 
4031 	/*
4032 	 * Sanitize any input parameters that are addr/size/prot/inherit
4033 	 */
4034 	result = vm_map_enter_mem_object_sanitize(
4035 		target_map,
4036 		*address_u,
4037 		initial_size_u,
4038 		mask_u,
4039 		offset_u,
4040 		cur_protection_u,
4041 		max_protection_u,
4042 		inheritance_u,
4043 		vmk_flags,
4044 		port,
4045 		&map_addr,
4046 		&map_size,
4047 		&mask,
4048 		&obj_offs,
4049 		&obj_end,
4050 		&obj_size,
4051 		&cur_protection,
4052 		&max_protection,
4053 		&inheritance);
4054 	if (__improbable(result != KERN_SUCCESS)) {
4055 		return vm_sanitize_get_kr(result);
4056 	}
4057 
4058 	assertf(vmk_flags.__vmkf_unused2 == 0, "vmk_flags unused2=0x%llx\n", vmk_flags.__vmkf_unused2);
4059 	vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, map_size);
4060 
4061 	mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4062 	mask_max_protection = max_protection & VM_PROT_IS_MASK;
4063 	cur_protection &= ~VM_PROT_IS_MASK;
4064 	max_protection &= ~VM_PROT_IS_MASK;
4065 
4066 #if __arm64__
4067 	if (cur_protection & VM_PROT_EXECUTE) {
4068 		cur_protection |= VM_PROT_READ;
4069 	}
4070 #endif /* __arm64__ */
4071 
4072 	/*
4073 	 * Find the vm object (if any) corresponding to this port.
4074 	 */
4075 	if (!IP_VALID(port)) {
4076 		object = VM_OBJECT_NULL;
4077 		copy = FALSE;
4078 	} else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4079 		vm_named_entry_t        named_entry;
4080 		vm_object_size_t        initial_size;
4081 
4082 		named_entry = mach_memory_entry_from_port(port);
4083 
4084 		if (vmk_flags.vmf_return_data_addr ||
4085 		    vmk_flags.vmf_return_4k_data_addr) {
4086 			result = vm_map_enter_adjust_offset(&obj_offs,
4087 			    &obj_end, named_entry->data_offset);
4088 			if (__improbable(result)) {
4089 				return result;
4090 			}
4091 		}
4092 
4093 		/* a few checks to make sure user is obeying rules */
4094 		if (mask_max_protection) {
4095 			max_protection &= named_entry->protection;
4096 		}
4097 		if (mask_cur_protection) {
4098 			cur_protection &= named_entry->protection;
4099 		}
4100 		if ((named_entry->protection & max_protection) !=
4101 		    max_protection) {
4102 			return KERN_INVALID_RIGHT;
4103 		}
4104 		if ((named_entry->protection & cur_protection) !=
4105 		    cur_protection) {
4106 			return KERN_INVALID_RIGHT;
4107 		}
4108 
4109 		/*
4110 		 * unwrap is safe because we know obj_size is larger and doesn't
4111 		 * overflow
4112 		 */
4113 		initial_size = VM_SANITIZE_UNSAFE_UNWRAP(initial_size_u);
4114 		if (named_entry->size < obj_offs + initial_size) {
4115 			return KERN_INVALID_ARGUMENT;
4116 		}
4117 
4118 		/* for a vm_map_copy, we can only map it whole */
4119 		if (named_entry->is_copy &&
4120 		    (obj_size != named_entry->size) &&
4121 		    (vm_map_round_page(obj_size, map_mask) == named_entry->size)) {
4122 			/* XXX FBDP use the rounded size... */
4123 			obj_end += named_entry->size - obj_size;
4124 			obj_size = named_entry->size;
4125 		}
4126 
4127 		if (named_entry->offset) {
4128 			/*
4129 			 * the callers parameter offset is defined to be the
4130 			 * offset from beginning of named entry offset in object
4131 			 *
4132 			 * Because we checked above that
4133 			 *   obj_offs + obj_size < named_entry_size
4134 			 * these overflow checks should be redundant...
4135 			 */
4136 			result = vm_map_enter_adjust_offset(&obj_offs,
4137 			    &obj_end, named_entry->offset);
4138 			if (__improbable(result)) {
4139 				return result;
4140 			}
4141 		}
4142 
4143 		if (!VM_MAP_PAGE_ALIGNED(obj_size, map_mask)) {
4144 			/*
4145 			 * Let's not map more than requested;
4146 			 * vm_map_enter() will handle this "not map-aligned"
4147 			 * case.
4148 			 */
4149 			map_size = obj_size;
4150 		}
4151 
4152 		named_entry_lock(named_entry);
4153 
4154 		// rdar://130307561 (Combine copy, object, and submap fields of vm_named_entry into an enum)
4155 		assert(named_entry->is_copy || named_entry->is_object || named_entry->is_sub_map);
4156 
4157 		if (named_entry->is_sub_map) {
4158 			vm_map_t                submap;
4159 
4160 			assert(!named_entry->is_copy);
4161 			assert(!named_entry->is_object);
4162 
4163 			if (vmk_flags.vmf_return_data_addr ||
4164 			    vmk_flags.vmf_return_4k_data_addr) {
4165 				panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4166 			}
4167 
4168 			submap = named_entry->backing.map;
4169 			vm_map_reference(submap);
4170 			named_entry_unlock(named_entry);
4171 
4172 			vmk_flags.vmkf_submap = TRUE;
4173 			result = vm_map_enter(target_map,
4174 			    &map_addr,
4175 			    map_size,
4176 			    mask,
4177 			    vmk_flags,
4178 			    (vm_object_t)(uintptr_t) submap,
4179 			    obj_offs,
4180 			    copy,
4181 			    cur_protection,
4182 			    max_protection,
4183 			    inheritance);
4184 			if (result != KERN_SUCCESS) {
4185 				vm_map_deallocate(submap);
4186 				return result;
4187 			}
4188 			/*
4189 			 * No need to lock "submap" just to check its
4190 			 * "mapped" flag: that flag is never reset
4191 			 * once it's been set and if we race, we'll
4192 			 * just end up setting it twice, which is OK.
4193 			 */
4194 			if (submap->mapped_in_other_pmaps == FALSE &&
4195 			    vm_map_pmap(submap) != PMAP_NULL &&
4196 			    vm_map_pmap(submap) !=
4197 			    vm_map_pmap(target_map)) {
4198 				/*
4199 				 * This submap is being mapped in a map
4200 				 * that uses a different pmap.
4201 				 * Set its "mapped_in_other_pmaps" flag
4202 				 * to indicate that we now need to
4203 				 * remove mappings from all pmaps rather
4204 				 * than just the submap's pmap.
4205 				 */
4206 				vm_map_lock(submap);
4207 				submap->mapped_in_other_pmaps = TRUE;
4208 				vm_map_unlock(submap);
4209 			}
4210 			goto out;
4211 		}
4212 
4213 		if (named_entry->is_copy) {
4214 			kern_return_t   kr;
4215 			vm_map_copy_t   copy_map;
4216 			vm_map_entry_t  copy_entry;
4217 			vm_map_offset_t copy_addr;
4218 			vm_map_copy_t   target_copy_map;
4219 			vm_map_offset_t overmap_start, overmap_end;
4220 			vm_map_offset_t trimmed_start;
4221 			vm_map_size_t   target_size;
4222 
4223 			assert(!named_entry->is_object);
4224 			assert(!named_entry->is_sub_map);
4225 
4226 			int allowed_flags = VM_FLAGS_FIXED |
4227 			    VM_FLAGS_ANYWHERE |
4228 			    VM_FLAGS_OVERWRITE |
4229 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4230 			    VM_FLAGS_RETURN_DATA_ADDR;
4231 
4232 			if (!vm_map_kernel_flags_check_vmflags(vmk_flags, allowed_flags)) {
4233 				named_entry_unlock(named_entry);
4234 				return KERN_INVALID_ARGUMENT;
4235 			}
4236 
4237 			copy_map = named_entry->backing.copy;
4238 			assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4239 			if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4240 				/* unsupported type; should not happen */
4241 				printf("vm_map_enter_mem_object: "
4242 				    "memory_entry->backing.copy "
4243 				    "unsupported type 0x%x\n",
4244 				    copy_map->type);
4245 				named_entry_unlock(named_entry);
4246 				return KERN_INVALID_ARGUMENT;
4247 			}
4248 
4249 			if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4250 				DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, obj_offs, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4251 			}
4252 
4253 			if (vmk_flags.vmf_return_data_addr ||
4254 			    vmk_flags.vmf_return_4k_data_addr) {
4255 				offset_in_mapping = obj_offs & map_mask;
4256 				if (vmk_flags.vmf_return_4k_data_addr) {
4257 					offset_in_mapping &= ~((signed)(0xFFF));
4258 				}
4259 			}
4260 
4261 			target_copy_map = VM_MAP_COPY_NULL;
4262 			target_size = copy_map->size;
4263 			overmap_start = 0;
4264 			overmap_end = 0;
4265 			trimmed_start = 0;
4266 			if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4267 				DEBUG4K_ADJUST("adjusting...\n");
4268 				kr = vm_map_copy_adjust_to_target(
4269 					copy_map,
4270 					obj_offs,
4271 					initial_size,
4272 					target_map,
4273 					copy,
4274 					&target_copy_map,
4275 					&overmap_start,
4276 					&overmap_end,
4277 					&trimmed_start);
4278 				if (kr != KERN_SUCCESS) {
4279 					named_entry_unlock(named_entry);
4280 					return kr;
4281 				}
4282 				target_size = target_copy_map->size;
4283 			} else {
4284 				/*
4285 				 * Assert that the vm_map_copy is coming from the right
4286 				 * zone and hasn't been forged
4287 				 */
4288 				vm_map_copy_require(copy_map);
4289 				target_copy_map = copy_map;
4290 			}
4291 
4292 			vm_map_kernel_flags_t rsv_flags = vmk_flags;
4293 
4294 			vm_map_kernel_flags_and_vmflags(&rsv_flags,
4295 			    (VM_FLAGS_FIXED |
4296 			    VM_FLAGS_ANYWHERE |
4297 			    VM_FLAGS_OVERWRITE |
4298 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4299 			    VM_FLAGS_RETURN_DATA_ADDR));
4300 
4301 			/* reserve a contiguous range */
4302 			kr = vm_map_enter(target_map,
4303 			    &map_addr,
4304 			    vm_map_round_page(target_size, map_mask),
4305 			    mask,
4306 			    rsv_flags,
4307 			    VM_OBJECT_NULL,
4308 			    0,
4309 			    FALSE,               /* copy */
4310 			    cur_protection,
4311 			    max_protection,
4312 			    inheritance);
4313 			if (kr != KERN_SUCCESS) {
4314 				DEBUG4K_ERROR("kr 0x%x\n", kr);
4315 				if (target_copy_map != copy_map) {
4316 					vm_map_copy_discard(target_copy_map);
4317 					target_copy_map = VM_MAP_COPY_NULL;
4318 				}
4319 				named_entry_unlock(named_entry);
4320 				return kr;
4321 			}
4322 
4323 			copy_addr = map_addr;
4324 
4325 			for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4326 			    copy_entry != vm_map_copy_to_entry(target_copy_map);
4327 			    copy_entry = copy_entry->vme_next) {
4328 				vm_map_t                copy_submap = VM_MAP_NULL;
4329 				vm_object_t             copy_object = VM_OBJECT_NULL;
4330 				vm_map_size_t           copy_size;
4331 				vm_object_offset_t      copy_offset;
4332 				boolean_t               do_copy = false;
4333 
4334 				if (copy_entry->is_sub_map) {
4335 					copy_submap = VME_SUBMAP(copy_entry);
4336 					copy_object = (vm_object_t)copy_submap;
4337 				} else {
4338 					copy_object = VME_OBJECT(copy_entry);
4339 				}
4340 				copy_offset = VME_OFFSET(copy_entry);
4341 				copy_size = (copy_entry->vme_end -
4342 				    copy_entry->vme_start);
4343 
4344 				/* sanity check */
4345 				if ((copy_addr + copy_size) >
4346 				    (map_addr +
4347 				    overmap_start + overmap_end +
4348 				    named_entry->size /* XXX full size */)) {
4349 					/* over-mapping too much !? */
4350 					kr = KERN_INVALID_ARGUMENT;
4351 					DEBUG4K_ERROR("kr 0x%x\n", kr);
4352 					/* abort */
4353 					break;
4354 				}
4355 
4356 				/* take a reference on the object */
4357 				if (copy_entry->is_sub_map) {
4358 					vm_map_reference(copy_submap);
4359 				} else {
4360 					if (!copy &&
4361 					    copy_object != VM_OBJECT_NULL &&
4362 					    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4363 						bool is_writable;
4364 
4365 						/*
4366 						 * We need to resolve our side of this
4367 						 * "symmetric" copy-on-write now; we
4368 						 * need a new object to map and share,
4369 						 * instead of the current one which
4370 						 * might still be shared with the
4371 						 * original mapping.
4372 						 *
4373 						 * Note: A "vm_map_copy_t" does not
4374 						 * have a lock but we're protected by
4375 						 * the named entry's lock here.
4376 						 */
4377 						// assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4378 						VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4379 						assert(copy_object != VME_OBJECT(copy_entry));
4380 						is_writable = false;
4381 						if (copy_entry->protection & VM_PROT_WRITE) {
4382 							is_writable = true;
4383 #if __arm64e__
4384 						} else if (copy_entry->used_for_tpro) {
4385 							is_writable = true;
4386 #endif /* __arm64e__ */
4387 						}
4388 						if (!copy_entry->needs_copy && is_writable) {
4389 							vm_prot_t prot;
4390 
4391 							prot = copy_entry->protection & ~VM_PROT_WRITE;
4392 							vm_object_pmap_protect(copy_object,
4393 							    copy_offset,
4394 							    copy_size,
4395 							    PMAP_NULL,
4396 							    PAGE_SIZE,
4397 							    0,
4398 							    prot);
4399 						}
4400 						copy_entry->needs_copy = FALSE;
4401 						copy_entry->is_shared = TRUE;
4402 						copy_object = VME_OBJECT(copy_entry);
4403 						copy_offset = VME_OFFSET(copy_entry);
4404 						vm_object_lock(copy_object);
4405 						/* we're about to make a shared mapping of this object */
4406 						copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4407 						VM_OBJECT_SET_TRUE_SHARE(copy_object, TRUE);
4408 						vm_object_unlock(copy_object);
4409 					}
4410 
4411 					if (copy_object != VM_OBJECT_NULL &&
4412 					    copy_object->named &&
4413 					    copy_object->pager != MEMORY_OBJECT_NULL &&
4414 					    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4415 						memory_object_t pager;
4416 						vm_prot_t       pager_prot;
4417 
4418 						/*
4419 						 * For "named" VM objects, let the pager know that the
4420 						 * memory object is being mapped.  Some pagers need to keep
4421 						 * track of this, to know when they can reclaim the memory
4422 						 * object, for example.
4423 						 * VM calls memory_object_map() for each mapping (specifying
4424 						 * the protection of each mapping) and calls
4425 						 * memory_object_last_unmap() when all the mappings are gone.
4426 						 */
4427 						pager_prot = max_protection;
4428 						if (copy) {
4429 							/*
4430 							 * Copy-On-Write mapping: won't modify the
4431 							 * memory object.
4432 							 */
4433 							pager_prot &= ~VM_PROT_WRITE;
4434 						}
4435 						vm_object_lock(copy_object);
4436 						pager = copy_object->pager;
4437 						if (copy_object->named &&
4438 						    pager != MEMORY_OBJECT_NULL &&
4439 						    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4440 							assert(copy_object->pager_ready);
4441 							vm_object_mapping_wait(copy_object, THREAD_UNINT);
4442 							/*
4443 							 * Object might have lost its pager
4444 							 * while waiting.
4445 							 */
4446 							pager = copy_object->pager;
4447 							if (copy_object->named &&
4448 							    pager != MEMORY_OBJECT_NULL) {
4449 								vm_object_mapping_begin(copy_object);
4450 								vm_object_unlock(copy_object);
4451 
4452 								kr = memory_object_map(pager, pager_prot);
4453 								assert(kr == KERN_SUCCESS);
4454 
4455 								vm_object_lock(copy_object);
4456 								vm_object_mapping_end(copy_object);
4457 							}
4458 						}
4459 						vm_object_unlock(copy_object);
4460 					}
4461 
4462 					/*
4463 					 *	Perform the copy if requested
4464 					 */
4465 
4466 					if (copy && copy_object != VM_OBJECT_NULL) {
4467 						vm_object_t             new_object;
4468 						vm_object_offset_t      new_offset;
4469 
4470 						result = vm_object_copy_strategically(copy_object, copy_offset,
4471 						    copy_size,
4472 						    false,                                   /* forking */
4473 						    &new_object, &new_offset,
4474 						    &do_copy);
4475 
4476 
4477 						if (result == KERN_MEMORY_RESTART_COPY) {
4478 							boolean_t success;
4479 							boolean_t src_needs_copy;
4480 
4481 							/*
4482 							 * XXX
4483 							 * We currently ignore src_needs_copy.
4484 							 * This really is the issue of how to make
4485 							 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4486 							 * non-kernel users to use. Solution forthcoming.
4487 							 * In the meantime, since we don't allow non-kernel
4488 							 * memory managers to specify symmetric copy,
4489 							 * we won't run into problems here.
4490 							 */
4491 							new_object = copy_object;
4492 							new_offset = copy_offset;
4493 							success = vm_object_copy_quickly(new_object,
4494 							    new_offset,
4495 							    copy_size,
4496 							    &src_needs_copy,
4497 							    &do_copy);
4498 							assert(success);
4499 							result = KERN_SUCCESS;
4500 						}
4501 						if (result != KERN_SUCCESS) {
4502 							kr = result;
4503 							break;
4504 						}
4505 
4506 						copy_object = new_object;
4507 						copy_offset = new_offset;
4508 						/*
4509 						 * No extra object reference for the mapping:
4510 						 * the mapping should be the only thing keeping
4511 						 * this new object alive.
4512 						 */
4513 					} else {
4514 						/*
4515 						 * We already have the right object
4516 						 * to map.
4517 						 */
4518 						copy_object = VME_OBJECT(copy_entry);
4519 						/* take an extra ref for the mapping below */
4520 						vm_object_reference(copy_object);
4521 					}
4522 				}
4523 
4524 				/*
4525 				 * If the caller does not want a specific
4526 				 * tag for this new mapping:  use
4527 				 * the tag of the original mapping.
4528 				 */
4529 				vm_map_kernel_flags_t vmk_remap_flags = {
4530 					.vmkf_submap = copy_entry->is_sub_map,
4531 				};
4532 
4533 				vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4534 				    vm_map_kernel_flags_vmflags(vmk_flags),
4535 				    vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4536 
4537 				/* over-map the object into destination */
4538 				vmk_remap_flags.vmf_fixed = true;
4539 				vmk_remap_flags.vmf_overwrite = true;
4540 
4541 				if (!copy && !copy_entry->is_sub_map) {
4542 					/*
4543 					 * copy-on-write should have been
4544 					 * resolved at this point, or we would
4545 					 * end up sharing instead of copying.
4546 					 */
4547 					assert(!copy_entry->needs_copy);
4548 				}
4549 #if XNU_TARGET_OS_OSX
4550 				if (copy_entry->used_for_jit) {
4551 					vmk_remap_flags.vmkf_map_jit = TRUE;
4552 				}
4553 #endif /* XNU_TARGET_OS_OSX */
4554 
4555 				kr = vm_map_enter(target_map,
4556 				    &copy_addr,
4557 				    copy_size,
4558 				    (vm_map_offset_t) 0,
4559 				    vmk_remap_flags,
4560 				    copy_object,
4561 				    copy_offset,
4562 				    ((copy_object == NULL)
4563 				    ? FALSE
4564 				    : (copy || copy_entry->needs_copy)),
4565 				    cur_protection,
4566 				    max_protection,
4567 				    inheritance);
4568 				if (kr != KERN_SUCCESS) {
4569 					DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4570 					if (copy_entry->is_sub_map) {
4571 						vm_map_deallocate(copy_submap);
4572 					} else {
4573 						vm_object_deallocate(copy_object);
4574 					}
4575 					/* abort */
4576 					break;
4577 				}
4578 
4579 				/* next mapping */
4580 				copy_addr += copy_size;
4581 			}
4582 
4583 			named_entry_unlock(named_entry);
4584 			if (target_copy_map != copy_map) {
4585 				vm_map_copy_discard(target_copy_map);
4586 				target_copy_map = VM_MAP_COPY_NULL;
4587 			}
4588 
4589 			if (kr == KERN_SUCCESS) {
4590 				if (overmap_start) {
4591 					DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t)offset_in_mapping, (uint64_t)overmap_start, (uint64_t)(map_addr + offset_in_mapping + overmap_start));
4592 				}
4593 				offset_in_mapping += overmap_start;
4594 			} else if (!vmk_flags.vmf_overwrite) {
4595 				/* deallocate the contiguous range */
4596 				vm_map_remove(target_map, map_addr,
4597 				    map_addr + map_size);
4598 			}
4599 			result = kr;
4600 			goto out;
4601 		}
4602 
4603 		if (named_entry->is_object) {
4604 			unsigned int    access;
4605 			unsigned int    wimg_mode;
4606 
4607 			assert(!named_entry->is_copy);
4608 			assert(!named_entry->is_sub_map);
4609 
4610 			/* we are mapping a VM object */
4611 
4612 			access = named_entry->access;
4613 
4614 			if (vmk_flags.vmf_return_data_addr ||
4615 			    vmk_flags.vmf_return_4k_data_addr) {
4616 				offset_in_mapping = obj_offs & map_mask;
4617 				if (vmk_flags.vmf_return_4k_data_addr) {
4618 					offset_in_mapping &= ~((signed)(0xFFF));
4619 				}
4620 				obj_offs -= offset_in_mapping;
4621 				map_size  = vm_map_round_page(initial_size +
4622 				    offset_in_mapping, map_mask);
4623 			}
4624 
4625 			object = vm_named_entry_to_vm_object(named_entry);
4626 			assert(object != VM_OBJECT_NULL);
4627 			vm_object_lock(object);
4628 			named_entry_unlock(named_entry);
4629 
4630 			wimg_mode = object->wimg_bits;
4631 			vm_prot_to_wimg(access, &wimg_mode);
4632 			if (object->wimg_bits != wimg_mode) {
4633 				vm_object_change_wimg_mode(object, wimg_mode);
4634 			}
4635 
4636 			vm_object_reference_locked(object);
4637 			vm_object_unlock(object);
4638 		} else {
4639 			panic("invalid VM named entry %p", named_entry);
4640 		}
4641 	} else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4642 		/*
4643 		 * JMM - This is temporary until we unify named entries
4644 		 * and raw memory objects.
4645 		 *
4646 		 * Detected fake ip_kotype for a memory object.  In
4647 		 * this case, the port isn't really a port at all, but
4648 		 * instead is just a raw memory object.
4649 		 */
4650 		if (vmk_flags.vmf_return_data_addr ||
4651 		    vmk_flags.vmf_return_4k_data_addr) {
4652 			panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4653 		}
4654 
4655 		object = memory_object_to_vm_object((memory_object_t)port);
4656 		if (object == VM_OBJECT_NULL) {
4657 			return KERN_INVALID_OBJECT;
4658 		}
4659 		vm_object_reference(object);
4660 
4661 		/* wait for object (if any) to be ready */
4662 		if (object != VM_OBJECT_NULL) {
4663 			if (is_kernel_object(object)) {
4664 				printf("Warning: Attempt to map kernel object"
4665 				    " by a non-private kernel entity\n");
4666 				return KERN_INVALID_OBJECT;
4667 			}
4668 			if (!object->pager_ready) {
4669 				vm_object_lock(object);
4670 
4671 				while (!object->pager_ready) {
4672 					vm_object_sleep(object,
4673 					    VM_OBJECT_EVENT_PAGER_READY,
4674 					    THREAD_UNINT,
4675 					    LCK_SLEEP_EXCLUSIVE);
4676 				}
4677 				vm_object_unlock(object);
4678 			}
4679 		}
4680 	} else {
4681 		return KERN_INVALID_OBJECT;
4682 	}
4683 
4684 	if (object != VM_OBJECT_NULL &&
4685 	    object->named &&
4686 	    object->pager != MEMORY_OBJECT_NULL &&
4687 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4688 		memory_object_t pager;
4689 		vm_prot_t       pager_prot;
4690 		kern_return_t   kr;
4691 
4692 		/*
4693 		 * For "named" VM objects, let the pager know that the
4694 		 * memory object is being mapped.  Some pagers need to keep
4695 		 * track of this, to know when they can reclaim the memory
4696 		 * object, for example.
4697 		 * VM calls memory_object_map() for each mapping (specifying
4698 		 * the protection of each mapping) and calls
4699 		 * memory_object_last_unmap() when all the mappings are gone.
4700 		 */
4701 		pager_prot = max_protection;
4702 		if (copy) {
4703 			/*
4704 			 * Copy-On-Write mapping: won't modify the
4705 			 * memory object.
4706 			 */
4707 			pager_prot &= ~VM_PROT_WRITE;
4708 		}
4709 		vm_object_lock(object);
4710 		pager = object->pager;
4711 		if (object->named &&
4712 		    pager != MEMORY_OBJECT_NULL &&
4713 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4714 			assert(object->pager_ready);
4715 			vm_object_mapping_wait(object, THREAD_UNINT);
4716 			/* object might have lost its pager while waiting */
4717 			pager = object->pager;
4718 			if (object->named && pager != MEMORY_OBJECT_NULL) {
4719 				vm_object_mapping_begin(object);
4720 				vm_object_unlock(object);
4721 
4722 				kr = memory_object_map(pager, pager_prot);
4723 				assert(kr == KERN_SUCCESS);
4724 
4725 				vm_object_lock(object);
4726 				vm_object_mapping_end(object);
4727 			}
4728 		}
4729 		vm_object_unlock(object);
4730 	}
4731 
4732 	/*
4733 	 *	Perform the copy if requested
4734 	 */
4735 
4736 	if (copy) {
4737 		vm_object_t             new_object;
4738 		vm_object_offset_t      new_offset;
4739 
4740 		result = vm_object_copy_strategically(object,
4741 		    obj_offs,
4742 		    map_size,
4743 		    false,                                   /* forking */
4744 		    &new_object, &new_offset,
4745 		    &copy);
4746 
4747 
4748 		if (result == KERN_MEMORY_RESTART_COPY) {
4749 			boolean_t success;
4750 			boolean_t src_needs_copy;
4751 
4752 			/*
4753 			 * XXX
4754 			 * We currently ignore src_needs_copy.
4755 			 * This really is the issue of how to make
4756 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4757 			 * non-kernel users to use. Solution forthcoming.
4758 			 * In the meantime, since we don't allow non-kernel
4759 			 * memory managers to specify symmetric copy,
4760 			 * we won't run into problems here.
4761 			 */
4762 			new_object = object;
4763 			new_offset = obj_offs;
4764 			success = vm_object_copy_quickly(new_object,
4765 			    new_offset,
4766 			    map_size,
4767 			    &src_needs_copy,
4768 			    &copy);
4769 			assert(success);
4770 			result = KERN_SUCCESS;
4771 		}
4772 		/*
4773 		 *	Throw away the reference to the
4774 		 *	original object, as it won't be mapped.
4775 		 */
4776 
4777 		vm_object_deallocate(object);
4778 
4779 		if (result != KERN_SUCCESS) {
4780 			return result;
4781 		}
4782 
4783 		object   = new_object;
4784 		obj_offs = new_offset;
4785 	}
4786 
4787 	/*
4788 	 * If non-kernel users want to try to prefault pages, the mapping and prefault
4789 	 * needs to be atomic.
4790 	 */
4791 	kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4792 	vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4793 
4794 	result = vm_map_enter(target_map,
4795 	    &map_addr, map_size,
4796 	    (vm_map_offset_t)mask,
4797 	    vmk_flags,
4798 	    object, obj_offs,
4799 	    copy,
4800 	    cur_protection, max_protection,
4801 	    inheritance);
4802 	if (result != KERN_SUCCESS) {
4803 		vm_object_deallocate(object);
4804 	}
4805 
4806 	/*
4807 	 * Try to prefault, and do not forget to release the vm map lock.
4808 	 */
4809 	if (result == KERN_SUCCESS && try_prefault) {
4810 		mach_vm_address_t va = map_addr;
4811 		kern_return_t kr = KERN_SUCCESS;
4812 		unsigned int i = 0;
4813 		int pmap_options;
4814 
4815 		pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4816 
4817 		for (i = 0; i < page_list_count; ++i) {
4818 			if (!UPL_VALID_PAGE(page_list, i)) {
4819 				if (kernel_prefault) {
4820 					assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4821 					result = KERN_MEMORY_ERROR;
4822 					break;
4823 				}
4824 			} else {
4825 				/*
4826 				 * If this function call failed, we should stop
4827 				 * trying to optimize, other calls are likely
4828 				 * going to fail too.
4829 				 *
4830 				 * We are not gonna report an error for such
4831 				 * failure though. That's an optimization, not
4832 				 * something critical.
4833 				 */
4834 				kr = pmap_enter_object_options_check(target_map->pmap,
4835 				    va, 0, object, UPL_PHYS_PAGE(page_list, i),
4836 				    cur_protection, VM_PROT_NONE,
4837 				    TRUE, pmap_options);
4838 				if (kr != KERN_SUCCESS) {
4839 					OSIncrementAtomic64(&vm_prefault_nb_bailout);
4840 					if (kernel_prefault) {
4841 						result = kr;
4842 					}
4843 					break;
4844 				}
4845 				OSIncrementAtomic64(&vm_prefault_nb_pages);
4846 			}
4847 
4848 			/* Next virtual address */
4849 			va += PAGE_SIZE;
4850 		}
4851 		if (vmk_flags.vmkf_keep_map_locked) {
4852 			vm_map_unlock(target_map);
4853 		}
4854 	}
4855 
4856 out:
4857 	if (result == KERN_SUCCESS) {
4858 #if KASAN
4859 		if (target_map->pmap == kernel_pmap) {
4860 			kasan_notify_address(map_addr, map_size);
4861 		}
4862 #endif
4863 		*address_u = vm_sanitize_wrap_addr(map_addr + offset_in_mapping);
4864 	}
4865 	return result;
4866 }
4867 
4868 kern_return_t
vm_map_enter_mem_object_prefault(vm_map_t target_map,vm_map_offset_ut * address,vm_map_size_ut initial_size,vm_map_offset_ut mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_ut offset,vm_prot_ut cur_protection,vm_prot_ut max_protection,upl_page_list_ptr_t page_list,unsigned int page_list_count)4869 vm_map_enter_mem_object_prefault(
4870 	vm_map_t                target_map,
4871 	vm_map_offset_ut       *address,
4872 	vm_map_size_ut          initial_size,
4873 	vm_map_offset_ut        mask,
4874 	vm_map_kernel_flags_t   vmk_flags,
4875 	ipc_port_t              port,
4876 	vm_object_offset_ut     offset,
4877 	vm_prot_ut              cur_protection,
4878 	vm_prot_ut              max_protection,
4879 	upl_page_list_ptr_t     page_list,
4880 	unsigned int            page_list_count)
4881 {
4882 	/* range_id is set by vm_map_enter_mem_object */
4883 	return vm_map_enter_mem_object(target_map,
4884 	           address,
4885 	           initial_size,
4886 	           mask,
4887 	           vmk_flags,
4888 	           port,
4889 	           offset,
4890 	           FALSE,
4891 	           cur_protection,
4892 	           max_protection,
4893 	           VM_INHERIT_DEFAULT,
4894 	           page_list,
4895 	           page_list_count);
4896 }
4897 
4898 static __attribute__((always_inline, warn_unused_result))
4899 kern_return_t
vm_map_enter_mem_object_control_sanitize(vm_map_t target_map,vm_map_offset_ut address_u,vm_map_size_ut initial_size_u,vm_map_offset_ut mask_u,vm_object_offset_ut offset_u,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,vm_map_kernel_flags_t vmk_flags,vm_map_address_t * map_addr,vm_map_size_t * map_size,vm_map_offset_t * mask,vm_object_offset_t * obj_offs,vm_object_offset_t * obj_end,vm_object_size_t * obj_size,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t * inheritance)4900 vm_map_enter_mem_object_control_sanitize(
4901 	vm_map_t                target_map,
4902 	vm_map_offset_ut        address_u,
4903 	vm_map_size_ut          initial_size_u,
4904 	vm_map_offset_ut        mask_u,
4905 	vm_object_offset_ut     offset_u,
4906 	vm_prot_ut              cur_protection_u,
4907 	vm_prot_ut              max_protection_u,
4908 	vm_inherit_ut           inheritance_u,
4909 	vm_map_kernel_flags_t   vmk_flags,
4910 	vm_map_address_t       *map_addr,
4911 	vm_map_size_t          *map_size,
4912 	vm_map_offset_t        *mask,
4913 	vm_object_offset_t     *obj_offs,
4914 	vm_object_offset_t     *obj_end,
4915 	vm_object_size_t       *obj_size,
4916 	vm_prot_t              *cur_protection,
4917 	vm_prot_t              *max_protection,
4918 	vm_inherit_t           *inheritance)
4919 {
4920 	kern_return_t           kr;
4921 
4922 	kr = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
4923 	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
4924 	    cur_protection, max_protection);
4925 	if (__improbable(kr != KERN_SUCCESS)) {
4926 		return kr;
4927 	}
4928 
4929 	kr = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL,
4930 	    inheritance);
4931 	if (__improbable(kr != KERN_SUCCESS)) {
4932 		return kr;
4933 	}
4934 
4935 	kr = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, mask);
4936 	if (__improbable(kr != KERN_SUCCESS)) {
4937 		return kr;
4938 	}
4939 	/*
4940 	 * Ensure arithmetic doesn't overflow in vm_object space (kernel
4941 	 * pages).
4942 	 * We keep unaligned values for now. The call we eventually make to
4943 	 * vm_map_enter does guarantee that offset_u is page aligned for EITHER
4944 	 * target_map pages or kernel pages. But this isn't enough to guarantee
4945 	 * kernel space alignment.
4946 	 */
4947 	kr = vm_sanitize_addr_size(offset_u, initial_size_u,
4948 	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, PAGE_MASK,
4949 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
4950 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
4951 	    obj_offs, obj_end, obj_size);
4952 	if (__improbable(kr != KERN_SUCCESS)) {
4953 		return kr;
4954 	}
4955 
4956 	/*
4957 	 * There is no vm_sanitize_addr_size variant that also adjusts for
4958 	 * a separate offset. Rather than create one for this one-off issue,
4959 	 * we sanitize map_addr and map_size individually, relying on
4960 	 * vm_sanitize_size to incorporate the offset. Then, we perform the
4961 	 * overflow check manually below.
4962 	 */
4963 	*map_addr = vm_sanitize_addr(target_map, address_u);
4964 	kr = vm_sanitize_size(offset_u, initial_size_u,
4965 	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
4966 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
4967 	if (__improbable(kr != KERN_SUCCESS)) {
4968 		return kr;
4969 	}
4970 
4971 	/*
4972 	 * Ensure arithmetic doesn't overflow in target_map space.
4973 	 * The computation of map_size above accounts for the possibility that
4974 	 * offset_u might be unaligned in target_map space.
4975 	 */
4976 	if (vmk_flags.vmf_fixed) {
4977 		vm_map_address_t map_end;
4978 
4979 		if (__improbable(os_add_overflow(*map_addr, *map_size, &map_end))) {
4980 			return KERN_INVALID_ARGUMENT;
4981 		}
4982 	}
4983 
4984 	return KERN_SUCCESS;
4985 }
4986 
4987 kern_return_t
vm_map_enter_mem_object_control(vm_map_t target_map,vm_map_offset_ut * address_u,vm_map_size_ut initial_size_u,vm_map_offset_ut mask_u,vm_map_kernel_flags_t vmk_flags,memory_object_control_t control,vm_object_offset_ut offset_u,boolean_t needs_copy,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u)4988 vm_map_enter_mem_object_control(
4989 	vm_map_t                target_map,
4990 	vm_map_offset_ut       *address_u,
4991 	vm_map_size_ut          initial_size_u,
4992 	vm_map_offset_ut        mask_u,
4993 	vm_map_kernel_flags_t   vmk_flags,
4994 	memory_object_control_t control,
4995 	vm_object_offset_ut     offset_u,
4996 	boolean_t               needs_copy,
4997 	vm_prot_ut              cur_protection_u,
4998 	vm_prot_ut              max_protection_u,
4999 	vm_inherit_ut           inheritance_u)
5000 {
5001 	vm_map_offset_t         mask;
5002 	vm_prot_t               cur_protection;
5003 	vm_prot_t               max_protection;
5004 	vm_inherit_t            inheritance;
5005 	vm_map_address_t        map_addr;
5006 	vm_map_size_t           map_size;
5007 	vm_object_t             object;
5008 	vm_object_offset_t      obj_offs, obj_end;
5009 	vm_object_size_t        obj_size;
5010 	kern_return_t           result;
5011 	memory_object_t         pager;
5012 	vm_prot_t               pager_prot;
5013 	kern_return_t           kr;
5014 
5015 	/*
5016 	 * Check arguments for validity
5017 	 */
5018 	if (target_map == VM_MAP_NULL) {
5019 		return KERN_INVALID_ARGUMENT;
5020 	}
5021 
5022 	/*
5023 	 * We only support vmf_return_data_addr-like behavior.
5024 	 */
5025 	vmk_flags.vmf_return_data_addr = true;
5026 
5027 	/*
5028 	 * Sanitize any input parameters that are addr/size/prot/inherit
5029 	 */
5030 	kr = vm_map_enter_mem_object_control_sanitize(target_map,
5031 	    *address_u,
5032 	    initial_size_u,
5033 	    mask_u,
5034 	    offset_u,
5035 	    cur_protection_u,
5036 	    max_protection_u,
5037 	    inheritance_u,
5038 	    vmk_flags,
5039 	    &map_addr,
5040 	    &map_size,
5041 	    &mask,
5042 	    &obj_offs,
5043 	    &obj_end,
5044 	    &obj_size,
5045 	    &cur_protection,
5046 	    &max_protection,
5047 	    &inheritance);
5048 	if (__improbable(kr != KERN_SUCCESS)) {
5049 		return vm_sanitize_get_kr(kr);
5050 	}
5051 
5052 	object = memory_object_control_to_vm_object(control);
5053 
5054 	if (object == VM_OBJECT_NULL) {
5055 		return KERN_INVALID_OBJECT;
5056 	}
5057 
5058 	if (is_kernel_object(object)) {
5059 		printf("Warning: Attempt to map kernel object"
5060 		    " by a non-private kernel entity\n");
5061 		return KERN_INVALID_OBJECT;
5062 	}
5063 
5064 	vm_object_lock(object);
5065 	os_ref_retain_locked_raw(&object->ref_count, &vm_object_refgrp);
5066 
5067 
5068 	/*
5069 	 * For "named" VM objects, let the pager know that the
5070 	 * memory object is being mapped.  Some pagers need to keep
5071 	 * track of this, to know when they can reclaim the memory
5072 	 * object, for example.
5073 	 * VM calls memory_object_map() for each mapping (specifying
5074 	 * the protection of each mapping) and calls
5075 	 * memory_object_last_unmap() when all the mappings are gone.
5076 	 */
5077 	pager_prot = max_protection;
5078 	if (needs_copy) {
5079 		pager_prot &= ~VM_PROT_WRITE;
5080 	}
5081 	pager = object->pager;
5082 	if (object->named &&
5083 	    pager != MEMORY_OBJECT_NULL &&
5084 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5085 		assert(object->pager_ready);
5086 		vm_object_mapping_wait(object, THREAD_UNINT);
5087 		/* object might have lost its pager while waiting */
5088 		pager = object->pager;
5089 		if (object->named && pager != MEMORY_OBJECT_NULL) {
5090 			vm_object_mapping_begin(object);
5091 			vm_object_unlock(object);
5092 
5093 			kr = memory_object_map(pager, pager_prot);
5094 			assert(kr == KERN_SUCCESS);
5095 
5096 			vm_object_lock(object);
5097 			vm_object_mapping_end(object);
5098 		}
5099 	}
5100 	vm_object_unlock(object);
5101 
5102 	/*
5103 	 *	Perform the copy if requested
5104 	 */
5105 
5106 	if (needs_copy) {
5107 		vm_object_t             new_object;
5108 		vm_object_offset_t      new_offset;
5109 
5110 		result = vm_object_copy_strategically(object, obj_offs, obj_size,
5111 		    false,                                   /* forking */
5112 		    &new_object, &new_offset,
5113 		    &needs_copy);
5114 
5115 
5116 		if (result == KERN_MEMORY_RESTART_COPY) {
5117 			boolean_t success;
5118 			boolean_t src_needs_copy;
5119 
5120 			/*
5121 			 * XXX
5122 			 * We currently ignore src_needs_copy.
5123 			 * This really is the issue of how to make
5124 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5125 			 * non-kernel users to use. Solution forthcoming.
5126 			 * In the meantime, since we don't allow non-kernel
5127 			 * memory managers to specify symmetric copy,
5128 			 * we won't run into problems here.
5129 			 */
5130 			new_object = object;
5131 			new_offset = obj_offs;
5132 			success = vm_object_copy_quickly(new_object,
5133 			    new_offset, obj_size,
5134 			    &src_needs_copy,
5135 			    &needs_copy);
5136 			assert(success);
5137 			result = KERN_SUCCESS;
5138 		}
5139 		/*
5140 		 *	Throw away the reference to the
5141 		 *	original object, as it won't be mapped.
5142 		 */
5143 
5144 		vm_object_deallocate(object);
5145 
5146 		if (result != KERN_SUCCESS) {
5147 			return result;
5148 		}
5149 
5150 		object   = new_object;
5151 		obj_offs = new_offset;
5152 	}
5153 
5154 	result = vm_map_enter(target_map,
5155 	    &map_addr, map_size,
5156 	    (vm_map_offset_t)mask,
5157 	    vmk_flags,
5158 	    object,
5159 	    obj_offs,
5160 	    needs_copy,
5161 	    cur_protection, max_protection,
5162 	    inheritance);
5163 
5164 	if (result == KERN_SUCCESS) {
5165 		*address_u = vm_sanitize_wrap_addr(
5166 			map_addr + (obj_offs & vm_map_page_mask(target_map)));
5167 	} else {
5168 		vm_object_deallocate(object);
5169 	}
5170 
5171 	return result;
5172 }
5173 
5174 
5175 /* Not used without nested pmaps */
5176 #ifndef NO_NESTED_PMAP
5177 /*
5178  * Clip and unnest a portion of a nested submap mapping.
5179  */
5180 
5181 
5182 static void
vm_map_clip_unnest(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t start_unnest,vm_map_offset_t end_unnest)5183 vm_map_clip_unnest(
5184 	vm_map_t        map,
5185 	vm_map_entry_t  entry,
5186 	vm_map_offset_t start_unnest,
5187 	vm_map_offset_t end_unnest)
5188 {
5189 	vm_map_offset_t old_start_unnest = start_unnest;
5190 	vm_map_offset_t old_end_unnest = end_unnest;
5191 
5192 	assert(entry->is_sub_map);
5193 	assert(VME_SUBMAP(entry) != NULL);
5194 	assert(entry->use_pmap);
5195 
5196 	/*
5197 	 * Query the platform for the optimal unnest range.
5198 	 * DRK: There's some duplication of effort here, since
5199 	 * callers may have adjusted the range to some extent. This
5200 	 * routine was introduced to support 1GiB subtree nesting
5201 	 * for x86 platforms, which can also nest on 2MiB boundaries
5202 	 * depending on size/alignment.
5203 	 */
5204 	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
5205 		assert(VME_SUBMAP(entry)->is_nested_map);
5206 		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
5207 		log_unnest_badness(map,
5208 		    old_start_unnest,
5209 		    old_end_unnest,
5210 		    VME_SUBMAP(entry)->is_nested_map,
5211 		    (entry->vme_start +
5212 		    VME_SUBMAP(entry)->lowest_unnestable_start -
5213 		    VME_OFFSET(entry)));
5214 	}
5215 
5216 	if (entry->vme_start > start_unnest ||
5217 	    entry->vme_end < end_unnest) {
5218 		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
5219 		    "bad nested entry: start=0x%llx end=0x%llx\n",
5220 		    (long long)start_unnest, (long long)end_unnest,
5221 		    (long long)entry->vme_start, (long long)entry->vme_end);
5222 	}
5223 
5224 	if (start_unnest > entry->vme_start) {
5225 		_vm_map_clip_start(&map->hdr,
5226 		    entry,
5227 		    start_unnest);
5228 		if (map->holelistenabled) {
5229 			vm_map_store_update_first_free(map, NULL, FALSE);
5230 		} else {
5231 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5232 		}
5233 	}
5234 	if (entry->vme_end > end_unnest) {
5235 		_vm_map_clip_end(&map->hdr,
5236 		    entry,
5237 		    end_unnest);
5238 		if (map->holelistenabled) {
5239 			vm_map_store_update_first_free(map, NULL, FALSE);
5240 		} else {
5241 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5242 		}
5243 	}
5244 
5245 	pmap_unnest(map->pmap,
5246 	    entry->vme_start,
5247 	    entry->vme_end - entry->vme_start);
5248 	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
5249 		/* clean up parent map/maps */
5250 		vm_map_submap_pmap_clean(
5251 			map, entry->vme_start,
5252 			entry->vme_end,
5253 			VME_SUBMAP(entry),
5254 			VME_OFFSET(entry));
5255 	}
5256 	entry->use_pmap = FALSE;
5257 	if ((map->pmap != kernel_pmap) &&
5258 	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
5259 		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
5260 	}
5261 }
5262 #endif  /* NO_NESTED_PMAP */
5263 
5264 __abortlike
5265 static void
__vm_map_clip_atomic_entry_panic(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t where)5266 __vm_map_clip_atomic_entry_panic(
5267 	vm_map_t        map,
5268 	vm_map_entry_t  entry,
5269 	vm_map_offset_t where)
5270 {
5271 	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5272 	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5273 	    (uint64_t)entry->vme_start,
5274 	    (uint64_t)entry->vme_end,
5275 	    (uint64_t)where);
5276 }
5277 
5278 /*
5279  *	vm_map_clip_start:	[ internal use only ]
5280  *
5281  *	Asserts that the given entry begins at or after
5282  *	the specified address; if necessary,
5283  *	it splits the entry into two.
5284  */
5285 void
vm_map_clip_start(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t startaddr)5286 vm_map_clip_start(
5287 	vm_map_t        map,
5288 	vm_map_entry_t  entry,
5289 	vm_map_offset_t startaddr)
5290 {
5291 #ifndef NO_NESTED_PMAP
5292 	if (entry->is_sub_map &&
5293 	    entry->use_pmap &&
5294 	    startaddr >= entry->vme_start) {
5295 		vm_map_offset_t start_unnest, end_unnest;
5296 
5297 		/*
5298 		 * Make sure "startaddr" is no longer in a nested range
5299 		 * before we clip.  Unnest only the minimum range the platform
5300 		 * can handle.
5301 		 * vm_map_clip_unnest may perform additional adjustments to
5302 		 * the unnest range.
5303 		 */
5304 		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5305 		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5306 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5307 	}
5308 #endif /* NO_NESTED_PMAP */
5309 	if (startaddr > entry->vme_start) {
5310 		if (!entry->is_sub_map &&
5311 		    VME_OBJECT(entry) &&
5312 		    VME_OBJECT(entry)->phys_contiguous) {
5313 			pmap_remove(map->pmap,
5314 			    (addr64_t)(entry->vme_start),
5315 			    (addr64_t)(entry->vme_end));
5316 		}
5317 		if (entry->vme_atomic) {
5318 			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5319 		}
5320 
5321 		DTRACE_VM5(
5322 			vm_map_clip_start,
5323 			vm_map_t, map,
5324 			vm_map_offset_t, entry->vme_start,
5325 			vm_map_offset_t, entry->vme_end,
5326 			vm_map_offset_t, startaddr,
5327 			int, VME_ALIAS(entry));
5328 
5329 		_vm_map_clip_start(&map->hdr, entry, startaddr);
5330 		if (map->holelistenabled) {
5331 			vm_map_store_update_first_free(map, NULL, FALSE);
5332 		} else {
5333 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5334 		}
5335 	}
5336 }
5337 
5338 
5339 #define vm_map_copy_clip_start(copy, entry, startaddr) \
5340 	MACRO_BEGIN \
5341 	if ((startaddr) > (entry)->vme_start) \
5342 	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5343 	MACRO_END
5344 
5345 /*
5346  *	This routine is called only when it is known that
5347  *	the entry must be split.
5348  */
5349 static void
_vm_map_clip_start(struct vm_map_header * map_header,vm_map_entry_t entry,vm_map_offset_t start)5350 _vm_map_clip_start(
5351 	struct vm_map_header    *map_header,
5352 	vm_map_entry_t          entry,
5353 	vm_map_offset_t         start)
5354 {
5355 	vm_map_entry_t  new_entry;
5356 
5357 	/*
5358 	 *	Split off the front portion --
5359 	 *	note that we must insert the new
5360 	 *	entry BEFORE this one, so that
5361 	 *	this entry has the specified starting
5362 	 *	address.
5363 	 */
5364 
5365 	if (entry->map_aligned) {
5366 		assert(VM_MAP_PAGE_ALIGNED(start,
5367 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5368 	}
5369 
5370 	new_entry = _vm_map_entry_create(map_header);
5371 	vm_map_entry_copy_full(new_entry, entry);
5372 
5373 	new_entry->vme_end = start;
5374 	assert(new_entry->vme_start < new_entry->vme_end);
5375 	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5376 	if (__improbable(start >= entry->vme_end)) {
5377 		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
5378 	}
5379 	assert(start < entry->vme_end);
5380 	entry->vme_start = start;
5381 
5382 #if VM_BTLOG_TAGS
5383 	if (new_entry->vme_kernel_object) {
5384 		btref_retain(new_entry->vme_tag_btref);
5385 	}
5386 #endif /* VM_BTLOG_TAGS */
5387 
5388 	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5389 
5390 	if (entry->is_sub_map) {
5391 		vm_map_reference(VME_SUBMAP(new_entry));
5392 	} else {
5393 		vm_object_reference(VME_OBJECT(new_entry));
5394 	}
5395 }
5396 
5397 
5398 /*
5399  *	vm_map_clip_end:	[ internal use only ]
5400  *
5401  *	Asserts that the given entry ends at or before
5402  *	the specified address; if necessary,
5403  *	it splits the entry into two.
5404  */
5405 void
vm_map_clip_end(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t endaddr)5406 vm_map_clip_end(
5407 	vm_map_t        map,
5408 	vm_map_entry_t  entry,
5409 	vm_map_offset_t endaddr)
5410 {
5411 	if (endaddr > entry->vme_end) {
5412 		/*
5413 		 * Within the scope of this clipping, limit "endaddr" to
5414 		 * the end of this map entry...
5415 		 */
5416 		endaddr = entry->vme_end;
5417 	}
5418 #ifndef NO_NESTED_PMAP
5419 	if (entry->is_sub_map && entry->use_pmap) {
5420 		vm_map_offset_t start_unnest, end_unnest;
5421 
5422 		/*
5423 		 * Make sure the range between the start of this entry and
5424 		 * the new "endaddr" is no longer nested before we clip.
5425 		 * Unnest only the minimum range the platform can handle.
5426 		 * vm_map_clip_unnest may perform additional adjustments to
5427 		 * the unnest range.
5428 		 */
5429 		start_unnest = entry->vme_start;
5430 		end_unnest =
5431 		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5432 		    ~(pmap_shared_region_size_min(map->pmap) - 1);
5433 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5434 	}
5435 #endif /* NO_NESTED_PMAP */
5436 	if (endaddr < entry->vme_end) {
5437 		if (!entry->is_sub_map &&
5438 		    VME_OBJECT(entry) &&
5439 		    VME_OBJECT(entry)->phys_contiguous) {
5440 			pmap_remove(map->pmap,
5441 			    (addr64_t)(entry->vme_start),
5442 			    (addr64_t)(entry->vme_end));
5443 		}
5444 		if (entry->vme_atomic) {
5445 			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5446 		}
5447 		DTRACE_VM5(
5448 			vm_map_clip_end,
5449 			vm_map_t, map,
5450 			vm_map_offset_t, entry->vme_start,
5451 			vm_map_offset_t, entry->vme_end,
5452 			vm_map_offset_t, endaddr,
5453 			int, VME_ALIAS(entry));
5454 
5455 		_vm_map_clip_end(&map->hdr, entry, endaddr);
5456 		if (map->holelistenabled) {
5457 			vm_map_store_update_first_free(map, NULL, FALSE);
5458 		} else {
5459 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5460 		}
5461 	}
5462 }
5463 
5464 
5465 #define vm_map_copy_clip_end(copy, entry, endaddr) \
5466 	MACRO_BEGIN \
5467 	if ((endaddr) < (entry)->vme_end) \
5468 	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5469 	MACRO_END
5470 
5471 /*
5472  *	This routine is called only when it is known that
5473  *	the entry must be split.
5474  */
5475 static void
_vm_map_clip_end(struct vm_map_header * map_header,vm_map_entry_t entry,vm_map_offset_t end)5476 _vm_map_clip_end(
5477 	struct vm_map_header    *map_header,
5478 	vm_map_entry_t          entry,
5479 	vm_map_offset_t         end)
5480 {
5481 	vm_map_entry_t  new_entry;
5482 
5483 	/*
5484 	 *	Create a new entry and insert it
5485 	 *	AFTER the specified entry
5486 	 */
5487 
5488 	if (entry->map_aligned) {
5489 		assert(VM_MAP_PAGE_ALIGNED(end,
5490 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5491 	}
5492 
5493 	new_entry = _vm_map_entry_create(map_header);
5494 	vm_map_entry_copy_full(new_entry, entry);
5495 
5496 	if (__improbable(end <= entry->vme_start)) {
5497 		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
5498 	}
5499 	assert(entry->vme_start < end);
5500 	new_entry->vme_start = entry->vme_end = end;
5501 	VME_OFFSET_SET(new_entry,
5502 	    VME_OFFSET(new_entry) + (end - entry->vme_start));
5503 	assert(new_entry->vme_start < new_entry->vme_end);
5504 
5505 #if VM_BTLOG_TAGS
5506 	if (new_entry->vme_kernel_object) {
5507 		btref_retain(new_entry->vme_tag_btref);
5508 	}
5509 #endif /* VM_BTLOG_TAGS */
5510 
5511 	_vm_map_store_entry_link(map_header, entry, new_entry);
5512 
5513 	if (entry->is_sub_map) {
5514 		vm_map_reference(VME_SUBMAP(new_entry));
5515 	} else {
5516 		vm_object_reference(VME_OBJECT(new_entry));
5517 	}
5518 }
5519 
5520 
5521 /*
5522  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
5523  *
5524  *	Asserts that the starting and ending region
5525  *	addresses fall within the valid range of the map.
5526  */
5527 #define VM_MAP_RANGE_CHECK(map, start, end)     \
5528 	MACRO_BEGIN                             \
5529 	if (start < vm_map_min(map))            \
5530 	        start = vm_map_min(map);        \
5531 	if (end > vm_map_max(map))              \
5532 	        end = vm_map_max(map);          \
5533 	if (start > end)                        \
5534 	        start = end;                    \
5535 	MACRO_END
5536 
5537 /*
5538  *	vm_map_range_check:	[ internal use only ]
5539  *
5540  *	Check that the region defined by the specified start and
5541  *	end addresses are wholly contained within a single map
5542  *	entry or set of adjacent map entries of the spacified map,
5543  *	i.e. the specified region contains no unmapped space.
5544  *	If any or all of the region is unmapped, FALSE is returned.
5545  *	Otherwise, TRUE is returned and if the output argument 'entry'
5546  *	is not NULL it points to the map entry containing the start
5547  *	of the region.
5548  *
5549  *	The map is locked for reading on entry and is left locked.
5550  */
5551 static boolean_t
vm_map_range_check(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t * entry)5552 vm_map_range_check(
5553 	vm_map_t                map,
5554 	vm_map_offset_t         start,
5555 	vm_map_offset_t         end,
5556 	vm_map_entry_t          *entry)
5557 {
5558 	vm_map_entry_t          cur;
5559 	vm_map_offset_t         prev;
5560 
5561 	/*
5562 	 *      Basic sanity checks first
5563 	 */
5564 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5565 		return FALSE;
5566 	}
5567 
5568 	/*
5569 	 *      Check first if the region starts within a valid
5570 	 *	mapping for the map.
5571 	 */
5572 	if (!vm_map_lookup_entry(map, start, &cur)) {
5573 		return FALSE;
5574 	}
5575 
5576 	/*
5577 	 *	Optimize for the case that the region is contained
5578 	 *	in a single map entry.
5579 	 */
5580 	if (entry != (vm_map_entry_t *) NULL) {
5581 		*entry = cur;
5582 	}
5583 	if (end <= cur->vme_end) {
5584 		return TRUE;
5585 	}
5586 
5587 	/*
5588 	 *      If the region is not wholly contained within a
5589 	 *      single entry, walk the entries looking for holes.
5590 	 */
5591 	prev = cur->vme_end;
5592 	cur = cur->vme_next;
5593 	while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5594 		if (end <= cur->vme_end) {
5595 			return TRUE;
5596 		}
5597 		prev = cur->vme_end;
5598 		cur = cur->vme_next;
5599 	}
5600 	return FALSE;
5601 }
5602 
5603 static __attribute__((always_inline, warn_unused_result))
5604 kern_return_t
vm_map_protect_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut new_prot_u,vm_map_offset_t * start,vm_map_offset_t * end,vm_prot_t * new_prot)5605 vm_map_protect_sanitize(
5606 	vm_map_t                map,
5607 	vm_map_offset_ut        start_u,
5608 	vm_map_offset_ut        end_u,
5609 	vm_prot_ut              new_prot_u,
5610 	vm_map_offset_t        *start,
5611 	vm_map_offset_t        *end,
5612 	vm_prot_t              *new_prot)
5613 {
5614 	kern_return_t           kr;
5615 	vm_map_size_t           size;
5616 
5617 	kr = vm_sanitize_prot(new_prot_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5618 	    map, VM_PROT_COPY, new_prot);
5619 	if (__improbable(kr != KERN_SUCCESS)) {
5620 		return kr;
5621 	}
5622 
5623 	kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5624 	    map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size);
5625 	if (__improbable(kr != KERN_SUCCESS)) {
5626 		return kr;
5627 	}
5628 
5629 	return KERN_SUCCESS;
5630 }
5631 
5632 /*
5633  *	vm_map_protect:
5634  *
5635  *	Sets the protection of the specified address
5636  *	region in the target map.  If "set_max" is
5637  *	specified, the maximum protection is to be set;
5638  *	otherwise, only the current protection is affected.
5639  */
5640 kern_return_t
vm_map_protect(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,boolean_t set_max,vm_prot_ut new_prot_u)5641 vm_map_protect(
5642 	vm_map_t                map,
5643 	vm_map_offset_ut        start_u,
5644 	vm_map_offset_ut        end_u,
5645 	boolean_t               set_max,
5646 	vm_prot_ut              new_prot_u)
5647 {
5648 	vm_map_entry_t                  current;
5649 	vm_map_offset_t                 prev;
5650 	vm_map_entry_t                  entry;
5651 	vm_prot_t                       new_prot;
5652 	vm_prot_t                       new_max;
5653 	int                             pmap_options = 0;
5654 	kern_return_t                   kr;
5655 	vm_map_offset_t                 start, original_start;
5656 	vm_map_offset_t                 end;
5657 
5658 	kr = vm_map_protect_sanitize(map,
5659 	    start_u,
5660 	    end_u,
5661 	    new_prot_u,
5662 	    &start,
5663 	    &end,
5664 	    &new_prot);
5665 	if (__improbable(kr != KERN_SUCCESS)) {
5666 		return vm_sanitize_get_kr(kr);
5667 	}
5668 	original_start = start;
5669 
5670 	if (new_prot & VM_PROT_COPY) {
5671 		vm_map_offset_t         new_start;
5672 		vm_prot_t               cur_prot, max_prot;
5673 		vm_map_kernel_flags_t   kflags;
5674 
5675 		/* LP64todo - see below */
5676 		if (start >= map->max_offset) {
5677 			return KERN_INVALID_ADDRESS;
5678 		}
5679 
5680 		if ((new_prot & VM_PROT_ALLEXEC) &&
5681 		    map->pmap != kernel_pmap &&
5682 		    (vm_map_cs_enforcement(map)
5683 #if XNU_TARGET_OS_OSX && __arm64__
5684 		    || !VM_MAP_IS_EXOTIC(map)
5685 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
5686 		    ) &&
5687 		    VM_MAP_POLICY_WX_FAIL(map)) {
5688 			DTRACE_VM3(cs_wx,
5689 			    uint64_t, (uint64_t) start,
5690 			    uint64_t, (uint64_t) end,
5691 			    vm_prot_t, new_prot);
5692 			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5693 			    proc_selfpid(),
5694 			    (get_bsdtask_info(current_task())
5695 			    ? proc_name_address(get_bsdtask_info(current_task()))
5696 			    : "?"),
5697 			    __FUNCTION__, __LINE__,
5698 #if DEVELOPMENT || DEBUG
5699 			    (uint64_t)start,
5700 			    (uint64_t)end,
5701 #else /* DEVELOPMENT || DEBUG */
5702 			    (uint64_t)0,
5703 			    (uint64_t)0,
5704 #endif /* DEVELOPMENT || DEBUG */
5705 			    new_prot);
5706 			return KERN_PROTECTION_FAILURE;
5707 		}
5708 
5709 		/*
5710 		 * Let vm_map_remap_extract() know that it will need to:
5711 		 * + make a copy of the mapping
5712 		 * + add VM_PROT_WRITE to the max protections
5713 		 * + remove any protections that are no longer allowed from the
5714 		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
5715 		 *   example).
5716 		 * Note that "max_prot" is an IN/OUT parameter only for this
5717 		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
5718 		 * only.
5719 		 */
5720 		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
5721 		cur_prot = VM_PROT_NONE;
5722 		kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
5723 		kflags.vmkf_remap_prot_copy = true;
5724 		kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
5725 		new_start = start;
5726 		kr = vm_map_remap(map,
5727 		    vm_sanitize_wrap_addr_ref(&new_start),
5728 		    end - start,
5729 		    0, /* mask */
5730 		    kflags,
5731 		    map,
5732 		    start,
5733 		    TRUE, /* copy-on-write remapping! */
5734 		    vm_sanitize_wrap_prot_ref(&cur_prot), /* IN/OUT */
5735 		    vm_sanitize_wrap_prot_ref(&max_prot), /* IN/OUT */
5736 		    VM_INHERIT_DEFAULT);
5737 		if (kr != KERN_SUCCESS) {
5738 			return kr;
5739 		}
5740 		new_prot &= ~VM_PROT_COPY;
5741 	}
5742 
5743 	vm_map_lock(map);
5744 restart_after_unlock:
5745 
5746 	/* LP64todo - remove this check when vm_map_commpage64()
5747 	 * no longer has to stuff in a map_entry for the commpage
5748 	 * above the map's max_offset.
5749 	 */
5750 	if (start >= map->max_offset) {
5751 		vm_map_unlock(map);
5752 		return KERN_INVALID_ADDRESS;
5753 	}
5754 
5755 	while (1) {
5756 		/*
5757 		 *      Lookup the entry.  If it doesn't start in a valid
5758 		 *	entry, return an error.
5759 		 */
5760 		if (!vm_map_lookup_entry(map, start, &entry)) {
5761 			vm_map_unlock(map);
5762 			return KERN_INVALID_ADDRESS;
5763 		}
5764 
5765 		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
5766 			start = SUPERPAGE_ROUND_DOWN(start);
5767 			continue;
5768 		}
5769 		break;
5770 	}
5771 	if (entry->superpage_size) {
5772 		end = SUPERPAGE_ROUND_UP(end);
5773 	}
5774 
5775 	/*
5776 	 *	Make a first pass to check for protection and address
5777 	 *	violations.
5778 	 */
5779 
5780 	current = entry;
5781 	prev = current->vme_start;
5782 	while ((current != vm_map_to_entry(map)) &&
5783 	    (current->vme_start < end)) {
5784 		/*
5785 		 * If there is a hole, return an error.
5786 		 */
5787 		if (current->vme_start != prev) {
5788 			vm_map_unlock(map);
5789 			return KERN_INVALID_ADDRESS;
5790 		}
5791 
5792 		new_max = current->max_protection;
5793 
5794 #if defined(__x86_64__)
5795 		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
5796 		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
5797 			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
5798 		}
5799 #elif CODE_SIGNING_MONITOR
5800 		if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
5801 			new_max |= VM_PROT_EXECUTE;
5802 		}
5803 #endif
5804 		if ((new_prot & new_max) != new_prot) {
5805 			vm_map_unlock(map);
5806 			return KERN_PROTECTION_FAILURE;
5807 		}
5808 
5809 		if (current->used_for_jit &&
5810 		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
5811 			vm_map_unlock(map);
5812 			return KERN_PROTECTION_FAILURE;
5813 		}
5814 
5815 #if __arm64e__
5816 		/* Disallow protecting hw assisted TPRO mappings */
5817 		if (current->used_for_tpro) {
5818 			vm_map_unlock(map);
5819 			return KERN_PROTECTION_FAILURE;
5820 		}
5821 #endif /* __arm64e__ */
5822 
5823 
5824 		if ((new_prot & VM_PROT_WRITE) &&
5825 		    (new_prot & VM_PROT_ALLEXEC) &&
5826 #if XNU_TARGET_OS_OSX
5827 		    map->pmap != kernel_pmap &&
5828 		    (vm_map_cs_enforcement(map)
5829 #if __arm64__
5830 		    || !VM_MAP_IS_EXOTIC(map)
5831 #endif /* __arm64__ */
5832 		    ) &&
5833 #endif /* XNU_TARGET_OS_OSX */
5834 #if CODE_SIGNING_MONITOR
5835 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
5836 #endif
5837 		    !(current->used_for_jit)) {
5838 			DTRACE_VM3(cs_wx,
5839 			    uint64_t, (uint64_t) current->vme_start,
5840 			    uint64_t, (uint64_t) current->vme_end,
5841 			    vm_prot_t, new_prot);
5842 			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5843 			    proc_selfpid(),
5844 			    (get_bsdtask_info(current_task())
5845 			    ? proc_name_address(get_bsdtask_info(current_task()))
5846 			    : "?"),
5847 			    __FUNCTION__, __LINE__,
5848 #if DEVELOPMENT || DEBUG
5849 			    (uint64_t)current->vme_start,
5850 			    (uint64_t)current->vme_end,
5851 #else /* DEVELOPMENT || DEBUG */
5852 			    (uint64_t)0,
5853 			    (uint64_t)0,
5854 #endif /* DEVELOPMENT || DEBUG */
5855 			    new_prot);
5856 			new_prot &= ~VM_PROT_ALLEXEC;
5857 			if (VM_MAP_POLICY_WX_FAIL(map)) {
5858 				vm_map_unlock(map);
5859 				return KERN_PROTECTION_FAILURE;
5860 			}
5861 		}
5862 
5863 		/*
5864 		 * If the task has requested executable lockdown,
5865 		 * deny both:
5866 		 * - adding executable protections OR
5867 		 * - adding write protections to an existing executable mapping.
5868 		 */
5869 		if (map->map_disallow_new_exec == TRUE) {
5870 			if ((new_prot & VM_PROT_ALLEXEC) ||
5871 			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
5872 				vm_map_unlock(map);
5873 				return KERN_PROTECTION_FAILURE;
5874 			}
5875 		}
5876 
5877 		prev = current->vme_end;
5878 		current = current->vme_next;
5879 	}
5880 
5881 #if __arm64__
5882 	if (end > prev &&
5883 	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
5884 		vm_map_entry_t prev_entry;
5885 
5886 		prev_entry = current->vme_prev;
5887 		if (prev_entry != vm_map_to_entry(map) &&
5888 		    !prev_entry->map_aligned &&
5889 		    (vm_map_round_page(prev_entry->vme_end,
5890 		    VM_MAP_PAGE_MASK(map))
5891 		    == end)) {
5892 			/*
5893 			 * The last entry in our range is not "map-aligned"
5894 			 * but it would have reached all the way to "end"
5895 			 * if it had been map-aligned, so this is not really
5896 			 * a hole in the range and we can proceed.
5897 			 */
5898 			prev = end;
5899 		}
5900 	}
5901 #endif /* __arm64__ */
5902 
5903 	if (end > prev) {
5904 		vm_map_unlock(map);
5905 		return KERN_INVALID_ADDRESS;
5906 	}
5907 
5908 	/*
5909 	 *	Go back and fix up protections.
5910 	 *	Clip to start here if the range starts within
5911 	 *	the entry.
5912 	 */
5913 
5914 	current = entry;
5915 	if (current != vm_map_to_entry(map)) {
5916 		/* clip and unnest if necessary */
5917 		vm_map_clip_start(map, current, start);
5918 	}
5919 
5920 	while ((current != vm_map_to_entry(map)) &&
5921 	    (current->vme_start < end)) {
5922 		vm_prot_t       old_prot;
5923 
5924 		if (current->in_transition) {
5925 			wait_result_t wait_result;
5926 			vm_map_offset_t current_start;
5927 
5928 			/*
5929 			 * Another thread is wiring/unwiring this entry.
5930 			 * Let the other thread know we are waiting.
5931 			 */
5932 			current_start = current->vme_start;
5933 			current->needs_wakeup = true;
5934 			/* wait for the other thread to be done */
5935 			wait_result = vm_map_entry_wait(map, TH_UNINT);
5936 			/*
5937 			 * We unlocked the map, so anything could have changed in the
5938 			 * range and we need to re-check from "current_start" to "end".
5939 			 * Our entries might no longer be valid.
5940 			 */
5941 			current = NULL;
5942 			entry = NULL;
5943 			/*
5944 			 * Re-lookup and re-clip "current_start".
5945 			 * If it's no longer mapped,
5946 			 */
5947 			vm_map_lookup_entry_or_next(map, current_start, &current);
5948 			if (current != vm_map_to_entry(map)) {
5949 				vm_map_clip_start(map, current, current_start);
5950 			}
5951 			/* restart from this point */
5952 			start = current_start;
5953 			goto restart_after_unlock;
5954 		}
5955 
5956 		vm_map_clip_end(map, current, end);
5957 
5958 #if DEVELOPMENT || DEBUG
5959 		if (current->csm_associated && vm_log_xnu_user_debug) {
5960 			printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
5961 			    proc_selfpid(),
5962 			    (get_bsdtask_info(current_task())
5963 			    ? proc_name_address(get_bsdtask_info(current_task()))
5964 			    : "?"),
5965 			    __FUNCTION__,
5966 			    (uint64_t)start,
5967 			    (uint64_t)end,
5968 			    new_prot,
5969 			    map, current,
5970 			    current->vme_start,
5971 			    current->vme_end,
5972 			    current->protection,
5973 			    current->max_protection);
5974 		}
5975 #endif /* DEVELOPMENT || DEBUG */
5976 
5977 		if (current->is_sub_map) {
5978 			/* clipping did unnest if needed */
5979 			assert(!current->use_pmap);
5980 		}
5981 
5982 		old_prot = current->protection;
5983 
5984 		if (set_max) {
5985 			current->max_protection = new_prot;
5986 			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
5987 			current->protection = (new_prot & old_prot);
5988 		} else {
5989 			current->protection = new_prot;
5990 		}
5991 
5992 #if CODE_SIGNING_MONITOR
5993 		if (/* a !csm_associated mapping becoming executable */
5994 			((!current->csm_associated &&
5995 			!(old_prot & VM_PROT_EXECUTE) &&
5996 			(current->protection & VM_PROT_EXECUTE))
5997 			||
5998 			/* a csm_associated mapping becoming writable */
5999 			(current->csm_associated &&
6000 			!(old_prot & VM_PROT_WRITE) &&
6001 			(current->protection & VM_PROT_WRITE)))) {
6002 			/*
6003 			 * This mapping has not already been marked as
6004 			 * "user_debug" and it is either:
6005 			 * 1. not code-signing-monitored and becoming executable
6006 			 * 2. code-signing-monitored and becoming writable,
6007 			 * so inform the CodeSigningMonitor and mark the
6008 			 * mapping as "user_debug" if appropriate.
6009 			 */
6010 			vm_map_kernel_flags_t vmk_flags;
6011 			vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
6012 			/* pretend it's a vm_protect(VM_PROT_COPY)... */
6013 			vmk_flags.vmkf_remap_prot_copy = true;
6014 			kr = vm_map_entry_cs_associate(map, current, vmk_flags);
6015 #if DEVELOPMENT || DEBUG
6016 			if (vm_log_xnu_user_debug) {
6017 				printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
6018 				    proc_selfpid(),
6019 				    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
6020 				    __FUNCTION__, __LINE__,
6021 				    map, current,
6022 				    current->vme_start, current->vme_end,
6023 				    old_prot, current->protection,
6024 				    kr, current->vme_xnu_user_debug);
6025 			}
6026 #endif /* DEVELOPMENT || DEBUG */
6027 		}
6028 #endif /* CODE_SIGNING_MONITOR */
6029 
6030 		/*
6031 		 *	Update physical map if necessary.
6032 		 *	If the request is to turn off write protection,
6033 		 *	we won't do it for real (in pmap). This is because
6034 		 *	it would cause copy-on-write to fail.  We've already
6035 		 *	set, the new protection in the map, so if a
6036 		 *	write-protect fault occurred, it will be fixed up
6037 		 *	properly, COW or not.
6038 		 */
6039 		if (current->protection != old_prot) {
6040 			/* Look one level in we support nested pmaps */
6041 			/* from mapped submaps which are direct entries */
6042 			/* in our map */
6043 
6044 			vm_prot_t prot;
6045 
6046 			prot = current->protection;
6047 			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6048 				prot &= ~VM_PROT_WRITE;
6049 			} else {
6050 				assert(!VME_OBJECT(current)->code_signed);
6051 				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6052 				if (prot & VM_PROT_WRITE) {
6053 					/*
6054 					 * For write requests on the
6055 					 * compressor, we wil ask the
6056 					 * pmap layer to prevent us from
6057 					 * taking a write fault when we
6058 					 * attempt to access the mapping
6059 					 * next.
6060 					 */
6061 					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6062 				}
6063 			}
6064 
6065 			if (override_nx(map, VME_ALIAS(current)) && prot) {
6066 				prot |= VM_PROT_EXECUTE;
6067 			}
6068 
6069 #if DEVELOPMENT || DEBUG
6070 			if (!(old_prot & VM_PROT_EXECUTE) &&
6071 			    (prot & VM_PROT_EXECUTE) &&
6072 			    panic_on_unsigned_execute &&
6073 			    (proc_selfcsflags() & CS_KILL)) {
6074 				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6075 			}
6076 #endif /* DEVELOPMENT || DEBUG */
6077 
6078 			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6079 				if (current->wired_count) {
6080 					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6081 					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6082 				}
6083 
6084 				/* If the pmap layer cares about this
6085 				 * protection type, force a fault for
6086 				 * each page so that vm_fault will
6087 				 * repopulate the page with the full
6088 				 * set of protections.
6089 				 */
6090 				/*
6091 				 * TODO: We don't seem to need this,
6092 				 * but this is due to an internal
6093 				 * implementation detail of
6094 				 * pmap_protect.  Do we want to rely
6095 				 * on this?
6096 				 */
6097 				prot = VM_PROT_NONE;
6098 			}
6099 
6100 			if (current->is_sub_map && current->use_pmap) {
6101 				pmap_protect(VME_SUBMAP(current)->pmap,
6102 				    current->vme_start,
6103 				    current->vme_end,
6104 				    prot);
6105 			} else {
6106 				pmap_protect_options(map->pmap,
6107 				    current->vme_start,
6108 				    current->vme_end,
6109 				    prot,
6110 				    pmap_options,
6111 				    NULL);
6112 			}
6113 		}
6114 		current = current->vme_next;
6115 	}
6116 
6117 	if (entry == VM_MAP_ENTRY_NULL) {
6118 		/*
6119 		 * Re-lookup the original start of our range.
6120 		 * If it's no longer mapped, start with the next mapping.
6121 		 */
6122 		vm_map_lookup_entry_or_next(map, original_start, &entry);
6123 	}
6124 	current = entry;
6125 	while ((current != vm_map_to_entry(map)) &&
6126 	    (current->vme_start <= end)) {
6127 		vm_map_simplify_entry(map, current);
6128 		current = current->vme_next;
6129 	}
6130 
6131 	vm_map_unlock(map);
6132 	return KERN_SUCCESS;
6133 }
6134 
6135 static __attribute__((always_inline, warn_unused_result))
6136 kern_return_t
vm_map_inherit_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_inherit_ut new_inheritance_u,vm_map_offset_t * start,vm_map_offset_t * end,vm_inherit_t * new_inheritance)6137 vm_map_inherit_sanitize(
6138 	vm_map_t                        map,
6139 	vm_map_offset_ut                start_u,
6140 	vm_map_offset_ut                end_u,
6141 	vm_inherit_ut                   new_inheritance_u,
6142 	vm_map_offset_t                *start,
6143 	vm_map_offset_t                *end,
6144 	vm_inherit_t                   *new_inheritance)
6145 {
6146 	kern_return_t   kr;
6147 	vm_map_size_t   size;
6148 
6149 	kr = vm_sanitize_inherit(new_inheritance_u,
6150 	    VM_SANITIZE_CALLER_VM_MAP_INHERIT, new_inheritance);
6151 	if (__improbable(kr != KERN_SUCCESS)) {
6152 		return kr;
6153 	}
6154 
6155 	kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_INHERIT,
6156 	    map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size);
6157 	if (__improbable(kr != KERN_SUCCESS)) {
6158 		return kr;
6159 	}
6160 
6161 	return KERN_SUCCESS;
6162 }
6163 
6164 /*
6165  *	vm_map_inherit:
6166  *
6167  *	Sets the inheritance of the specified address
6168  *	range in the target map.  Inheritance
6169  *	affects how the map will be shared with
6170  *	child maps at the time of vm_map_fork.
6171  */
6172 kern_return_t
vm_map_inherit(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_inherit_ut new_inheritance_u)6173 vm_map_inherit(
6174 	vm_map_t                        map,
6175 	vm_map_offset_ut                start_u,
6176 	vm_map_offset_ut                end_u,
6177 	vm_inherit_ut                   new_inheritance_u)
6178 {
6179 	vm_map_entry_t  entry;
6180 	vm_map_entry_t  temp_entry;
6181 	kern_return_t   kr;
6182 	vm_map_offset_t start;
6183 	vm_map_offset_t end;
6184 	vm_inherit_t    new_inheritance;
6185 
6186 	kr = vm_map_inherit_sanitize(map,
6187 	    start_u,
6188 	    end_u,
6189 	    new_inheritance_u,
6190 	    &start,
6191 	    &end,
6192 	    &new_inheritance);
6193 	if (__improbable(kr != KERN_SUCCESS)) {
6194 		return vm_sanitize_get_kr(kr);
6195 	}
6196 
6197 	vm_map_lock(map);
6198 
6199 	VM_MAP_RANGE_CHECK(map, start, end);
6200 
6201 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
6202 		entry = temp_entry;
6203 	} else {
6204 		temp_entry = temp_entry->vme_next;
6205 		entry = temp_entry;
6206 	}
6207 
6208 	/* first check entire range for entries which can't support the */
6209 	/* given inheritance. */
6210 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6211 		if (entry->is_sub_map) {
6212 			if (new_inheritance == VM_INHERIT_COPY) {
6213 				vm_map_unlock(map);
6214 				return KERN_INVALID_ARGUMENT;
6215 			}
6216 		}
6217 
6218 		entry = entry->vme_next;
6219 	}
6220 
6221 	entry = temp_entry;
6222 	if (entry != vm_map_to_entry(map)) {
6223 		/* clip and unnest if necessary */
6224 		vm_map_clip_start(map, entry, start);
6225 	}
6226 
6227 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6228 		vm_map_clip_end(map, entry, end);
6229 		if (entry->is_sub_map) {
6230 			/* clip did unnest if needed */
6231 			assert(!entry->use_pmap);
6232 		}
6233 
6234 		entry->inheritance = new_inheritance;
6235 
6236 		entry = entry->vme_next;
6237 	}
6238 
6239 	vm_map_unlock(map);
6240 	return KERN_SUCCESS;
6241 }
6242 
6243 /*
6244  * Update the accounting for the amount of wired memory in this map.  If the user has
6245  * exceeded the defined limits, then we fail.  Wiring on behalf of the kernel never fails.
6246  */
6247 
6248 static kern_return_t
add_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6249 add_wire_counts(
6250 	vm_map_t        map,
6251 	vm_map_entry_t  entry,
6252 	boolean_t       user_wire)
6253 {
6254 	vm_map_size_t   size;
6255 
6256 	bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;
6257 
6258 	if (user_wire) {
6259 		unsigned int total_wire_count =  vm_page_wire_count + vm_lopage_free_count;
6260 
6261 		/*
6262 		 * We're wiring memory at the request of the user.  Check if this is the first time the user is wiring
6263 		 * this map entry.
6264 		 */
6265 
6266 		if (entry->user_wired_count == 0) {
6267 			size = entry->vme_end - entry->vme_start;
6268 
6269 			/*
6270 			 * Since this is the first time the user is wiring this map entry, check to see if we're
6271 			 * exceeding the user wire limits.  There is a per map limit which is the smaller of either
6272 			 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value.  There is also
6273 			 * a system-wide limit on the amount of memory all users can wire.  If the user is over either
6274 			 * limit, then we fail.
6275 			 */
6276 
6277 			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6278 			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6279 				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6280 #if DEVELOPMENT || DEBUG
6281 					if (panic_on_mlock_failure) {
6282 						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6283 					}
6284 #endif /* DEVELOPMENT || DEBUG */
6285 					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6286 				} else {
6287 					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6288 #if DEVELOPMENT || DEBUG
6289 					if (panic_on_mlock_failure) {
6290 						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6291 					}
6292 #endif /* DEVELOPMENT || DEBUG */
6293 				}
6294 				return KERN_RESOURCE_SHORTAGE;
6295 			}
6296 
6297 			/*
6298 			 * The first time the user wires an entry, we also increment the wired_count and add this to
6299 			 * the total that has been wired in the map.
6300 			 */
6301 
6302 			if (entry->wired_count >= MAX_WIRE_COUNT) {
6303 				return KERN_FAILURE;
6304 			}
6305 
6306 			entry->wired_count++;
6307 			map->user_wire_size += size;
6308 		}
6309 
6310 		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6311 			return KERN_FAILURE;
6312 		}
6313 
6314 		entry->user_wired_count++;
6315 	} else {
6316 		/*
6317 		 * The kernel's wiring the memory.  Just bump the count and continue.
6318 		 */
6319 
6320 		if (entry->wired_count >= MAX_WIRE_COUNT) {
6321 			panic("vm_map_wire: too many wirings");
6322 		}
6323 
6324 		entry->wired_count++;
6325 	}
6326 
6327 	if (first_wire) {
6328 		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
6329 	}
6330 
6331 	return KERN_SUCCESS;
6332 }
6333 
6334 /*
6335  * Update the memory wiring accounting now that the given map entry is being unwired.
6336  */
6337 
6338 static void
subtract_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6339 subtract_wire_counts(
6340 	vm_map_t        map,
6341 	vm_map_entry_t  entry,
6342 	boolean_t       user_wire)
6343 {
6344 	if (user_wire) {
6345 		/*
6346 		 * We're unwiring memory at the request of the user.  See if we're removing the last user wire reference.
6347 		 */
6348 
6349 		if (entry->user_wired_count == 1) {
6350 			/*
6351 			 * We're removing the last user wire reference.  Decrement the wired_count and the total
6352 			 * user wired memory for this map.
6353 			 */
6354 
6355 			assert(entry->wired_count >= 1);
6356 			entry->wired_count--;
6357 			map->user_wire_size -= entry->vme_end - entry->vme_start;
6358 		}
6359 
6360 		assert(entry->user_wired_count >= 1);
6361 		entry->user_wired_count--;
6362 	} else {
6363 		/*
6364 		 * The kernel is unwiring the memory.   Just update the count.
6365 		 */
6366 
6367 		assert(entry->wired_count >= 1);
6368 		entry->wired_count--;
6369 	}
6370 
6371 	vme_btref_consider_and_put(entry);
6372 }
6373 
6374 int cs_executable_wire = 0;
6375 
6376 static kern_return_t
vm_map_wire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr,ppnum_t * physpage_p)6377 vm_map_wire_nested(
6378 	vm_map_t                map,
6379 	vm_map_offset_t         start,
6380 	vm_map_offset_t         end,
6381 	vm_prot_t               caller_prot,
6382 	vm_tag_t                tag,
6383 	boolean_t               user_wire,
6384 	pmap_t                  map_pmap,
6385 	vm_map_offset_t         pmap_addr,
6386 	ppnum_t                *physpage_p)
6387 {
6388 	vm_map_entry_t          entry;
6389 	vm_prot_t               access_type;
6390 	struct vm_map_entry     *first_entry, tmp_entry;
6391 	vm_map_t                real_map;
6392 	vm_map_offset_t         s, e;
6393 	kern_return_t           rc;
6394 	boolean_t               need_wakeup;
6395 	boolean_t               main_map = FALSE;
6396 	wait_interrupt_t        interruptible_state;
6397 	thread_t                cur_thread;
6398 	unsigned int            last_timestamp;
6399 	vm_map_size_t           size;
6400 	boolean_t               wire_and_extract;
6401 	vm_prot_t               extra_prots;
6402 
6403 	extra_prots = VM_PROT_COPY;
6404 	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6405 #if XNU_TARGET_OS_OSX
6406 	if (map->pmap == kernel_pmap ||
6407 	    !vm_map_cs_enforcement(map)) {
6408 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6409 	}
6410 #endif /* XNU_TARGET_OS_OSX */
6411 #if CODE_SIGNING_MONITOR
6412 	if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6413 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6414 	}
6415 #endif /* CODE_SIGNING_MONITOR */
6416 
6417 	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6418 
6419 	wire_and_extract = FALSE;
6420 	if (physpage_p != NULL) {
6421 		/*
6422 		 * The caller wants the physical page number of the
6423 		 * wired page.  We return only one physical page number
6424 		 * so this works for only one page at a time.
6425 		 *
6426 		 * The only caller (vm_map_wire_and_extract)
6427 		 * guarantees it.
6428 		 */
6429 		assert(end - start == VM_MAP_PAGE_SIZE(map));
6430 		wire_and_extract = TRUE;
6431 		*physpage_p = 0;
6432 	}
6433 
6434 	VM_MAP_RANGE_CHECK(map, start, end);
6435 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6436 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6437 	if (start == end) {
6438 		/* We wired what the caller asked for, zero pages */
6439 		return KERN_SUCCESS;
6440 	}
6441 
6442 	vm_map_lock(map);
6443 	if (map_pmap == NULL) {
6444 		main_map = TRUE;
6445 	}
6446 	last_timestamp = map->timestamp;
6447 
6448 	need_wakeup = FALSE;
6449 	cur_thread = current_thread();
6450 
6451 	s = start;
6452 	rc = KERN_SUCCESS;
6453 
6454 	if (vm_map_lookup_entry(map, s, &first_entry)) {
6455 		entry = first_entry;
6456 		/*
6457 		 * vm_map_clip_start will be done later.
6458 		 * We don't want to unnest any nested submaps here !
6459 		 */
6460 	} else {
6461 		/* Start address is not in map */
6462 		rc = KERN_INVALID_ADDRESS;
6463 		goto done;
6464 	}
6465 
6466 	while ((entry != vm_map_to_entry(map)) && (s < end)) {
6467 		/*
6468 		 * At this point, we have wired from "start" to "s".
6469 		 * We still need to wire from "s" to "end".
6470 		 *
6471 		 * "entry" hasn't been clipped, so it could start before "s"
6472 		 * and/or end after "end".
6473 		 */
6474 
6475 		/* "e" is how far we want to wire in this entry */
6476 		e = entry->vme_end;
6477 		if (e > end) {
6478 			e = end;
6479 		}
6480 
6481 		/*
6482 		 * If another thread is wiring/unwiring this entry then
6483 		 * block after informing other thread to wake us up.
6484 		 */
6485 		if (entry->in_transition) {
6486 			wait_result_t wait_result;
6487 
6488 			/*
6489 			 * We have not clipped the entry.  Make sure that
6490 			 * the start address is in range so that the lookup
6491 			 * below will succeed.
6492 			 * "s" is the current starting point: we've already
6493 			 * wired from "start" to "s" and we still have
6494 			 * to wire from "s" to "end".
6495 			 */
6496 
6497 			entry->needs_wakeup = TRUE;
6498 
6499 			/*
6500 			 * wake up anybody waiting on entries that we have
6501 			 * already wired.
6502 			 */
6503 			if (need_wakeup) {
6504 				vm_map_entry_wakeup(map);
6505 				need_wakeup = FALSE;
6506 			}
6507 			/*
6508 			 * User wiring is interruptible
6509 			 */
6510 			wait_result = vm_map_entry_wait(map,
6511 			    (user_wire) ? THREAD_ABORTSAFE :
6512 			    THREAD_UNINT);
6513 			if (user_wire && wait_result == THREAD_INTERRUPTED) {
6514 				/*
6515 				 * undo the wirings we have done so far
6516 				 * We do not clear the needs_wakeup flag,
6517 				 * because we cannot tell if we were the
6518 				 * only one waiting.
6519 				 */
6520 				rc = KERN_FAILURE;
6521 				goto done;
6522 			}
6523 
6524 			/*
6525 			 * Cannot avoid a lookup here. reset timestamp.
6526 			 */
6527 			last_timestamp = map->timestamp;
6528 
6529 			/*
6530 			 * The entry could have been clipped, look it up again.
6531 			 * Worse that can happen is, it may not exist anymore.
6532 			 */
6533 			if (!vm_map_lookup_entry(map, s, &first_entry)) {
6534 				/*
6535 				 * User: undo everything upto the previous
6536 				 * entry.  let vm_map_unwire worry about
6537 				 * checking the validity of the range.
6538 				 */
6539 				rc = KERN_FAILURE;
6540 				goto done;
6541 			}
6542 			entry = first_entry;
6543 			continue;
6544 		}
6545 
6546 		if (entry->is_sub_map) {
6547 			vm_map_offset_t sub_start;
6548 			vm_map_offset_t sub_end;
6549 			vm_map_offset_t local_start;
6550 			vm_map_offset_t local_end;
6551 			pmap_t          pmap;
6552 			vm_map_t        sub_map = VM_MAP_NULL;
6553 
6554 			if (wire_and_extract) {
6555 				/*
6556 				 * Wiring would result in copy-on-write
6557 				 * which would not be compatible with
6558 				 * the sharing we have with the original
6559 				 * provider of this memory.
6560 				 */
6561 				rc = KERN_INVALID_ARGUMENT;
6562 				goto done;
6563 			}
6564 
6565 			vm_map_clip_start(map, entry, s);
6566 			vm_map_clip_end(map, entry, end);
6567 
6568 			sub_start = VME_OFFSET(entry);
6569 			sub_end = entry->vme_end;
6570 			sub_end += VME_OFFSET(entry) - entry->vme_start;
6571 
6572 			local_end = entry->vme_end;
6573 			if (map_pmap == NULL) {
6574 				vm_object_t             object;
6575 				vm_object_offset_t      offset;
6576 				vm_prot_t               prot;
6577 				boolean_t               wired;
6578 				vm_map_entry_t          local_entry;
6579 				vm_map_version_t         version;
6580 				vm_map_t                lookup_map;
6581 
6582 				if (entry->use_pmap) {
6583 					pmap = VME_SUBMAP(entry)->pmap;
6584 					/* ppc implementation requires that */
6585 					/* submaps pmap address ranges line */
6586 					/* up with parent map */
6587 #ifdef notdef
6588 					pmap_addr = sub_start;
6589 #endif
6590 					pmap_addr = s;
6591 				} else {
6592 					pmap = map->pmap;
6593 					pmap_addr = s;
6594 				}
6595 
6596 				if (entry->wired_count) {
6597 					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6598 						goto done;
6599 					}
6600 
6601 					/*
6602 					 * The map was not unlocked:
6603 					 * no need to goto re-lookup.
6604 					 * Just go directly to next entry.
6605 					 */
6606 					entry = entry->vme_next;
6607 					s = entry->vme_start;
6608 					continue;
6609 				}
6610 
6611 				/* call vm_map_lookup_and_lock_object to */
6612 				/* cause any needs copy to be   */
6613 				/* evaluated */
6614 				local_start = entry->vme_start;
6615 				lookup_map = map;
6616 				vm_map_lock_write_to_read(map);
6617 				rc = vm_map_lookup_and_lock_object(
6618 					&lookup_map, local_start,
6619 					(access_type | extra_prots),
6620 					OBJECT_LOCK_EXCLUSIVE,
6621 					&version, &object,
6622 					&offset, &prot, &wired,
6623 					NULL,
6624 					&real_map, NULL);
6625 				if (rc != KERN_SUCCESS) {
6626 					vm_map_unlock_read(lookup_map);
6627 					assert(map_pmap == NULL);
6628 					vm_map_unwire_nested(map, start,
6629 					    s, user_wire, PMAP_NULL, 0);
6630 					return rc;
6631 				}
6632 				vm_object_unlock(object);
6633 				if (real_map != lookup_map) {
6634 					vm_map_unlock(real_map);
6635 				}
6636 				vm_map_unlock_read(lookup_map);
6637 				vm_map_lock(map);
6638 
6639 				/* we unlocked, so must re-lookup */
6640 				if (!vm_map_lookup_entry(map,
6641 				    local_start,
6642 				    &local_entry)) {
6643 					rc = KERN_FAILURE;
6644 					goto done;
6645 				}
6646 
6647 				/*
6648 				 * entry could have been "simplified",
6649 				 * so re-clip
6650 				 */
6651 				entry = local_entry;
6652 				assert(s == local_start);
6653 				vm_map_clip_start(map, entry, s);
6654 				vm_map_clip_end(map, entry, end);
6655 				/* re-compute "e" */
6656 				e = entry->vme_end;
6657 				if (e > end) {
6658 					e = end;
6659 				}
6660 
6661 				/* did we have a change of type? */
6662 				if (!entry->is_sub_map) {
6663 					last_timestamp = map->timestamp;
6664 					continue;
6665 				}
6666 			} else {
6667 				local_start = entry->vme_start;
6668 				pmap = map_pmap;
6669 			}
6670 
6671 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6672 				goto done;
6673 			}
6674 
6675 			entry->in_transition = TRUE;
6676 
6677 			sub_map = VME_SUBMAP(entry);
6678 			vm_map_reference(sub_map);
6679 			vm_map_unlock(map);
6680 			rc = vm_map_wire_nested(sub_map,
6681 			    sub_start, sub_end,
6682 			    caller_prot, tag,
6683 			    user_wire, pmap, pmap_addr,
6684 			    NULL);
6685 			vm_map_deallocate(sub_map);
6686 			sub_map = VM_MAP_NULL;
6687 			vm_map_lock(map);
6688 
6689 			/*
6690 			 * Find the entry again.  It could have been clipped
6691 			 * after we unlocked the map.
6692 			 */
6693 			if (!vm_map_lookup_entry(map, local_start,
6694 			    &first_entry)) {
6695 				panic("vm_map_wire: re-lookup failed");
6696 			}
6697 			entry = first_entry;
6698 
6699 			assert(local_start == s);
6700 			/* re-compute "e" */
6701 			e = entry->vme_end;
6702 			if (e > end) {
6703 				e = end;
6704 			}
6705 
6706 			last_timestamp = map->timestamp;
6707 			while ((entry != vm_map_to_entry(map)) &&
6708 			    (entry->vme_start < e)) {
6709 				assert(entry->in_transition);
6710 				entry->in_transition = FALSE;
6711 				if (entry->needs_wakeup) {
6712 					entry->needs_wakeup = FALSE;
6713 					need_wakeup = TRUE;
6714 				}
6715 				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6716 					subtract_wire_counts(map, entry, user_wire);
6717 				}
6718 				entry = entry->vme_next;
6719 			}
6720 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6721 				goto done;
6722 			}
6723 
6724 			/* no need to relookup again */
6725 			s = entry->vme_start;
6726 			continue;
6727 		}
6728 
6729 		/*
6730 		 * If this entry is already wired then increment
6731 		 * the appropriate wire reference count.
6732 		 */
6733 		if (entry->wired_count) {
6734 			if ((entry->protection & access_type) != access_type) {
6735 				/* found a protection problem */
6736 
6737 				/*
6738 				 * XXX FBDP
6739 				 * We should always return an error
6740 				 * in this case but since we didn't
6741 				 * enforce it before, let's do
6742 				 * it only for the new "wire_and_extract"
6743 				 * code path for now...
6744 				 */
6745 				if (wire_and_extract) {
6746 					rc = KERN_PROTECTION_FAILURE;
6747 					goto done;
6748 				}
6749 			}
6750 
6751 			/*
6752 			 * entry is already wired down, get our reference
6753 			 * after clipping to our range.
6754 			 */
6755 			vm_map_clip_start(map, entry, s);
6756 			vm_map_clip_end(map, entry, end);
6757 
6758 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6759 				goto done;
6760 			}
6761 
6762 			if (wire_and_extract) {
6763 				vm_object_t             object;
6764 				vm_object_offset_t      offset;
6765 				vm_page_t               m;
6766 
6767 				/*
6768 				 * We don't have to "wire" the page again
6769 				 * bit we still have to "extract" its
6770 				 * physical page number, after some sanity
6771 				 * checks.
6772 				 */
6773 				assert((entry->vme_end - entry->vme_start)
6774 				    == PAGE_SIZE);
6775 				assert(!entry->needs_copy);
6776 				assert(!entry->is_sub_map);
6777 				assert(VME_OBJECT(entry));
6778 				if (((entry->vme_end - entry->vme_start)
6779 				    != PAGE_SIZE) ||
6780 				    entry->needs_copy ||
6781 				    entry->is_sub_map ||
6782 				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
6783 					rc = KERN_INVALID_ARGUMENT;
6784 					goto done;
6785 				}
6786 
6787 				object = VME_OBJECT(entry);
6788 				offset = VME_OFFSET(entry);
6789 				/* need exclusive lock to update m->dirty */
6790 				if (entry->protection & VM_PROT_WRITE) {
6791 					vm_object_lock(object);
6792 				} else {
6793 					vm_object_lock_shared(object);
6794 				}
6795 				m = vm_page_lookup(object, offset);
6796 				assert(m != VM_PAGE_NULL);
6797 				assert(VM_PAGE_WIRED(m));
6798 				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6799 					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6800 					if (entry->protection & VM_PROT_WRITE) {
6801 						vm_object_lock_assert_exclusive(
6802 							object);
6803 						m->vmp_dirty = TRUE;
6804 					}
6805 				} else {
6806 					/* not already wired !? */
6807 					*physpage_p = 0;
6808 				}
6809 				vm_object_unlock(object);
6810 			}
6811 
6812 			/* map was not unlocked: no need to relookup */
6813 			entry = entry->vme_next;
6814 			s = entry->vme_start;
6815 			continue;
6816 		}
6817 
6818 		/*
6819 		 * Unwired entry or wire request transmitted via submap
6820 		 */
6821 
6822 		/*
6823 		 * Wiring would copy the pages to the shadow object.
6824 		 * The shadow object would not be code-signed so
6825 		 * attempting to execute code from these copied pages
6826 		 * would trigger a code-signing violation.
6827 		 */
6828 
6829 		if ((entry->protection & VM_PROT_EXECUTE)
6830 #if XNU_TARGET_OS_OSX
6831 		    &&
6832 		    map->pmap != kernel_pmap &&
6833 		    (vm_map_cs_enforcement(map)
6834 #if __arm64__
6835 		    || !VM_MAP_IS_EXOTIC(map)
6836 #endif /* __arm64__ */
6837 		    )
6838 #endif /* XNU_TARGET_OS_OSX */
6839 #if CODE_SIGNING_MONITOR
6840 		    &&
6841 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
6842 #endif
6843 		    ) {
6844 #if MACH_ASSERT
6845 			printf("pid %d[%s] wiring executable range from "
6846 			    "0x%llx to 0x%llx: rejected to preserve "
6847 			    "code-signing\n",
6848 			    proc_selfpid(),
6849 			    (get_bsdtask_info(current_task())
6850 			    ? proc_name_address(get_bsdtask_info(current_task()))
6851 			    : "?"),
6852 			    (uint64_t) entry->vme_start,
6853 			    (uint64_t) entry->vme_end);
6854 #endif /* MACH_ASSERT */
6855 			DTRACE_VM2(cs_executable_wire,
6856 			    uint64_t, (uint64_t)entry->vme_start,
6857 			    uint64_t, (uint64_t)entry->vme_end);
6858 			cs_executable_wire++;
6859 			rc = KERN_PROTECTION_FAILURE;
6860 			goto done;
6861 		}
6862 
6863 		/*
6864 		 * Perform actions of vm_map_lookup that need the write
6865 		 * lock on the map: create a shadow object for a
6866 		 * copy-on-write region, or an object for a zero-fill
6867 		 * region.
6868 		 */
6869 		size = entry->vme_end - entry->vme_start;
6870 		/*
6871 		 * If wiring a copy-on-write page, we need to copy it now
6872 		 * even if we're only (currently) requesting read access.
6873 		 * This is aggressive, but once it's wired we can't move it.
6874 		 */
6875 		if (entry->needs_copy) {
6876 			if (wire_and_extract) {
6877 				/*
6878 				 * We're supposed to share with the original
6879 				 * provider so should not be "needs_copy"
6880 				 */
6881 				rc = KERN_INVALID_ARGUMENT;
6882 				goto done;
6883 			}
6884 
6885 			VME_OBJECT_SHADOW(entry, size,
6886 			    vm_map_always_shadow(map));
6887 			entry->needs_copy = FALSE;
6888 		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6889 			if (wire_and_extract) {
6890 				/*
6891 				 * We're supposed to share with the original
6892 				 * provider so should already have an object.
6893 				 */
6894 				rc = KERN_INVALID_ARGUMENT;
6895 				goto done;
6896 			}
6897 			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
6898 			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6899 			assert(entry->use_pmap);
6900 		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6901 			if (wire_and_extract) {
6902 				/*
6903 				 * We're supposed to share with the original
6904 				 * provider so should not be COPY_SYMMETRIC.
6905 				 */
6906 				rc = KERN_INVALID_ARGUMENT;
6907 				goto done;
6908 			}
6909 			/*
6910 			 * Force an unrequested "copy-on-write" but only for
6911 			 * the range we're wiring.
6912 			 */
6913 //			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
6914 			vm_map_clip_start(map, entry, s);
6915 			vm_map_clip_end(map, entry, end);
6916 			/* recompute "size" */
6917 			size = entry->vme_end - entry->vme_start;
6918 			/* make a shadow object */
6919 			vm_object_t orig_object;
6920 			vm_object_offset_t orig_offset;
6921 			orig_object = VME_OBJECT(entry);
6922 			orig_offset = VME_OFFSET(entry);
6923 			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
6924 			if (VME_OBJECT(entry) != orig_object) {
6925 				/*
6926 				 * This mapping has not been shared (or it would be
6927 				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
6928 				 * not been copied-on-write (or it would be marked
6929 				 * as "needs_copy" and would have been handled above
6930 				 * and also already write-protected).
6931 				 * We still need to write-protect here to prevent
6932 				 * other threads from modifying these pages while
6933 				 * we're in the process of copying and wiring
6934 				 * the copied pages.
6935 				 * Since the mapping is neither shared nor COWed,
6936 				 * we only need to write-protect the PTEs for this
6937 				 * mapping.
6938 				 */
6939 				vm_object_pmap_protect(orig_object,
6940 				    orig_offset,
6941 				    size,
6942 				    map->pmap,
6943 				    VM_MAP_PAGE_SIZE(map),
6944 				    entry->vme_start,
6945 				    entry->protection & ~VM_PROT_WRITE);
6946 			}
6947 		}
6948 		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6949 			/*
6950 			 * Make the object COPY_DELAY to get a stable object
6951 			 * to wire.
6952 			 * That should avoid creating long shadow chains while
6953 			 * wiring/unwiring the same range repeatedly.
6954 			 * That also prevents part of the object from being
6955 			 * wired while another part is "needs_copy", which
6956 			 * could result in conflicting rules wrt copy-on-write.
6957 			 */
6958 			vm_object_t object;
6959 
6960 			object = VME_OBJECT(entry);
6961 			vm_object_lock(object);
6962 			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6963 				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
6964 				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
6965 				    object, (uint64_t)object->vo_size,
6966 				    entry,
6967 				    (uint64_t)entry->vme_start,
6968 				    (uint64_t)entry->vme_end,
6969 				    (uint64_t)VME_OFFSET(entry),
6970 				    (uint64_t)size);
6971 				assertf(os_ref_get_count_raw(&object->ref_count) == 1,
6972 				    "object %p ref_count %d\n",
6973 				    object, os_ref_get_count_raw(&object->ref_count));
6974 				assertf(!entry->needs_copy,
6975 				    "entry %p\n", entry);
6976 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
6977 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
6978 			}
6979 			vm_object_unlock(object);
6980 		}
6981 
6982 		vm_map_clip_start(map, entry, s);
6983 		vm_map_clip_end(map, entry, end);
6984 
6985 		/* re-compute "e" */
6986 		e = entry->vme_end;
6987 		if (e > end) {
6988 			e = end;
6989 		}
6990 
6991 		/*
6992 		 * Check for holes and protection mismatch.
6993 		 * Holes: Next entry should be contiguous unless this
6994 		 *	  is the end of the region.
6995 		 * Protection: Access requested must be allowed, unless
6996 		 *	wiring is by protection class
6997 		 */
6998 		if ((entry->vme_end < end) &&
6999 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7000 		    (entry->vme_next->vme_start > entry->vme_end))) {
7001 			/* found a hole */
7002 			rc = KERN_INVALID_ADDRESS;
7003 			goto done;
7004 		}
7005 		if ((entry->protection & access_type) != access_type) {
7006 			/* found a protection problem */
7007 			rc = KERN_PROTECTION_FAILURE;
7008 			goto done;
7009 		}
7010 
7011 		assert(entry->wired_count == 0 && entry->user_wired_count == 0);
7012 
7013 		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7014 			goto done;
7015 		}
7016 
7017 		entry->in_transition = TRUE;
7018 
7019 		/*
7020 		 * This entry might get split once we unlock the map.
7021 		 * In vm_fault_wire(), we need the current range as
7022 		 * defined by this entry.  In order for this to work
7023 		 * along with a simultaneous clip operation, we make a
7024 		 * temporary copy of this entry and use that for the
7025 		 * wiring.  Note that the underlying objects do not
7026 		 * change during a clip.
7027 		 */
7028 		tmp_entry = *entry;
7029 
7030 		/*
7031 		 * The in_transition state guarentees that the entry
7032 		 * (or entries for this range, if split occured) will be
7033 		 * there when the map lock is acquired for the second time.
7034 		 */
7035 		vm_map_unlock(map);
7036 
7037 		if (!user_wire && cur_thread != THREAD_NULL) {
7038 			interruptible_state = thread_interrupt_level(THREAD_UNINT);
7039 		} else {
7040 			interruptible_state = THREAD_UNINT;
7041 		}
7042 
7043 		if (map_pmap) {
7044 			rc = vm_fault_wire(map,
7045 			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7046 			    physpage_p);
7047 		} else {
7048 			rc = vm_fault_wire(map,
7049 			    &tmp_entry, caller_prot, tag, map->pmap,
7050 			    tmp_entry.vme_start,
7051 			    physpage_p);
7052 		}
7053 
7054 		if (!user_wire && cur_thread != THREAD_NULL) {
7055 			thread_interrupt_level(interruptible_state);
7056 		}
7057 
7058 		vm_map_lock(map);
7059 
7060 		if (last_timestamp + 1 != map->timestamp) {
7061 			/*
7062 			 * Find the entry again.  It could have been clipped
7063 			 * after we unlocked the map.
7064 			 */
7065 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7066 			    &first_entry)) {
7067 				panic("vm_map_wire: re-lookup failed");
7068 			}
7069 
7070 			entry = first_entry;
7071 		}
7072 
7073 		last_timestamp = map->timestamp;
7074 
7075 		while ((entry != vm_map_to_entry(map)) &&
7076 		    (entry->vme_start < tmp_entry.vme_end)) {
7077 			assert(entry->in_transition);
7078 			entry->in_transition = FALSE;
7079 			if (entry->needs_wakeup) {
7080 				entry->needs_wakeup = FALSE;
7081 				need_wakeup = TRUE;
7082 			}
7083 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
7084 				subtract_wire_counts(map, entry, user_wire);
7085 			}
7086 			entry = entry->vme_next;
7087 		}
7088 
7089 		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
7090 			goto done;
7091 		}
7092 
7093 		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7094 		    (tmp_entry.vme_end != end) &&    /* AND, we are not at the end of the requested range */
7095 		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7096 			/* found a "new" hole */
7097 			s = tmp_entry.vme_end;
7098 			rc = KERN_INVALID_ADDRESS;
7099 			goto done;
7100 		}
7101 
7102 		s = entry->vme_start;
7103 	} /* end while loop through map entries */
7104 
7105 done:
7106 	if (rc == KERN_SUCCESS) {
7107 		/* repair any damage we may have made to the VM map */
7108 		vm_map_simplify_range(map, start, end);
7109 	}
7110 
7111 	vm_map_unlock(map);
7112 
7113 	/*
7114 	 * wake up anybody waiting on entries we wired.
7115 	 */
7116 	if (need_wakeup) {
7117 		vm_map_entry_wakeup(map);
7118 	}
7119 
7120 	if (rc != KERN_SUCCESS) {
7121 		/* undo what has been wired so far */
7122 		vm_map_unwire_nested(map, start, s, user_wire,
7123 		    map_pmap, pmap_addr);
7124 		if (physpage_p) {
7125 			*physpage_p = 0;
7126 		}
7127 	}
7128 
7129 	return rc;
7130 }
7131 
7132 static __attribute__((always_inline, warn_unused_result))
7133 kern_return_t
vm_map_wire_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * start,vm_map_offset_t * end,vm_map_size_t * size,vm_prot_t * prot)7134 vm_map_wire_sanitize(
7135 	vm_map_t                map,
7136 	vm_map_offset_ut        start_u,
7137 	vm_map_offset_ut        end_u,
7138 	vm_prot_ut              prot_u,
7139 	vm_sanitize_caller_t    vm_sanitize_caller,
7140 	vm_map_offset_t        *start,
7141 	vm_map_offset_t        *end,
7142 	vm_map_size_t          *size,
7143 	vm_prot_t              *prot)
7144 {
7145 	kern_return_t   kr;
7146 
7147 	kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7148 	    VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7149 	    size);
7150 	if (__improbable(kr != KERN_SUCCESS)) {
7151 		return kr;
7152 	}
7153 
7154 	kr = vm_sanitize_prot(prot_u, vm_sanitize_caller, map, prot);
7155 	if (__improbable(kr != KERN_SUCCESS)) {
7156 		return kr;
7157 	}
7158 
7159 	return KERN_SUCCESS;
7160 }
7161 
7162 /*
7163  * Validation function for vm_map_wire_nested().
7164  */
7165 kern_return_t
vm_map_wire_impl(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_tag_t tag,boolean_t user_wire,ppnum_t * physpage_p,vm_sanitize_caller_t vm_sanitize_caller)7166 vm_map_wire_impl(
7167 	vm_map_t                map,
7168 	vm_map_offset_ut        start_u,
7169 	vm_map_offset_ut        end_u,
7170 	vm_prot_ut              prot_u,
7171 	vm_tag_t                tag,
7172 	boolean_t               user_wire,
7173 	ppnum_t                *physpage_p,
7174 	vm_sanitize_caller_t    vm_sanitize_caller)
7175 {
7176 	vm_map_offset_t start, end;
7177 	vm_map_size_t   size;
7178 	vm_prot_t       prot;
7179 	kern_return_t   kr;
7180 
7181 	/*
7182 	 * Sanitize any input parameters that are addr/size/prot/inherit
7183 	 */
7184 	kr = vm_map_wire_sanitize(map,
7185 	    start_u,
7186 	    end_u,
7187 	    prot_u,
7188 	    vm_sanitize_caller,
7189 	    &start,
7190 	    &end,
7191 	    &size,
7192 	    &prot);
7193 	if (__improbable(kr != KERN_SUCCESS)) {
7194 		if (physpage_p) {
7195 			*physpage_p = 0;
7196 		}
7197 		return vm_sanitize_get_kr(kr);
7198 	}
7199 
7200 	return vm_map_wire_nested(map, start, end, prot, tag, user_wire,
7201 	           PMAP_NULL, 0, physpage_p);
7202 }
7203 
7204 kern_return_t
vm_map_wire_external(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,boolean_t user_wire)7205 vm_map_wire_external(
7206 	vm_map_t                map,
7207 	vm_map_offset_ut        start_u,
7208 	vm_map_offset_ut        end_u,
7209 	vm_prot_ut              prot_u,
7210 	boolean_t               user_wire)
7211 {
7212 	vm_tag_t tag = vm_tag_bt();
7213 
7214 	return vm_map_wire_kernel(map, start_u, end_u, prot_u, tag, user_wire);
7215 }
7216 
7217 kern_return_t
vm_map_wire_kernel(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_tag_t tag,boolean_t user_wire)7218 vm_map_wire_kernel(
7219 	vm_map_t                map,
7220 	vm_map_offset_ut        start_u,
7221 	vm_map_offset_ut        end_u,
7222 	vm_prot_ut              prot_u,
7223 	vm_tag_t                tag,
7224 	boolean_t               user_wire)
7225 {
7226 	return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7227 	           user_wire, NULL, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7228 }
7229 
7230 #if XNU_PLATFORM_MacOSX
7231 
7232 kern_return_t
vm_map_wire_and_extract(vm_map_t map,vm_map_offset_ut start_u,vm_prot_ut prot_u,boolean_t user_wire,ppnum_t * physpage_p)7233 vm_map_wire_and_extract(
7234 	vm_map_t                map,
7235 	vm_map_offset_ut        start_u,
7236 	vm_prot_ut              prot_u,
7237 	boolean_t               user_wire,
7238 	ppnum_t                *physpage_p)
7239 {
7240 	vm_tag_t         tag    = vm_tag_bt();
7241 	vm_map_size_ut   size_u = vm_sanitize_wrap_size(VM_MAP_PAGE_SIZE(map));
7242 	vm_map_offset_ut end_u  = vm_sanitize_compute_ut_end(start_u, size_u);
7243 
7244 	return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7245 	           user_wire, physpage_p, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7246 }
7247 
7248 #endif /* XNU_PLATFORM_MacOSX */
7249 
7250 static kern_return_t
vm_map_unwire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr)7251 vm_map_unwire_nested(
7252 	vm_map_t                map,
7253 	vm_map_offset_t         start,
7254 	vm_map_offset_t         end,
7255 	boolean_t               user_wire,
7256 	pmap_t                  map_pmap,
7257 	vm_map_offset_t         pmap_addr)
7258 {
7259 	vm_map_entry_t          entry;
7260 	struct vm_map_entry     *first_entry, tmp_entry;
7261 	boolean_t               need_wakeup;
7262 	boolean_t               main_map = FALSE;
7263 	unsigned int            last_timestamp;
7264 
7265 	VM_MAP_RANGE_CHECK(map, start, end);
7266 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7267 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7268 
7269 	if (start == end) {
7270 		/* We unwired what the caller asked for: zero pages */
7271 		return KERN_SUCCESS;
7272 	}
7273 
7274 	vm_map_lock(map);
7275 	if (map_pmap == NULL) {
7276 		main_map = TRUE;
7277 	}
7278 	last_timestamp = map->timestamp;
7279 
7280 	if (vm_map_lookup_entry(map, start, &first_entry)) {
7281 		entry = first_entry;
7282 		/*
7283 		 * vm_map_clip_start will be done later.
7284 		 * We don't want to unnest any nested sub maps here !
7285 		 */
7286 	} else {
7287 		if (!user_wire) {
7288 			panic("vm_map_unwire: start not found");
7289 		}
7290 		/*	Start address is not in map. */
7291 		vm_map_unlock(map);
7292 		return KERN_INVALID_ADDRESS;
7293 	}
7294 
7295 	if (entry->superpage_size) {
7296 		/* superpages are always wired */
7297 		vm_map_unlock(map);
7298 		return KERN_INVALID_ADDRESS;
7299 	}
7300 
7301 	need_wakeup = FALSE;
7302 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7303 		if (entry->in_transition) {
7304 			/*
7305 			 * 1)
7306 			 * Another thread is wiring down this entry. Note
7307 			 * that if it is not for the other thread we would
7308 			 * be unwiring an unwired entry.  This is not
7309 			 * permitted.  If we wait, we will be unwiring memory
7310 			 * we did not wire.
7311 			 *
7312 			 * 2)
7313 			 * Another thread is unwiring this entry.  We did not
7314 			 * have a reference to it, because if we did, this
7315 			 * entry will not be getting unwired now.
7316 			 */
7317 			if (!user_wire) {
7318 				/*
7319 				 * XXX FBDP
7320 				 * This could happen:  there could be some
7321 				 * overlapping vslock/vsunlock operations
7322 				 * going on.
7323 				 * We should probably just wait and retry,
7324 				 * but then we have to be careful that this
7325 				 * entry could get "simplified" after
7326 				 * "in_transition" gets unset and before
7327 				 * we re-lookup the entry, so we would
7328 				 * have to re-clip the entry to avoid
7329 				 * re-unwiring what we have already unwired...
7330 				 * See vm_map_wire_nested().
7331 				 *
7332 				 * Or we could just ignore "in_transition"
7333 				 * here and proceed to decement the wired
7334 				 * count(s) on this entry.  That should be fine
7335 				 * as long as "wired_count" doesn't drop all
7336 				 * the way to 0 (and we should panic if THAT
7337 				 * happens).
7338 				 */
7339 				panic("vm_map_unwire: in_transition entry");
7340 			}
7341 
7342 			entry = entry->vme_next;
7343 			continue;
7344 		}
7345 
7346 		if (entry->is_sub_map) {
7347 			vm_map_offset_t sub_start;
7348 			vm_map_offset_t sub_end;
7349 			vm_map_offset_t local_end;
7350 			pmap_t          pmap;
7351 			vm_map_t        sub_map = VM_MAP_NULL;
7352 
7353 			vm_map_clip_start(map, entry, start);
7354 			vm_map_clip_end(map, entry, end);
7355 
7356 			sub_start = VME_OFFSET(entry);
7357 			sub_end = entry->vme_end - entry->vme_start;
7358 			sub_end += VME_OFFSET(entry);
7359 			local_end = entry->vme_end;
7360 			if (map_pmap == NULL) {
7361 				if (entry->use_pmap) {
7362 					pmap = VME_SUBMAP(entry)->pmap;
7363 					pmap_addr = sub_start;
7364 				} else {
7365 					pmap = map->pmap;
7366 					pmap_addr = start;
7367 				}
7368 				if (entry->wired_count == 0 ||
7369 				    (user_wire && entry->user_wired_count == 0)) {
7370 					if (!user_wire) {
7371 						panic("vm_map_unwire: entry is unwired");
7372 					}
7373 					entry = entry->vme_next;
7374 					continue;
7375 				}
7376 
7377 				/*
7378 				 * Check for holes
7379 				 * Holes: Next entry should be contiguous unless
7380 				 * this is the end of the region.
7381 				 */
7382 				if (((entry->vme_end < end) &&
7383 				    ((entry->vme_next == vm_map_to_entry(map)) ||
7384 				    (entry->vme_next->vme_start
7385 				    > entry->vme_end)))) {
7386 					if (!user_wire) {
7387 						panic("vm_map_unwire: non-contiguous region");
7388 					}
7389 /*
7390  *                                       entry = entry->vme_next;
7391  *                                       continue;
7392  */
7393 				}
7394 
7395 				subtract_wire_counts(map, entry, user_wire);
7396 
7397 				if (entry->wired_count != 0) {
7398 					entry = entry->vme_next;
7399 					continue;
7400 				}
7401 
7402 				entry->in_transition = TRUE;
7403 				tmp_entry = *entry;/* see comment in vm_map_wire() */
7404 
7405 				/*
7406 				 * We can unlock the map now. The in_transition state
7407 				 * guarantees existance of the entry.
7408 				 */
7409 				sub_map = VME_SUBMAP(entry);
7410 				vm_map_reference(sub_map);
7411 				vm_map_unlock(map);
7412 				vm_map_unwire_nested(sub_map,
7413 				    sub_start, sub_end, user_wire, pmap, pmap_addr);
7414 				vm_map_deallocate(sub_map);
7415 				sub_map = VM_MAP_NULL;
7416 				vm_map_lock(map);
7417 
7418 				if (last_timestamp + 1 != map->timestamp) {
7419 					/*
7420 					 * Find the entry again.  It could have been
7421 					 * clipped or deleted after we unlocked the map.
7422 					 */
7423 					if (!vm_map_lookup_entry(map,
7424 					    tmp_entry.vme_start,
7425 					    &first_entry)) {
7426 						if (!user_wire) {
7427 							panic("vm_map_unwire: re-lookup failed");
7428 						}
7429 						entry = first_entry->vme_next;
7430 					} else {
7431 						entry = first_entry;
7432 					}
7433 				}
7434 				last_timestamp = map->timestamp;
7435 
7436 				/*
7437 				 * clear transition bit for all constituent entries
7438 				 * that were in the original entry (saved in
7439 				 * tmp_entry).  Also check for waiters.
7440 				 */
7441 				while ((entry != vm_map_to_entry(map)) &&
7442 				    (entry->vme_start < tmp_entry.vme_end)) {
7443 					assert(entry->in_transition);
7444 					entry->in_transition = FALSE;
7445 					if (entry->needs_wakeup) {
7446 						entry->needs_wakeup = FALSE;
7447 						need_wakeup = TRUE;
7448 					}
7449 					entry = entry->vme_next;
7450 				}
7451 				continue;
7452 			} else {
7453 				tmp_entry = *entry;
7454 				sub_map = VME_SUBMAP(entry);
7455 				vm_map_reference(sub_map);
7456 				vm_map_unlock(map);
7457 				vm_map_unwire_nested(sub_map,
7458 				    sub_start, sub_end, user_wire, map_pmap,
7459 				    pmap_addr);
7460 				vm_map_deallocate(sub_map);
7461 				sub_map = VM_MAP_NULL;
7462 				vm_map_lock(map);
7463 
7464 				if (last_timestamp + 1 != map->timestamp) {
7465 					/*
7466 					 * Find the entry again.  It could have been
7467 					 * clipped or deleted after we unlocked the map.
7468 					 */
7469 					if (!vm_map_lookup_entry(map,
7470 					    tmp_entry.vme_start,
7471 					    &first_entry)) {
7472 						if (!user_wire) {
7473 							panic("vm_map_unwire: re-lookup failed");
7474 						}
7475 						entry = first_entry->vme_next;
7476 					} else {
7477 						entry = first_entry;
7478 					}
7479 				}
7480 				last_timestamp = map->timestamp;
7481 			}
7482 		}
7483 
7484 
7485 		if ((entry->wired_count == 0) ||
7486 		    (user_wire && entry->user_wired_count == 0)) {
7487 			if (!user_wire) {
7488 				panic("vm_map_unwire: entry is unwired");
7489 			}
7490 
7491 			entry = entry->vme_next;
7492 			continue;
7493 		}
7494 
7495 		assert(entry->wired_count > 0 &&
7496 		    (!user_wire || entry->user_wired_count > 0));
7497 
7498 		vm_map_clip_start(map, entry, start);
7499 		vm_map_clip_end(map, entry, end);
7500 
7501 		/*
7502 		 * Check for holes
7503 		 * Holes: Next entry should be contiguous unless
7504 		 *	  this is the end of the region.
7505 		 */
7506 		if (((entry->vme_end < end) &&
7507 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7508 		    (entry->vme_next->vme_start > entry->vme_end)))) {
7509 			if (!user_wire) {
7510 				panic("vm_map_unwire: non-contiguous region");
7511 			}
7512 			/*
7513 			 * entry = entry->vme_next;
7514 			 * continue;
7515 			 */
7516 		}
7517 
7518 		subtract_wire_counts(map, entry, user_wire);
7519 
7520 		if (entry->wired_count != 0) {
7521 			entry = entry->vme_next;
7522 			continue;
7523 		}
7524 
7525 		if (entry->zero_wired_pages) {
7526 			entry->zero_wired_pages = FALSE;
7527 		}
7528 
7529 		entry->in_transition = TRUE;
7530 		tmp_entry = *entry;     /* see comment in vm_map_wire() */
7531 
7532 		/*
7533 		 * We can unlock the map now. The in_transition state
7534 		 * guarantees existance of the entry.
7535 		 */
7536 		vm_map_unlock(map);
7537 		if (map_pmap) {
7538 			vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
7539 			    pmap_addr, tmp_entry.vme_end);
7540 		} else {
7541 			vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
7542 			    tmp_entry.vme_start, tmp_entry.vme_end);
7543 		}
7544 		vm_map_lock(map);
7545 
7546 		if (last_timestamp + 1 != map->timestamp) {
7547 			/*
7548 			 * Find the entry again.  It could have been clipped
7549 			 * or deleted after we unlocked the map.
7550 			 */
7551 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7552 			    &first_entry)) {
7553 				if (!user_wire) {
7554 					panic("vm_map_unwire: re-lookup failed");
7555 				}
7556 				entry = first_entry->vme_next;
7557 			} else {
7558 				entry = first_entry;
7559 			}
7560 		}
7561 		last_timestamp = map->timestamp;
7562 
7563 		/*
7564 		 * clear transition bit for all constituent entries that
7565 		 * were in the original entry (saved in tmp_entry).  Also
7566 		 * check for waiters.
7567 		 */
7568 		while ((entry != vm_map_to_entry(map)) &&
7569 		    (entry->vme_start < tmp_entry.vme_end)) {
7570 			assert(entry->in_transition);
7571 			entry->in_transition = FALSE;
7572 			if (entry->needs_wakeup) {
7573 				entry->needs_wakeup = FALSE;
7574 				need_wakeup = TRUE;
7575 			}
7576 			entry = entry->vme_next;
7577 		}
7578 	}
7579 
7580 	/*
7581 	 * We might have fragmented the address space when we wired this
7582 	 * range of addresses.  Attempt to re-coalesce these VM map entries
7583 	 * with their neighbors now that they're no longer wired.
7584 	 * Under some circumstances, address space fragmentation can
7585 	 * prevent VM object shadow chain collapsing, which can cause
7586 	 * swap space leaks.
7587 	 */
7588 	vm_map_simplify_range(map, start, end);
7589 
7590 	vm_map_unlock(map);
7591 	/*
7592 	 * wake up anybody waiting on entries that we have unwired.
7593 	 */
7594 	if (need_wakeup) {
7595 		vm_map_entry_wakeup(map);
7596 	}
7597 	return KERN_SUCCESS;
7598 }
7599 
7600 kern_return_t
vm_map_unwire(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,boolean_t user_wire)7601 vm_map_unwire(
7602 	vm_map_t                map,
7603 	vm_map_offset_ut        start_u,
7604 	vm_map_offset_ut        end_u,
7605 	boolean_t               user_wire)
7606 {
7607 	return vm_map_unwire_impl(map, start_u, end_u, user_wire,
7608 	           VM_SANITIZE_CALLER_VM_MAP_UNWIRE);
7609 }
7610 
7611 static __attribute__((always_inline, warn_unused_result))
7612 kern_return_t
vm_map_unwire_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * start,vm_map_offset_t * end,vm_map_size_t * size)7613 vm_map_unwire_sanitize(
7614 	vm_map_t                map,
7615 	vm_map_offset_ut        start_u,
7616 	vm_map_offset_ut        end_u,
7617 	vm_sanitize_caller_t    vm_sanitize_caller,
7618 	vm_map_offset_t        *start,
7619 	vm_map_offset_t        *end,
7620 	vm_map_size_t          *size)
7621 {
7622 	return vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7623 	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7624 	           size);
7625 }
7626 
7627 kern_return_t
vm_map_unwire_impl(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,boolean_t user_wire,vm_sanitize_caller_t vm_sanitize_caller)7628 vm_map_unwire_impl(
7629 	vm_map_t                map,
7630 	vm_map_offset_ut        start_u,
7631 	vm_map_offset_ut        end_u,
7632 	boolean_t               user_wire,
7633 	vm_sanitize_caller_t    vm_sanitize_caller)
7634 {
7635 	vm_map_offset_t start, end;
7636 	vm_map_size_t   size;
7637 	kern_return_t   kr;
7638 
7639 	/*
7640 	 * Sanitize any input parameters that are addr/size/prot/inherit
7641 	 */
7642 	kr = vm_map_unwire_sanitize(
7643 		map,
7644 		start_u,
7645 		end_u,
7646 		vm_sanitize_caller,
7647 		&start,
7648 		&end,
7649 		&size);
7650 	if (__improbable(kr != KERN_SUCCESS)) {
7651 		return vm_sanitize_get_kr(kr);
7652 	}
7653 
7654 	return vm_map_unwire_nested(map, start, end,
7655 	           user_wire, (pmap_t)NULL, 0);
7656 }
7657 
7658 
7659 /*
7660  *	vm_map_entry_zap:	[ internal use only ]
7661  *
7662  *	Remove the entry from the target map
7663  *	and put it on a zap list.
7664  */
7665 static void
vm_map_entry_zap(vm_map_t map,vm_map_entry_t entry,vm_map_zap_t zap)7666 vm_map_entry_zap(
7667 	vm_map_t                map,
7668 	vm_map_entry_t          entry,
7669 	vm_map_zap_t            zap)
7670 {
7671 	vm_map_offset_t s, e;
7672 
7673 	s = entry->vme_start;
7674 	e = entry->vme_end;
7675 	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7676 	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7677 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7678 		assert(page_aligned(s));
7679 		assert(page_aligned(e));
7680 	}
7681 	if (entry->map_aligned == TRUE) {
7682 		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7683 		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7684 	}
7685 	assert(entry->wired_count == 0);
7686 	assert(entry->user_wired_count == 0);
7687 	assert(!entry->vme_permanent);
7688 
7689 	vm_map_store_entry_unlink(map, entry, false);
7690 	map->size -= e - s;
7691 
7692 	vm_map_zap_append(zap, entry);
7693 }
7694 
7695 static void
vm_map_submap_pmap_clean(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_t sub_map,vm_map_offset_t offset)7696 vm_map_submap_pmap_clean(
7697 	vm_map_t        map,
7698 	vm_map_offset_t start,
7699 	vm_map_offset_t end,
7700 	vm_map_t        sub_map,
7701 	vm_map_offset_t offset)
7702 {
7703 	vm_map_offset_t submap_start;
7704 	vm_map_offset_t submap_end;
7705 	vm_map_size_t   remove_size;
7706 	vm_map_entry_t  entry;
7707 
7708 	submap_end = offset + (end - start);
7709 	submap_start = offset;
7710 
7711 	vm_map_lock_read(sub_map);
7712 	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
7713 		remove_size = (entry->vme_end - entry->vme_start);
7714 		if (offset > entry->vme_start) {
7715 			remove_size -= offset - entry->vme_start;
7716 		}
7717 
7718 
7719 		if (submap_end < entry->vme_end) {
7720 			remove_size -=
7721 			    entry->vme_end - submap_end;
7722 		}
7723 		if (entry->is_sub_map) {
7724 			vm_map_submap_pmap_clean(
7725 				sub_map,
7726 				start,
7727 				start + remove_size,
7728 				VME_SUBMAP(entry),
7729 				VME_OFFSET(entry));
7730 		} else {
7731 			if (map->mapped_in_other_pmaps &&
7732 			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7733 			    VME_OBJECT(entry) != NULL) {
7734 				vm_object_pmap_protect_options(
7735 					VME_OBJECT(entry),
7736 					(VME_OFFSET(entry) +
7737 					offset -
7738 					entry->vme_start),
7739 					remove_size,
7740 					PMAP_NULL,
7741 					PAGE_SIZE,
7742 					entry->vme_start,
7743 					VM_PROT_NONE,
7744 					PMAP_OPTIONS_REMOVE);
7745 			} else {
7746 				pmap_remove(map->pmap,
7747 				    (addr64_t)start,
7748 				    (addr64_t)(start + remove_size));
7749 			}
7750 		}
7751 	}
7752 
7753 	entry = entry->vme_next;
7754 
7755 	while ((entry != vm_map_to_entry(sub_map))
7756 	    && (entry->vme_start < submap_end)) {
7757 		remove_size = (entry->vme_end - entry->vme_start);
7758 		if (submap_end < entry->vme_end) {
7759 			remove_size -= entry->vme_end - submap_end;
7760 		}
7761 		if (entry->is_sub_map) {
7762 			vm_map_submap_pmap_clean(
7763 				sub_map,
7764 				(start + entry->vme_start) - offset,
7765 				((start + entry->vme_start) - offset) + remove_size,
7766 				VME_SUBMAP(entry),
7767 				VME_OFFSET(entry));
7768 		} else {
7769 			if (map->mapped_in_other_pmaps &&
7770 			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7771 			    VME_OBJECT(entry) != NULL) {
7772 				vm_object_pmap_protect_options(
7773 					VME_OBJECT(entry),
7774 					VME_OFFSET(entry),
7775 					remove_size,
7776 					PMAP_NULL,
7777 					PAGE_SIZE,
7778 					entry->vme_start,
7779 					VM_PROT_NONE,
7780 					PMAP_OPTIONS_REMOVE);
7781 			} else {
7782 				pmap_remove(map->pmap,
7783 				    (addr64_t)((start + entry->vme_start)
7784 				    - offset),
7785 				    (addr64_t)(((start + entry->vme_start)
7786 				    - offset) + remove_size));
7787 			}
7788 		}
7789 		entry = entry->vme_next;
7790 	}
7791 	vm_map_unlock_read(sub_map);
7792 	return;
7793 }
7794 
7795 /*
7796  *     virt_memory_guard_ast:
7797  *
7798  *     Handle the AST callout for a virtual memory guard.
7799  *	   raise an EXC_GUARD exception and terminate the task
7800  *     if configured to do so.
7801  */
7802 void
virt_memory_guard_ast(thread_t thread,mach_exception_data_type_t code,mach_exception_data_type_t subcode)7803 virt_memory_guard_ast(
7804 	thread_t thread,
7805 	mach_exception_data_type_t code,
7806 	mach_exception_data_type_t subcode)
7807 {
7808 	task_t task = get_threadtask(thread);
7809 	assert(task != kernel_task);
7810 	assert(task == current_task());
7811 	kern_return_t sync_exception_result;
7812 	uint32_t behavior;
7813 
7814 	behavior = task->task_exc_guard;
7815 
7816 
7817 	/* Is delivery enabled */
7818 	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7819 		return;
7820 	}
7821 
7822 	/* If only once, make sure we're that once */
7823 	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7824 		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7825 
7826 		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7827 			break;
7828 		}
7829 		behavior = task->task_exc_guard;
7830 		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7831 			return;
7832 		}
7833 	}
7834 
7835 	const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7836 	/* Raise exception synchronously and see if handler claimed it */
7837 	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);
7838 
7839 	if (fatal) {
7840 		/*
7841 		 * If Synchronous EXC_GUARD delivery was successful then
7842 		 * kill the process and return, else kill the process
7843 		 * and deliver the exception via EXC_CORPSE_NOTIFY.
7844 		 */
7845 
7846 
7847 		int flags = PX_DEBUG_NO_HONOR;
7848 		exception_info_t info = {
7849 			.os_reason = OS_REASON_GUARD,
7850 			.exception_type = EXC_GUARD,
7851 			.mx_code = code,
7852 			.mx_subcode = subcode
7853 		};
7854 
7855 		if (sync_exception_result == KERN_SUCCESS) {
7856 			flags |= PX_PSIGNAL;
7857 		}
7858 		exit_with_mach_exception(current_proc(), info, flags);
7859 	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
7860 		/*
7861 		 * If the synchronous EXC_GUARD delivery was not successful,
7862 		 * raise a simulated crash.
7863 		 */
7864 		if (sync_exception_result != KERN_SUCCESS) {
7865 			task_violated_guard(code, subcode, NULL, FALSE);
7866 		}
7867 	}
7868 }
7869 
7870 /*
7871  * Validate policy for VM guard exceptions and encode the correct Mach exception
7872  * code and subcode if the policy allows delivering a guard exception here.
7873  */
7874 static bool
vm_map_guard_exception_internal(vm_map_offset_t address,unsigned reason,mach_exception_code_t * code,mach_exception_data_type_t * subcode)7875 vm_map_guard_exception_internal(
7876 	vm_map_offset_t            address,
7877 	unsigned                   reason,
7878 	mach_exception_code_t      *code,
7879 	mach_exception_data_type_t *subcode)
7880 {
7881 	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7882 	unsigned int target = 0; /* should we pass in pid associated with map? */
7883 
7884 	task_t task = current_task_early();
7885 
7886 	/* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7887 	if (task == NULL || task == kernel_task) {
7888 		return false;
7889 	}
7890 
7891 
7892 	*code = 0;
7893 	EXC_GUARD_ENCODE_TYPE(*code, guard_type);
7894 	EXC_GUARD_ENCODE_FLAVOR(*code, reason);
7895 	EXC_GUARD_ENCODE_TARGET(*code, target);
7896 	*subcode = (uint64_t)address;
7897 
7898 	return true;
7899 }
7900 
7901 /*
7902  *     vm_map_guard_exception:
7903  *
7904  *     Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7905  *
7906  *         `reason` is kGUARD_EXC_DEALLOC_GAP when we find nothing mapped,
7907  *     or if there is a gap in the mapping when a user address space
7908  *     was requested. We report the address of the first gap found.
7909  */
7910 
7911 void
vm_map_guard_exception(vm_map_offset_t address,unsigned reason)7912 vm_map_guard_exception(
7913 	vm_map_offset_t            address,
7914 	unsigned                   reason)
7915 {
7916 	mach_exception_code_t code;
7917 	mach_exception_data_type_t subcode;
7918 	if (vm_map_guard_exception_internal(address, reason, &code, &subcode)) {
7919 		task_t task = current_task();
7920 		bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7921 
7922 		thread_guard_violation(current_thread(), code, subcode, fatal);
7923 	}
7924 }
7925 
7926 
7927 static kern_return_t
vm_map_delete_submap_recurse(vm_map_t submap,vm_map_offset_t submap_start,vm_map_offset_t submap_end)7928 vm_map_delete_submap_recurse(
7929 	vm_map_t submap,
7930 	vm_map_offset_t submap_start,
7931 	vm_map_offset_t submap_end)
7932 {
7933 	vm_map_entry_t submap_entry;
7934 
7935 	/*
7936 	 * Verify that the submap does not contain any "permanent" entries
7937 	 * within the specified range. We permit TPRO ranges to be overwritten
7938 	 * as we only reach this path if TPRO const protection is disabled for a
7939 	 * given map.
7940 	 *
7941 	 * We do not care about gaps.
7942 	 */
7943 
7944 	vm_map_lock(submap);
7945 
7946 	if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7947 		submap_entry = submap_entry->vme_next;
7948 	}
7949 
7950 	for (;
7951 	    submap_entry != vm_map_to_entry(submap) &&
7952 	    submap_entry->vme_start < submap_end;
7953 	    submap_entry = submap_entry->vme_next) {
7954 		if (submap_entry->vme_permanent
7955 #ifdef __arm64e__
7956 		    /* allow TPRO submap entries to be overwritten */
7957 		    && !submap_entry->used_for_tpro
7958 #endif
7959 		    ) {
7960 			/* "permanent" entry -> fail */
7961 			vm_map_unlock(submap);
7962 			return KERN_PROTECTION_FAILURE;
7963 		}
7964 	}
7965 	/* no "permanent" entries in the range -> success */
7966 	vm_map_unlock(submap);
7967 	return KERN_SUCCESS;
7968 }
7969 
7970 __abortlike
7971 static void
__vm_map_delete_misaligned_panic(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)7972 __vm_map_delete_misaligned_panic(
7973 	vm_map_t                map,
7974 	vm_map_offset_t         start,
7975 	vm_map_offset_t         end)
7976 {
7977 	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
7978 	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
7979 }
7980 
7981 __abortlike
7982 static void
__vm_map_delete_failed_panic(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,kern_return_t kr)7983 __vm_map_delete_failed_panic(
7984 	vm_map_t                map,
7985 	vm_map_offset_t         start,
7986 	vm_map_offset_t         end,
7987 	kern_return_t           kr)
7988 {
7989 	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
7990 	    map, (uint64_t)start, (uint64_t)end, kr);
7991 }
7992 
7993 __abortlike
7994 static void
__vm_map_delete_gap_panic(vm_map_t map,vm_map_offset_t where,vm_map_offset_t start,vm_map_offset_t end)7995 __vm_map_delete_gap_panic(
7996 	vm_map_t                map,
7997 	vm_map_offset_t         where,
7998 	vm_map_offset_t         start,
7999 	vm_map_offset_t         end)
8000 {
8001 	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
8002 	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
8003 }
8004 
8005 __abortlike
8006 static void
__vm_map_delete_permanent_panic(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t entry)8007 __vm_map_delete_permanent_panic(
8008 	vm_map_t                map,
8009 	vm_map_offset_t         start,
8010 	vm_map_offset_t         end,
8011 	vm_map_entry_t          entry)
8012 {
8013 	panic("vm_map_delete(%p,0x%llx,0x%llx): "
8014 	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
8015 	    map, (uint64_t)start, (uint64_t)end, entry,
8016 	    (uint64_t)entry->vme_start,
8017 	    (uint64_t)entry->vme_end);
8018 }
8019 
8020 __options_decl(vm_map_delete_state_t, uint32_t, {
8021 	VMDS_NONE               = 0x0000,
8022 
8023 	VMDS_FOUND_GAP          = 0x0001,
8024 	VMDS_GAPS_OK            = 0x0002,
8025 
8026 	VMDS_KERNEL_PMAP        = 0x0004,
8027 	VMDS_NEEDS_LOOKUP       = 0x0008,
8028 	VMDS_NEEDS_WAKEUP       = 0x0010,
8029 	VMDS_KERNEL_KMEMPTR     = 0x0020
8030 });
8031 
8032 /*
8033  * vm_map_clamp_to_pmap(map, start, end)
8034  *
8035  * Modify *start and *end so they fall within the bounds of map->pmap.
8036  */
8037 #if MACH_ASSERT
8038 static void
vm_map_clamp_to_pmap(vm_map_t map,vm_map_address_t * start,vm_map_address_t * end)8039 vm_map_clamp_to_pmap(vm_map_t map, vm_map_address_t *start, vm_map_address_t *end)
8040 {
8041 	vm_map_address_t min;
8042 	vm_map_address_t max;
8043 
8044 #if __x86_64__
8045 	/* x86_64 struct pmap does not have min and max fields */
8046 	if (map->pmap == kernel_pmap) {
8047 		min = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
8048 		max = VM_MAX_KERNEL_ADDRESS;
8049 	} else {
8050 		min = VM_MAP_MIN_ADDRESS;
8051 		max = VM_MAP_MAX_ADDRESS;
8052 	}
8053 #else
8054 	min = map->pmap->min;
8055 	max = map->pmap->max;
8056 #endif
8057 
8058 	if (*start < min) {
8059 		*start = min;
8060 	} else if (*start > max) {
8061 		*start = max;
8062 	}
8063 	if (*end < min) {
8064 		*end = min;
8065 	} else if (*end > max) {
8066 		*end = max;
8067 	}
8068 }
8069 #endif
8070 
8071 int vm_log_map_delete_permanent_prot_none = 0;
8072 /*
8073  *	vm_map_delete:	[ internal use only ]
8074  *
8075  *	Deallocates the given address range from the target map.
8076  *	Removes all user wirings. Unwires one kernel wiring if
8077  *	VM_MAP_REMOVE_KUNWIRE is set.  Waits for kernel wirings to go
8078  *	away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set.  Sleeps
8079  *	interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
8080  *
8081  *
8082  *	When the map is a kernel map, then any error in removing mappings
8083  *	will lead to a panic so that clients do not have to repeat the panic
8084  *	code at each call site.  If VM_MAP_REMOVE_INTERRUPTIBLE
8085  *	is also passed, then KERN_ABORTED will not lead to a panic.
8086  *
8087  *	This routine is called with map locked and leaves map locked.
8088  */
8089 static kmem_return_t
vm_map_delete(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard,vm_map_zap_t zap_list)8090 vm_map_delete(
8091 	vm_map_t                map,
8092 	vm_map_offset_t         start,
8093 	vm_map_offset_t         end,
8094 	vmr_flags_t             flags,
8095 	kmem_guard_t            guard,
8096 	vm_map_zap_t            zap_list)
8097 {
8098 	vm_map_entry_t          entry, next;
8099 	int                     interruptible;
8100 	vm_map_offset_t         gap_start = 0;
8101 	vm_map_offset_t         clear_in_transition_end = 0;
8102 	__unused vm_map_offset_t save_start = start;
8103 	__unused vm_map_offset_t save_end = end;
8104 	vm_map_delete_state_t   state = VMDS_NONE;
8105 	kmem_return_t           ret = { };
8106 	vm_map_range_id_t       range_id = 0;
8107 	struct kmem_page_meta  *meta = NULL;
8108 	uint32_t                size_idx, slot_idx;
8109 	struct mach_vm_range    slot;
8110 
8111 	if (vm_map_pmap(map) == kernel_pmap) {
8112 		state |= VMDS_KERNEL_PMAP;
8113 		range_id = kmem_addr_get_range(start, end - start);
8114 		if (kmem_is_ptr_range(range_id)) {
8115 			state |= VMDS_KERNEL_KMEMPTR;
8116 			slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8117 			    &size_idx, &slot);
8118 		}
8119 	}
8120 
8121 	if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8122 		state |= VMDS_GAPS_OK;
8123 	}
8124 
8125 	if (map->corpse_source &&
8126 	    !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8127 	    !map->terminated) {
8128 		/*
8129 		 * The map is being used for corpses related diagnostics.
8130 		 * So skip any entry removal to avoid perturbing the map state.
8131 		 * The cleanup will happen in task_terminate_internal after the
8132 		 * call to task_port_no_senders.
8133 		 */
8134 		goto out;
8135 	}
8136 
8137 	interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8138 	    THREAD_ABORTSAFE : THREAD_UNINT;
8139 
8140 	if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8141 	    (start & VM_MAP_PAGE_MASK(map))) {
8142 		__vm_map_delete_misaligned_panic(map, start, end);
8143 	}
8144 
8145 	if ((state & VMDS_GAPS_OK) == 0) {
8146 		/*
8147 		 * If the map isn't terminated then all deletions must have
8148 		 * no gaps, and be within the [min, max) of the map.
8149 		 *
8150 		 * We got here without VM_MAP_RANGE_CHECK() being called,
8151 		 * and hence must validate bounds manually.
8152 		 *
8153 		 * It is worth noting that because vm_deallocate() will
8154 		 * round_page() the deallocation size, it's possible for "end"
8155 		 * to be 0 here due to overflow. We hence must treat it as being
8156 		 * beyond vm_map_max(map).
8157 		 *
8158 		 * Similarly, end < start means some wrap around happend,
8159 		 * which should cause an error or panic.
8160 		 */
8161 		if (end == 0 || end > vm_map_max(map)) {
8162 			state |= VMDS_FOUND_GAP;
8163 			gap_start = vm_map_max(map);
8164 			if (state & VMDS_KERNEL_PMAP) {
8165 				__vm_map_delete_gap_panic(map,
8166 				    gap_start, start, end);
8167 			}
8168 			goto out;
8169 		}
8170 
8171 		if (end < start) {
8172 			if (state & VMDS_KERNEL_PMAP) {
8173 				__vm_map_delete_gap_panic(map,
8174 				    vm_map_max(map), start, end);
8175 			}
8176 			ret.kmr_return = KERN_INVALID_ARGUMENT;
8177 			goto out;
8178 		}
8179 
8180 		if (start < vm_map_min(map)) {
8181 			state |= VMDS_FOUND_GAP;
8182 			gap_start = start;
8183 			if (state & VMDS_KERNEL_PMAP) {
8184 				__vm_map_delete_gap_panic(map,
8185 				    gap_start, start, end);
8186 			}
8187 			goto out;
8188 		}
8189 	} else {
8190 		/*
8191 		 * If the map is terminated, we must accept start/end
8192 		 * being beyond the boundaries of the map as this is
8193 		 * how some of the mappings like commpage mappings
8194 		 * can be destroyed (they're outside of those bounds).
8195 		 *
8196 		 * end < start is still something we can't cope with,
8197 		 * so just bail.
8198 		 */
8199 		if (end < start) {
8200 			goto out;
8201 		}
8202 	}
8203 
8204 
8205 	/*
8206 	 *	Find the start of the region.
8207 	 *
8208 	 *	If in a superpage, extend the range
8209 	 *	to include the start of the mapping.
8210 	 */
8211 	while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8212 		if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8213 			start = SUPERPAGE_ROUND_DOWN(start);
8214 		} else {
8215 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8216 			break;
8217 		}
8218 	}
8219 
8220 	if (entry->superpage_size) {
8221 		end = SUPERPAGE_ROUND_UP(end);
8222 	}
8223 
8224 	/*
8225 	 *	Step through all entries in this region
8226 	 */
8227 	for (vm_map_offset_t s = start; s < end;) {
8228 		/*
8229 		 * At this point, we have deleted all the memory entries
8230 		 * in [start, s) and are proceeding with the [s, end) range.
8231 		 *
8232 		 * This loop might drop the map lock, and it is possible that
8233 		 * some memory was already reallocated within [start, s)
8234 		 * and we don't want to mess with those entries.
8235 		 *
8236 		 * Some of those entries could even have been re-assembled
8237 		 * with an entry after "s" (in vm_map_simplify_entry()), so
8238 		 * we may have to vm_map_clip_start() again.
8239 		 *
8240 		 * When clear_in_transition_end is set, the we had marked
8241 		 * [start, clear_in_transition_end) as "in_transition"
8242 		 * during a previous iteration and we need to clear it.
8243 		 */
8244 
8245 		/*
8246 		 * Step 1: If needed (because we dropped locks),
8247 		 *         lookup the entry again.
8248 		 *
8249 		 *         If we're coming back from unwiring (Step 5),
8250 		 *         we also need to mark the entries as no longer
8251 		 *         in transition after that.
8252 		 */
8253 
8254 		if (state & VMDS_NEEDS_LOOKUP) {
8255 			state &= ~VMDS_NEEDS_LOOKUP;
8256 
8257 			if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8258 				SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8259 			}
8260 
8261 			if (state & VMDS_KERNEL_KMEMPTR) {
8262 				kmem_validate_slot(s, meta, size_idx, slot_idx);
8263 			}
8264 		}
8265 
8266 		if (clear_in_transition_end) {
8267 			for (vm_map_entry_t it = entry;
8268 			    it != vm_map_to_entry(map) &&
8269 			    it->vme_start < clear_in_transition_end;
8270 			    it = it->vme_next) {
8271 				assert(it->in_transition);
8272 				it->in_transition = FALSE;
8273 				if (it->needs_wakeup) {
8274 					it->needs_wakeup = FALSE;
8275 					state |= VMDS_NEEDS_WAKEUP;
8276 				}
8277 			}
8278 
8279 			clear_in_transition_end = 0;
8280 		}
8281 
8282 
8283 		/*
8284 		 * Step 2: Perform various policy checks
8285 		 *         before we do _anything_ to this entry.
8286 		 */
8287 
8288 		if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8289 			if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8290 				/*
8291 				 * Either we found a gap already,
8292 				 * or we are tearing down a map,
8293 				 * keep going.
8294 				 */
8295 			} else if (state & VMDS_KERNEL_PMAP) {
8296 				__vm_map_delete_gap_panic(map, s, start, end);
8297 			} else if (s < end) {
8298 				state |= VMDS_FOUND_GAP;
8299 				gap_start = s;
8300 			}
8301 
8302 			if (entry == vm_map_to_entry(map) ||
8303 			    end <= entry->vme_start) {
8304 				break;
8305 			}
8306 
8307 			s = entry->vme_start;
8308 		}
8309 
8310 		if (state & VMDS_KERNEL_PMAP) {
8311 			/*
8312 			 * In the kernel map and its submaps,
8313 			 * permanent entries never die, even
8314 			 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8315 			 */
8316 			if (entry->vme_permanent) {
8317 				__vm_map_delete_permanent_panic(map, start, end, entry);
8318 			}
8319 
8320 			if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8321 				end = entry->vme_end;
8322 				flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8323 			}
8324 
8325 			/*
8326 			 * In the kernel map and its submaps,
8327 			 * the removal of an atomic/guarded entry is strict.
8328 			 *
8329 			 * An atomic entry is processed only if it was
8330 			 * specifically targeted.
8331 			 *
8332 			 * We might have deleted non-atomic entries before
8333 			 * we reach this this point however...
8334 			 */
8335 			kmem_entry_validate_guard(map, entry,
8336 			    start, end - start, guard);
8337 		}
8338 
8339 		/*
8340 		 * Step 2.1: handle "permanent" and "submap" entries
8341 		 * *before* clipping to avoid triggering some unnecessary
8342 		 * un-nesting of the shared region.
8343 		 */
8344 		if (entry->vme_permanent && entry->is_sub_map) {
8345 //			printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8346 			/*
8347 			 * Un-mapping a "permanent" mapping of a user-space
8348 			 * submap is not allowed unless...
8349 			 */
8350 			if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8351 				/*
8352 				 * a. explicitly requested by the kernel caller.
8353 				 */
8354 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8355 			} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8356 			    developer_mode_state()) {
8357 				/*
8358 				 * b. we're in "developer" mode (for
8359 				 *    breakpoints, dtrace probes, ...).
8360 				 */
8361 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8362 			} else if (map->terminated) {
8363 				/*
8364 				 * c. this is the final address space cleanup.
8365 				 */
8366 //				printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8367 			} else {
8368 				vm_map_offset_t submap_start, submap_end;
8369 				kern_return_t submap_kr;
8370 
8371 				/*
8372 				 * Check if there are any "permanent" mappings
8373 				 * in this range in the submap.
8374 				 */
8375 				if (entry->in_transition) {
8376 					/* can that even happen ? */
8377 					goto in_transition;
8378 				}
8379 				/* compute the clipped range in the submap */
8380 				submap_start = s - entry->vme_start;
8381 				submap_start += VME_OFFSET(entry);
8382 				submap_end = end - entry->vme_start;
8383 				submap_end += VME_OFFSET(entry);
8384 				submap_kr = vm_map_delete_submap_recurse(
8385 					VME_SUBMAP(entry),
8386 					submap_start,
8387 					submap_end);
8388 				if (submap_kr != KERN_SUCCESS) {
8389 					/*
8390 					 * There are some "permanent" mappings
8391 					 * in the submap: we are not allowed
8392 					 * to remove this range.
8393 					 */
8394 					printf("%d[%s] removing permanent submap entry "
8395 					    "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8396 					    proc_selfpid(),
8397 					    (get_bsdtask_info(current_task())
8398 					    ? proc_name_address(get_bsdtask_info(current_task()))
8399 					    : "?"), entry,
8400 					    (uint64_t)entry->vme_start,
8401 					    (uint64_t)entry->vme_end,
8402 					    entry->protection,
8403 					    entry->max_protection);
8404 					DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8405 					    vm_map_entry_t, entry,
8406 					    vm_map_offset_t, entry->vme_start,
8407 					    vm_map_offset_t, entry->vme_end,
8408 					    vm_prot_t, entry->protection,
8409 					    vm_prot_t, entry->max_protection,
8410 					    int, VME_ALIAS(entry));
8411 					ret.kmr_return = KERN_PROTECTION_FAILURE;
8412 					goto out;
8413 				}
8414 				/* no permanent mappings: proceed */
8415 			}
8416 		}
8417 
8418 		/*
8419 		 * Step 3: Perform any clipping needed.
8420 		 *
8421 		 *         After this, "entry" starts at "s", ends before "end"
8422 		 */
8423 
8424 		if (entry->vme_start < s) {
8425 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8426 			    entry->map_aligned &&
8427 			    !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8428 				/*
8429 				 * The entry will no longer be map-aligned
8430 				 * after clipping and the caller said it's OK.
8431 				 */
8432 				entry->map_aligned = FALSE;
8433 			}
8434 			vm_map_clip_start(map, entry, s);
8435 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8436 		}
8437 
8438 		if (end < entry->vme_end) {
8439 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8440 			    entry->map_aligned &&
8441 			    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8442 				/*
8443 				 * The entry will no longer be map-aligned
8444 				 * after clipping and the caller said it's OK.
8445 				 */
8446 				entry->map_aligned = FALSE;
8447 			}
8448 			vm_map_clip_end(map, entry, end);
8449 		}
8450 
8451 		if (entry->vme_permanent && entry->is_sub_map) {
8452 			/*
8453 			 * We already went through step 2.1 which did not deny
8454 			 * the removal of this "permanent" and "is_sub_map"
8455 			 * entry.
8456 			 * Now that we've clipped what we actually want to
8457 			 * delete, undo the "permanent" part to allow the
8458 			 * removal to proceed.
8459 			 */
8460 			DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8461 			    vm_map_entry_t, entry,
8462 			    vm_map_offset_t, entry->vme_start,
8463 			    vm_map_offset_t, entry->vme_end,
8464 			    vm_prot_t, entry->protection,
8465 			    vm_prot_t, entry->max_protection,
8466 			    int, VME_ALIAS(entry));
8467 			entry->vme_permanent = false;
8468 		}
8469 
8470 		assert(s == entry->vme_start);
8471 		assert(entry->vme_end <= end);
8472 
8473 
8474 		/*
8475 		 * Step 4: If the entry is in flux, wait for this to resolve.
8476 		 */
8477 
8478 		if (entry->in_transition) {
8479 			wait_result_t wait_result;
8480 
8481 in_transition:
8482 			/*
8483 			 * Another thread is wiring/unwiring this entry.
8484 			 * Let the other thread know we are waiting.
8485 			 */
8486 
8487 			entry->needs_wakeup = TRUE;
8488 
8489 			/*
8490 			 * wake up anybody waiting on entries that we have
8491 			 * already unwired/deleted.
8492 			 */
8493 			if (state & VMDS_NEEDS_WAKEUP) {
8494 				vm_map_entry_wakeup(map);
8495 				state &= ~VMDS_NEEDS_WAKEUP;
8496 			}
8497 
8498 			wait_result = vm_map_entry_wait(map, interruptible);
8499 
8500 			if (interruptible &&
8501 			    wait_result == THREAD_INTERRUPTED) {
8502 				/*
8503 				 * We do not clear the needs_wakeup flag,
8504 				 * since we cannot tell if we were the only one.
8505 				 */
8506 				ret.kmr_return = KERN_ABORTED;
8507 				return ret;
8508 			}
8509 
8510 			/*
8511 			 * The entry could have been clipped or it
8512 			 * may not exist anymore.  Look it up again.
8513 			 */
8514 			state |= VMDS_NEEDS_LOOKUP;
8515 			continue;
8516 		}
8517 
8518 
8519 		/*
8520 		 * Step 5: Handle wiring
8521 		 */
8522 
8523 		if (entry->wired_count) {
8524 			struct vm_map_entry tmp_entry;
8525 			boolean_t           user_wire;
8526 			unsigned int        last_timestamp;
8527 
8528 			user_wire = entry->user_wired_count > 0;
8529 
8530 			/*
8531 			 *      Remove a kernel wiring if requested
8532 			 */
8533 			if (flags & VM_MAP_REMOVE_KUNWIRE) {
8534 				entry->wired_count--;
8535 				vme_btref_consider_and_put(entry);
8536 			}
8537 
8538 			/*
8539 			 *	Remove all user wirings for proper accounting
8540 			 */
8541 			while (entry->user_wired_count) {
8542 				subtract_wire_counts(map, entry, user_wire);
8543 			}
8544 
8545 			/*
8546 			 * All our DMA I/O operations in IOKit are currently
8547 			 * done by wiring through the map entries of the task
8548 			 * requesting the I/O.
8549 			 *
8550 			 * Because of this, we must always wait for kernel wirings
8551 			 * to go away on the entries before deleting them.
8552 			 *
8553 			 * Any caller who wants to actually remove a kernel wiring
8554 			 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8555 			 * properly remove one wiring instead of blasting through
8556 			 * them all.
8557 			 */
8558 			if (entry->wired_count != 0) {
8559 				assert(map != kernel_map);
8560 				/*
8561 				 * Cannot continue.  Typical case is when
8562 				 * a user thread has physical io pending on
8563 				 * on this page.  Either wait for the
8564 				 * kernel wiring to go away or return an
8565 				 * error.
8566 				 */
8567 				wait_result_t wait_result;
8568 
8569 				entry->needs_wakeup = TRUE;
8570 				wait_result = vm_map_entry_wait(map,
8571 				    interruptible);
8572 
8573 				if (interruptible &&
8574 				    wait_result == THREAD_INTERRUPTED) {
8575 					/*
8576 					 * We do not clear the
8577 					 * needs_wakeup flag, since we
8578 					 * cannot tell if we were the
8579 					 * only one.
8580 					 */
8581 					ret.kmr_return = KERN_ABORTED;
8582 					return ret;
8583 				}
8584 
8585 
8586 				/*
8587 				 * The entry could have been clipped or
8588 				 * it may not exist anymore.  Look it
8589 				 * up again.
8590 				 */
8591 				state |= VMDS_NEEDS_LOOKUP;
8592 				continue;
8593 			}
8594 
8595 			/*
8596 			 * We can unlock the map now.
8597 			 *
8598 			 * The entry might be split once we unlock the map,
8599 			 * but we need the range as defined by this entry
8600 			 * to be stable. So we must make a local copy.
8601 			 *
8602 			 * The underlying objects do not change during clips,
8603 			 * and the in_transition state guarentees existence
8604 			 * of the entry.
8605 			 */
8606 			last_timestamp = map->timestamp;
8607 			entry->in_transition = TRUE;
8608 			tmp_entry = *entry;
8609 			vm_map_unlock(map);
8610 
8611 			if (tmp_entry.is_sub_map) {
8612 				vm_map_t sub_map;
8613 				vm_map_offset_t sub_start, sub_end;
8614 				pmap_t pmap;
8615 				vm_map_offset_t pmap_addr;
8616 
8617 
8618 				sub_map = VME_SUBMAP(&tmp_entry);
8619 				sub_start = VME_OFFSET(&tmp_entry);
8620 				sub_end = sub_start + (tmp_entry.vme_end -
8621 				    tmp_entry.vme_start);
8622 				if (tmp_entry.use_pmap) {
8623 					pmap = sub_map->pmap;
8624 					pmap_addr = tmp_entry.vme_start;
8625 				} else {
8626 					pmap = map->pmap;
8627 					pmap_addr = tmp_entry.vme_start;
8628 				}
8629 				(void) vm_map_unwire_nested(sub_map,
8630 				    sub_start, sub_end,
8631 				    user_wire,
8632 				    pmap, pmap_addr);
8633 			} else {
8634 				vm_map_offset_t entry_end = tmp_entry.vme_end;
8635 				vm_map_offset_t max_end;
8636 
8637 				if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8638 					max_end = end - VM_MAP_PAGE_SIZE(map);
8639 					if (entry_end > max_end) {
8640 						entry_end = max_end;
8641 					}
8642 				}
8643 
8644 				if (tmp_entry.vme_kernel_object) {
8645 					pmap_protect_options(
8646 						map->pmap,
8647 						tmp_entry.vme_start,
8648 						entry_end,
8649 						VM_PROT_NONE,
8650 						PMAP_OPTIONS_REMOVE,
8651 						NULL);
8652 				}
8653 				vm_fault_unwire(map, &tmp_entry,
8654 				    tmp_entry.vme_kernel_object, map->pmap,
8655 				    tmp_entry.vme_start, entry_end);
8656 			}
8657 
8658 			vm_map_lock(map);
8659 
8660 			/*
8661 			 * Unwiring happened, we can now go back to deleting
8662 			 * them (after we clear the in_transition bit for the range).
8663 			 */
8664 			if (last_timestamp + 1 != map->timestamp) {
8665 				state |= VMDS_NEEDS_LOOKUP;
8666 			}
8667 			clear_in_transition_end = tmp_entry.vme_end;
8668 			continue;
8669 		}
8670 
8671 		assert(entry->wired_count == 0);
8672 		assert(entry->user_wired_count == 0);
8673 
8674 
8675 		/*
8676 		 * Step 6: Entry is unwired and ready for us to delete !
8677 		 */
8678 
8679 		if (!entry->vme_permanent) {
8680 			/*
8681 			 * Typical case: the entry really shouldn't be permanent
8682 			 */
8683 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8684 		    (entry->protection & VM_PROT_EXECUTE) &&
8685 		    developer_mode_state()) {
8686 			/*
8687 			 * Allow debuggers to undo executable mappings
8688 			 * when developer mode is on.
8689 			 */
8690 #if 0
8691 			printf("FBDP %d[%s] removing permanent executable entry "
8692 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8693 			    proc_selfpid(),
8694 			    (current_task()->bsd_info
8695 			    ? proc_name_address(current_task()->bsd_info)
8696 			    : "?"), entry,
8697 			    (uint64_t)entry->vme_start,
8698 			    (uint64_t)entry->vme_end,
8699 			    entry->protection,
8700 			    entry->max_protection);
8701 #endif
8702 			entry->vme_permanent = FALSE;
8703 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8704 #if 0
8705 			printf("FBDP %d[%s] removing permanent entry "
8706 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8707 			    proc_selfpid(),
8708 			    (current_task()->bsd_info
8709 			    ? proc_name_address(current_task()->bsd_info)
8710 			    : "?"), entry,
8711 			    (uint64_t)entry->vme_start,
8712 			    (uint64_t)entry->vme_end,
8713 			    entry->protection,
8714 			    entry->max_protection);
8715 #endif
8716 			entry->vme_permanent = FALSE;
8717 #if CODE_SIGNING_MONITOR
8718 		} else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8719 			entry->vme_permanent = FALSE;
8720 
8721 			printf("%d[%s] %s(0x%llx,0x%llx): "
8722 			    "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8723 			    "prot 0x%x/0x%x\n",
8724 			    proc_selfpid(),
8725 			    (get_bsdtask_info(current_task())
8726 			    ? proc_name_address(get_bsdtask_info(current_task()))
8727 			    : "?"),
8728 			    __FUNCTION__,
8729 			    (uint64_t)start,
8730 			    (uint64_t)end,
8731 			    (uint64_t)entry->vme_start,
8732 			    (uint64_t)entry->vme_end,
8733 			    entry->protection,
8734 			    entry->max_protection);
8735 #endif
8736 		} else {
8737 			DTRACE_VM6(vm_map_delete_permanent,
8738 			    vm_map_entry_t, entry,
8739 			    vm_map_offset_t, entry->vme_start,
8740 			    vm_map_offset_t, entry->vme_end,
8741 			    vm_prot_t, entry->protection,
8742 			    vm_prot_t, entry->max_protection,
8743 			    int, VME_ALIAS(entry));
8744 		}
8745 
8746 		if (entry->is_sub_map) {
8747 			assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8748 			    "map %p (%d) entry %p submap %p (%d)\n",
8749 			    map, VM_MAP_PAGE_SHIFT(map), entry,
8750 			    VME_SUBMAP(entry),
8751 			    VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8752 			if (entry->use_pmap) {
8753 #ifndef NO_NESTED_PMAP
8754 				int pmap_flags;
8755 
8756 				if (map->terminated) {
8757 					/*
8758 					 * This is the final cleanup of the
8759 					 * address space being terminated.
8760 					 * No new mappings are expected and
8761 					 * we don't really need to unnest the
8762 					 * shared region (and lose the "global"
8763 					 * pmap mappings, if applicable).
8764 					 *
8765 					 * Tell the pmap layer that we're
8766 					 * "clean" wrt nesting.
8767 					 */
8768 					pmap_flags = PMAP_UNNEST_CLEAN;
8769 				} else {
8770 					/*
8771 					 * We're unmapping part of the nested
8772 					 * shared region, so we can't keep the
8773 					 * nested pmap.
8774 					 */
8775 					pmap_flags = 0;
8776 				}
8777 				pmap_unnest_options(
8778 					map->pmap,
8779 					(addr64_t)entry->vme_start,
8780 					entry->vme_end - entry->vme_start,
8781 					pmap_flags);
8782 #endif  /* NO_NESTED_PMAP */
8783 				if (map->mapped_in_other_pmaps &&
8784 				    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8785 					/* clean up parent map/maps */
8786 					vm_map_submap_pmap_clean(
8787 						map, entry->vme_start,
8788 						entry->vme_end,
8789 						VME_SUBMAP(entry),
8790 						VME_OFFSET(entry));
8791 				}
8792 			} else {
8793 				vm_map_submap_pmap_clean(
8794 					map, entry->vme_start, entry->vme_end,
8795 					VME_SUBMAP(entry),
8796 					VME_OFFSET(entry));
8797 			}
8798 		} else if (entry->vme_kernel_object ||
8799 		    VME_OBJECT(entry) == compressor_object) {
8800 			/*
8801 			 * nothing to do
8802 			 */
8803 		} else if (map->mapped_in_other_pmaps &&
8804 		    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8805 			vm_object_pmap_protect_options(
8806 				VME_OBJECT(entry), VME_OFFSET(entry),
8807 				entry->vme_end - entry->vme_start,
8808 				PMAP_NULL,
8809 				PAGE_SIZE,
8810 				entry->vme_start,
8811 				VM_PROT_NONE,
8812 				PMAP_OPTIONS_REMOVE);
8813 		} else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8814 		    (state & VMDS_KERNEL_PMAP)) {
8815 			/* Remove translations associated
8816 			 * with this range unless the entry
8817 			 * does not have an object, or
8818 			 * it's the kernel map or a descendant
8819 			 * since the platform could potentially
8820 			 * create "backdoor" mappings invisible
8821 			 * to the VM. It is expected that
8822 			 * objectless, non-kernel ranges
8823 			 * do not have such VM invisible
8824 			 * translations.
8825 			 */
8826 			vm_map_address_t remove_start = entry->vme_start;
8827 			vm_map_address_t remove_end = entry->vme_end;
8828 #if MACH_ASSERT
8829 			/*
8830 			 * Prevent panics in pmap_remove() from some vm test code
8831 			 * which uses virtual address ranges that pmap disallows.
8832 			 */
8833 			if (thread_get_test_option(test_option_vm_map_clamp_pmap_remove)) {
8834 				vm_map_clamp_to_pmap(map, &remove_start, &remove_end);
8835 			}
8836 #endif /* MACH_ASSERT */
8837 			pmap_remove(map->pmap, remove_start, remove_end);
8838 		}
8839 
8840 #if DEBUG
8841 		/*
8842 		 * All pmap mappings for this map entry must have been
8843 		 * cleared by now.
8844 		 */
8845 		assert(pmap_is_empty(map->pmap,
8846 		    entry->vme_start,
8847 		    entry->vme_end));
8848 #endif /* DEBUG */
8849 
8850 		if (entry->iokit_acct) {
8851 			/* alternate accounting */
8852 			DTRACE_VM4(vm_map_iokit_unmapped_region,
8853 			    vm_map_t, map,
8854 			    vm_map_offset_t, entry->vme_start,
8855 			    vm_map_offset_t, entry->vme_end,
8856 			    int, VME_ALIAS(entry));
8857 			vm_map_iokit_unmapped_region(map,
8858 			    (entry->vme_end -
8859 			    entry->vme_start));
8860 			entry->iokit_acct = FALSE;
8861 			entry->use_pmap = FALSE;
8862 		}
8863 
8864 		/* move "s" forward */
8865 		s    = entry->vme_end;
8866 		next = entry->vme_next;
8867 		if (!entry->map_aligned) {
8868 			vm_map_offset_t rounded_s;
8869 
8870 			/*
8871 			 * Skip artificial gap due to mis-aligned entry
8872 			 * on devices with a page size smaller than the
8873 			 * map's page size (i.e. 16k task on a 4k device).
8874 			 */
8875 			rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8876 			if (next == vm_map_to_entry(map)) {
8877 				s = rounded_s;
8878 			} else if (s < rounded_s) {
8879 				s = MIN(rounded_s, next->vme_start);
8880 			}
8881 		}
8882 		ret.kmr_size += s - entry->vme_start;
8883 
8884 		if (entry->vme_permanent) {
8885 			/*
8886 			 * A permanent entry can not be removed, so leave it
8887 			 * in place but remove all access permissions.
8888 			 */
8889 			if (__improbable(vm_log_map_delete_permanent_prot_none)) {
8890 				printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8891 				    __FUNCTION__, __LINE__,
8892 				    proc_selfpid(),
8893 				    (get_bsdtask_info(current_task())
8894 				    ? proc_name_address(get_bsdtask_info(current_task()))
8895 				    : "?"),
8896 				    map,
8897 				    entry,
8898 				    (uint64_t)entry->vme_start,
8899 				    (uint64_t)entry->vme_end,
8900 				    entry->is_sub_map,
8901 				    entry->protection,
8902 				    entry->max_protection);
8903 			}
8904 			DTRACE_VM6(vm_map_delete_permanent_prot_none,
8905 			    vm_map_entry_t, entry,
8906 			    vm_map_offset_t, entry->vme_start,
8907 			    vm_map_offset_t, entry->vme_end,
8908 			    vm_prot_t, entry->protection,
8909 			    vm_prot_t, entry->max_protection,
8910 			    int, VME_ALIAS(entry));
8911 			entry->protection = VM_PROT_NONE;
8912 			entry->max_protection = VM_PROT_NONE;
8913 #ifdef __arm64e__
8914 			entry->used_for_tpro = FALSE;
8915 #endif
8916 		} else {
8917 			vm_map_entry_zap(map, entry, zap_list);
8918 		}
8919 
8920 		entry = next;
8921 		next  = VM_MAP_ENTRY_NULL;
8922 
8923 		if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8924 			unsigned int last_timestamp = map->timestamp++;
8925 
8926 			if (lck_rw_lock_yield_exclusive(&map->lock,
8927 			    LCK_RW_YIELD_ANY_WAITER)) {
8928 				if (last_timestamp != map->timestamp + 1) {
8929 					state |= VMDS_NEEDS_LOOKUP;
8930 				}
8931 			} else {
8932 				/* we didn't yield, undo our change */
8933 				map->timestamp--;
8934 			}
8935 		}
8936 	}
8937 
8938 	if (map->wait_for_space) {
8939 		thread_wakeup((event_t) map);
8940 	}
8941 
8942 	if (state & VMDS_NEEDS_WAKEUP) {
8943 		vm_map_entry_wakeup(map);
8944 	}
8945 
8946 out:
8947 	if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8948 		__vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8949 	}
8950 
8951 	if (state & VMDS_KERNEL_KMEMPTR) {
8952 		kmem_free_space(start, end, range_id, &slot);
8953 	}
8954 
8955 	if (state & VMDS_FOUND_GAP) {
8956 		DTRACE_VM3(kern_vm_deallocate_gap,
8957 		    vm_map_offset_t, gap_start,
8958 		    vm_map_offset_t, save_start,
8959 		    vm_map_offset_t, save_end);
8960 		if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8961 			ret.kmr_return = KERN_INVALID_VALUE;
8962 		} else {
8963 			vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8964 		}
8965 	}
8966 
8967 	return ret;
8968 }
8969 
8970 kmem_return_t
vm_map_remove_and_unlock(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8971 vm_map_remove_and_unlock(
8972 	vm_map_t        map,
8973 	vm_map_offset_t start,
8974 	vm_map_offset_t end,
8975 	vmr_flags_t     flags,
8976 	kmem_guard_t    guard)
8977 {
8978 	kmem_return_t ret;
8979 	VM_MAP_ZAP_DECLARE(zap);
8980 
8981 	ret = vm_map_delete(map, start, end, flags, guard, &zap);
8982 	vm_map_unlock(map);
8983 
8984 	vm_map_zap_dispose(&zap);
8985 
8986 	return ret;
8987 }
8988 
8989 /*
8990  *	vm_map_remove_guard:
8991  *
8992  *	Remove the given address range from the target map.
8993  *	This is the exported form of vm_map_delete.
8994  */
8995 kmem_return_t
vm_map_remove_guard(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8996 vm_map_remove_guard(
8997 	vm_map_t        map,
8998 	vm_map_offset_t start,
8999 	vm_map_offset_t end,
9000 	vmr_flags_t     flags,
9001 	kmem_guard_t    guard)
9002 {
9003 	vm_map_lock(map);
9004 	return vm_map_remove_and_unlock(map, start, end, flags, guard);
9005 }
9006 
9007 
9008 /*
9009  *  vm_map_setup:
9010  *
9011  *  Perform any required setup on a new task's map. Must be called before the task
9012  *  is enabled for IPC access, since after this point other threads may be able
9013  *  to look up the task port and make VM API calls.
9014  */
9015 void
vm_map_setup(vm_map_t map,task_t task)9016 vm_map_setup(vm_map_t map, task_t task)
9017 {
9018 	/*
9019 	 * map does NOT take a reference on owning_task. If the map has terminated,
9020 	 * it is possible that the pointer is NULL, so reads of owning_task must
9021 	 * happen under the map lock and explicitly check for NULL.
9022 	 */
9023 	vm_map_lock(map);
9024 	assert(!map->owning_task);
9025 	map->owning_task = task;
9026 	vm_map_unlock(map);
9027 #if CONFIG_DEFERRED_RECLAIM
9028 	vm_deferred_reclamation_metadata_t vdrm = task->deferred_reclamation_metadata;
9029 	if (vdrm) {
9030 		vm_deferred_reclamation_task_fork_register(vdrm);
9031 	}
9032 #endif /* CONFIG_DEFERRED_RECLAIM */
9033 }
9034 
9035 /*
9036  *	vm_map_terminate:
9037  *
9038  *	Clean out a task's map.
9039  */
9040 kern_return_t
vm_map_terminate(vm_map_t map)9041 vm_map_terminate(
9042 	vm_map_t        map)
9043 {
9044 	vm_map_lock(map);
9045 	map->terminated = TRUE;
9046 	map->owning_task = NULL;
9047 	vm_map_disable_hole_optimization(map);
9048 	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
9049 	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
9050 	return KERN_SUCCESS;
9051 }
9052 
9053 /*
9054  *	Routine:	vm_map_copy_allocate
9055  *
9056  *	Description:
9057  *		Allocates and initializes a map copy object.
9058  */
9059 static vm_map_copy_t
vm_map_copy_allocate(uint16_t type)9060 vm_map_copy_allocate(uint16_t type)
9061 {
9062 	vm_map_copy_t new_copy;
9063 
9064 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
9065 	new_copy->type = type;
9066 	if (type == VM_MAP_COPY_ENTRY_LIST) {
9067 		new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
9068 		vm_map_store_init(&new_copy->cpy_hdr);
9069 	}
9070 	return new_copy;
9071 }
9072 
9073 /*
9074  *	Routine:	vm_map_copy_discard
9075  *
9076  *	Description:
9077  *		Dispose of a map copy object (returned by
9078  *		vm_map_copyin).
9079  */
9080 void
vm_map_copy_discard(vm_map_copy_t copy)9081 vm_map_copy_discard(
9082 	vm_map_copy_t   copy)
9083 {
9084 	if (copy == VM_MAP_COPY_NULL) {
9085 		return;
9086 	}
9087 
9088 	/*
9089 	 * Assert that the vm_map_copy is coming from the right
9090 	 * zone and hasn't been forged
9091 	 */
9092 	vm_map_copy_require(copy);
9093 
9094 	switch (copy->type) {
9095 	case VM_MAP_COPY_ENTRY_LIST:
9096 		while (vm_map_copy_first_entry(copy) !=
9097 		    vm_map_copy_to_entry(copy)) {
9098 			vm_map_entry_t  entry = vm_map_copy_first_entry(copy);
9099 
9100 			vm_map_copy_entry_unlink(copy, entry);
9101 			if (entry->is_sub_map) {
9102 				vm_map_deallocate(VME_SUBMAP(entry));
9103 			} else {
9104 				vm_object_deallocate(VME_OBJECT(entry));
9105 			}
9106 			vm_map_copy_entry_dispose(entry);
9107 		}
9108 		break;
9109 	case VM_MAP_COPY_KERNEL_BUFFER:
9110 
9111 		/*
9112 		 * The vm_map_copy_t and possibly the data buffer were
9113 		 * allocated by a single call to kalloc_data(), i.e. the
9114 		 * vm_map_copy_t was not allocated out of the zone.
9115 		 */
9116 		if (copy->size > msg_ool_size_small || copy->offset) {
9117 			panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9118 			    (long long)copy->size, (long long)copy->offset);
9119 		}
9120 		kfree_data(copy->cpy_kdata, copy->size);
9121 	}
9122 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9123 }
9124 
9125 #if XNU_PLATFORM_MacOSX
9126 
9127 __exported
9128 extern vm_map_copy_t vm_map_copy_copy(vm_map_copy_t copy);
9129 
9130 /*
9131  *	Routine:	vm_map_copy_copy
9132  *
9133  *	Description:
9134  *			Move the information in a map copy object to
9135  *			a new map copy object, leaving the old one
9136  *			empty.
9137  *
9138  *			This is used by kernel routines that need
9139  *			to look at out-of-line data (in copyin form)
9140  *			before deciding whether to return SUCCESS.
9141  *			If the routine returns FAILURE, the original
9142  *			copy object will be deallocated; therefore,
9143  *			these routines must make a copy of the copy
9144  *			object and leave the original empty so that
9145  *			deallocation will not fail.
9146  */
9147 vm_map_copy_t
vm_map_copy_copy(vm_map_copy_t copy)9148 vm_map_copy_copy(
9149 	vm_map_copy_t   copy)
9150 {
9151 	vm_map_copy_t   new_copy;
9152 
9153 	if (copy == VM_MAP_COPY_NULL) {
9154 		return VM_MAP_COPY_NULL;
9155 	}
9156 
9157 	/*
9158 	 * Assert that the vm_map_copy is coming from the right
9159 	 * zone and hasn't been forged
9160 	 */
9161 	vm_map_copy_require(copy);
9162 
9163 	/*
9164 	 * Allocate a new copy object, and copy the information
9165 	 * from the old one into it.
9166 	 */
9167 
9168 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9169 	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
9170 #if __has_feature(ptrauth_calls)
9171 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9172 		new_copy->cpy_kdata = copy->cpy_kdata;
9173 	}
9174 #endif
9175 
9176 	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
9177 		/*
9178 		 * The links in the entry chain must be
9179 		 * changed to point to the new copy object.
9180 		 */
9181 		vm_map_copy_first_entry(copy)->vme_prev
9182 		        = vm_map_copy_to_entry(new_copy);
9183 		vm_map_copy_last_entry(copy)->vme_next
9184 		        = vm_map_copy_to_entry(new_copy);
9185 	}
9186 
9187 	/*
9188 	 * Change the old copy object into one that contains
9189 	 * nothing to be deallocated.
9190 	 */
9191 	bzero(copy, sizeof(struct vm_map_copy));
9192 	copy->type = VM_MAP_COPY_KERNEL_BUFFER;
9193 
9194 	/*
9195 	 * Return the new object.
9196 	 */
9197 	return new_copy;
9198 }
9199 
9200 #endif /* XNU_PLATFORM_MacOSX */
9201 
9202 static boolean_t
vm_map_entry_is_overwritable(vm_map_t dst_map __unused,vm_map_entry_t entry)9203 vm_map_entry_is_overwritable(
9204 	vm_map_t        dst_map __unused,
9205 	vm_map_entry_t  entry)
9206 {
9207 	if (!(entry->protection & VM_PROT_WRITE)) {
9208 		/* can't overwrite if not writable */
9209 		return FALSE;
9210 	}
9211 #if !__x86_64__
9212 	if (entry->used_for_jit &&
9213 	    vm_map_cs_enforcement(dst_map) &&
9214 	    !dst_map->cs_debugged) {
9215 		/*
9216 		 * Can't overwrite a JIT region while cs_enforced
9217 		 * and not cs_debugged.
9218 		 */
9219 		return FALSE;
9220 	}
9221 
9222 #if __arm64e__
9223 	/* Do not allow overwrite HW assisted TPRO entries */
9224 	if (entry->used_for_tpro) {
9225 		return FALSE;
9226 	}
9227 #endif /* __arm64e__ */
9228 
9229 	if (entry->vme_permanent) {
9230 		if (entry->is_sub_map) {
9231 			/*
9232 			 * We can't tell if the submap contains "permanent"
9233 			 * entries within the range targeted by the caller.
9234 			 * The caller will have to check for that with
9235 			 * vm_map_overwrite_submap_recurse() for example.
9236 			 */
9237 		} else {
9238 			/*
9239 			 * Do not allow overwriting of a "permanent"
9240 			 * entry.
9241 			 */
9242 			DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9243 			    vm_map_entry_t, entry,
9244 			    vm_map_offset_t, entry->vme_start,
9245 			    vm_map_offset_t, entry->vme_end,
9246 			    vm_prot_t, entry->protection,
9247 			    vm_prot_t, entry->max_protection,
9248 			    int, VME_ALIAS(entry));
9249 			return FALSE;
9250 		}
9251 	}
9252 #endif /* !__x86_64__ */
9253 
9254 	if (entry->is_sub_map) {
9255 		/* remember not to assume every entry has a VM object... */
9256 	}
9257 
9258 
9259 	return TRUE;
9260 }
9261 
9262 static kern_return_t
vm_map_overwrite_submap_recurse(vm_map_t dst_map,vm_map_offset_t dst_addr,vm_map_size_t dst_size)9263 vm_map_overwrite_submap_recurse(
9264 	vm_map_t        dst_map,
9265 	vm_map_offset_t dst_addr,
9266 	vm_map_size_t   dst_size)
9267 {
9268 	vm_map_offset_t dst_end;
9269 	vm_map_entry_t  tmp_entry;
9270 	vm_map_entry_t  entry;
9271 	kern_return_t   result;
9272 	boolean_t       encountered_sub_map = FALSE;
9273 
9274 
9275 
9276 	/*
9277 	 *	Verify that the destination is all writeable
9278 	 *	initially.  We have to trunc the destination
9279 	 *	address and round the copy size or we'll end up
9280 	 *	splitting entries in strange ways.
9281 	 */
9282 
9283 	dst_end = vm_map_round_page(dst_addr + dst_size,
9284 	    VM_MAP_PAGE_MASK(dst_map));
9285 	vm_map_lock(dst_map);
9286 
9287 start_pass_1:
9288 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9289 		vm_map_unlock(dst_map);
9290 		return KERN_INVALID_ADDRESS;
9291 	}
9292 
9293 	vm_map_clip_start(dst_map,
9294 	    tmp_entry,
9295 	    vm_map_trunc_page(dst_addr,
9296 	    VM_MAP_PAGE_MASK(dst_map)));
9297 	if (tmp_entry->is_sub_map) {
9298 		/* clipping did unnest if needed */
9299 		assert(!tmp_entry->use_pmap);
9300 	}
9301 
9302 	for (entry = tmp_entry;;) {
9303 		vm_map_entry_t  next;
9304 
9305 		next = entry->vme_next;
9306 		while (entry->is_sub_map) {
9307 			vm_map_offset_t sub_start;
9308 			vm_map_offset_t sub_end;
9309 			vm_map_offset_t local_end;
9310 			vm_map_t        sub_map;
9311 
9312 			if (entry->in_transition) {
9313 				/*
9314 				 * Say that we are waiting, and wait for entry.
9315 				 */
9316 				entry->needs_wakeup = TRUE;
9317 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9318 
9319 				goto start_pass_1;
9320 			}
9321 
9322 			encountered_sub_map = TRUE;
9323 			sub_start = VME_OFFSET(entry);
9324 
9325 			if (entry->vme_end < dst_end) {
9326 				sub_end = entry->vme_end;
9327 			} else {
9328 				sub_end = dst_end;
9329 			}
9330 			sub_end -= entry->vme_start;
9331 			sub_end += VME_OFFSET(entry);
9332 			local_end = entry->vme_end;
9333 			sub_map = VME_SUBMAP(entry);
9334 			vm_map_reference(sub_map);
9335 			vm_map_unlock(dst_map);
9336 
9337 			result = vm_map_overwrite_submap_recurse(
9338 				sub_map,
9339 				sub_start,
9340 				sub_end - sub_start);
9341 
9342 			vm_map_deallocate(sub_map);
9343 			sub_map = VM_MAP_NULL;
9344 
9345 			if (result != KERN_SUCCESS) {
9346 				return result;
9347 			}
9348 			if (dst_end <= entry->vme_end) {
9349 				return KERN_SUCCESS;
9350 			}
9351 			vm_map_lock(dst_map);
9352 			if (!vm_map_lookup_entry(dst_map, local_end,
9353 			    &tmp_entry)) {
9354 				vm_map_unlock(dst_map);
9355 				return KERN_INVALID_ADDRESS;
9356 			}
9357 			entry = tmp_entry;
9358 			next = entry->vme_next;
9359 		}
9360 		assert(!entry->is_sub_map);
9361 
9362 		if (!(entry->protection & VM_PROT_WRITE)) {
9363 			vm_map_unlock(dst_map);
9364 			return KERN_PROTECTION_FAILURE;
9365 		}
9366 
9367 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9368 			vm_map_unlock(dst_map);
9369 			return KERN_PROTECTION_FAILURE;
9370 		}
9371 
9372 		/*
9373 		 *	If the entry is in transition, we must wait
9374 		 *	for it to exit that state.  Anything could happen
9375 		 *	when we unlock the map, so start over.
9376 		 */
9377 		if (entry->in_transition) {
9378 			/*
9379 			 * Say that we are waiting, and wait for entry.
9380 			 */
9381 			entry->needs_wakeup = TRUE;
9382 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9383 
9384 			goto start_pass_1;
9385 		}
9386 
9387 /*
9388  *		our range is contained completely within this map entry
9389  */
9390 		if (dst_end <= entry->vme_end) {
9391 			vm_map_unlock(dst_map);
9392 			return KERN_SUCCESS;
9393 		}
9394 /*
9395  *		check that range specified is contiguous region
9396  */
9397 		if ((next == vm_map_to_entry(dst_map)) ||
9398 		    (next->vme_start != entry->vme_end)) {
9399 			vm_map_unlock(dst_map);
9400 			return KERN_INVALID_ADDRESS;
9401 		}
9402 
9403 		/*
9404 		 *	Check for permanent objects in the destination.
9405 		 */
9406 		assert(!entry->is_sub_map);
9407 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9408 		    ((!VME_OBJECT(entry)->internal) ||
9409 		    (VME_OBJECT(entry)->true_share))) {
9410 			if (encountered_sub_map) {
9411 				vm_map_unlock(dst_map);
9412 				return KERN_FAILURE;
9413 			}
9414 		}
9415 
9416 
9417 		entry = next;
9418 	}/* for */
9419 	vm_map_unlock(dst_map);
9420 	return KERN_SUCCESS;
9421 }
9422 
9423 /*
9424  *	Routine:	vm_map_copy_overwrite
9425  *
9426  *	Description:
9427  *		Copy the memory described by the map copy
9428  *		object (copy; returned by vm_map_copyin) onto
9429  *		the specified destination region (dst_map, dst_addr).
9430  *		The destination must be writeable.
9431  *
9432  *		Unlike vm_map_copyout, this routine actually
9433  *		writes over previously-mapped memory.  If the
9434  *		previous mapping was to a permanent (user-supplied)
9435  *		memory object, it is preserved.
9436  *
9437  *		The attributes (protection and inheritance) of the
9438  *		destination region are preserved.
9439  *
9440  *		If successful, consumes the copy object.
9441  *		Otherwise, the caller is responsible for it.
9442  *
9443  *	Implementation notes:
9444  *		To overwrite aligned temporary virtual memory, it is
9445  *		sufficient to remove the previous mapping and insert
9446  *		the new copy.  This replacement is done either on
9447  *		the whole region (if no permanent virtual memory
9448  *		objects are embedded in the destination region) or
9449  *		in individual map entries.
9450  *
9451  *		To overwrite permanent virtual memory , it is necessary
9452  *		to copy each page, as the external memory management
9453  *		interface currently does not provide any optimizations.
9454  *
9455  *		Unaligned memory also has to be copied.  It is possible
9456  *		to use 'vm_trickery' to copy the aligned data.  This is
9457  *		not done but not hard to implement.
9458  *
9459  *		Once a page of permanent memory has been overwritten,
9460  *		it is impossible to interrupt this function; otherwise,
9461  *		the call would be neither atomic nor location-independent.
9462  *		The kernel-state portion of a user thread must be
9463  *		interruptible.
9464  *
9465  *		It may be expensive to forward all requests that might
9466  *		overwrite permanent memory (vm_write, vm_copy) to
9467  *		uninterruptible kernel threads.  This routine may be
9468  *		called by interruptible threads; however, success is
9469  *		not guaranteed -- if the request cannot be performed
9470  *		atomically and interruptibly, an error indication is
9471  *		returned.
9472  *
9473  *		Callers of this function must call vm_map_copy_require on
9474  *		previously created vm_map_copy_t or pass a newly created
9475  *		one to ensure that it hasn't been forged.
9476  */
9477 static kern_return_t
vm_map_copy_overwrite_nested(vm_map_t dst_map,vm_map_address_t dst_addr,vm_map_copy_t copy,boolean_t interruptible,pmap_t pmap,boolean_t discard_on_success)9478 vm_map_copy_overwrite_nested(
9479 	vm_map_t                dst_map,
9480 	vm_map_address_t        dst_addr,
9481 	vm_map_copy_t           copy,
9482 	boolean_t               interruptible,
9483 	pmap_t                  pmap,
9484 	boolean_t               discard_on_success)
9485 {
9486 	vm_map_offset_t         dst_end;
9487 	vm_map_entry_t          tmp_entry;
9488 	vm_map_entry_t          entry;
9489 	kern_return_t           kr;
9490 	boolean_t               aligned = TRUE;
9491 	boolean_t               contains_permanent_objects = FALSE;
9492 	boolean_t               encountered_sub_map = FALSE;
9493 	vm_map_offset_t         base_addr;
9494 	vm_map_size_t           copy_size;
9495 	vm_map_size_t           total_size;
9496 	uint16_t                copy_page_shift;
9497 
9498 	/*
9499 	 *	Check for special kernel buffer allocated
9500 	 *	by new_ipc_kmsg_copyin.
9501 	 */
9502 
9503 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9504 		kr = vm_map_copyout_kernel_buffer(
9505 			dst_map, &dst_addr,
9506 			copy, copy->size, TRUE, discard_on_success);
9507 		return kr;
9508 	}
9509 
9510 	/*
9511 	 *      Only works for entry lists at the moment.  Will
9512 	 *	support page lists later.
9513 	 */
9514 
9515 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9516 
9517 	if (copy->size == 0) {
9518 		if (discard_on_success) {
9519 			vm_map_copy_discard(copy);
9520 		}
9521 		return KERN_SUCCESS;
9522 	}
9523 
9524 	copy_page_shift = copy->cpy_hdr.page_shift;
9525 
9526 	/*
9527 	 *	Verify that the destination is all writeable
9528 	 *	initially.  We have to trunc the destination
9529 	 *	address and round the copy size or we'll end up
9530 	 *	splitting entries in strange ways.
9531 	 */
9532 
9533 	if (!VM_MAP_PAGE_ALIGNED(copy->size,
9534 	    VM_MAP_PAGE_MASK(dst_map)) ||
9535 	    !VM_MAP_PAGE_ALIGNED(copy->offset,
9536 	    VM_MAP_PAGE_MASK(dst_map)) ||
9537 	    !VM_MAP_PAGE_ALIGNED(dst_addr,
9538 	    VM_MAP_PAGE_MASK(dst_map)) ||
9539 	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9540 		aligned = FALSE;
9541 		dst_end = vm_map_round_page(dst_addr + copy->size,
9542 		    VM_MAP_PAGE_MASK(dst_map));
9543 	} else {
9544 		dst_end = dst_addr + copy->size;
9545 	}
9546 
9547 	vm_map_lock(dst_map);
9548 
9549 	/* LP64todo - remove this check when vm_map_commpage64()
9550 	 * no longer has to stuff in a map_entry for the commpage
9551 	 * above the map's max_offset.
9552 	 */
9553 	if (dst_addr >= dst_map->max_offset) {
9554 		vm_map_unlock(dst_map);
9555 		return KERN_INVALID_ADDRESS;
9556 	}
9557 
9558 start_pass_1:
9559 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9560 		vm_map_unlock(dst_map);
9561 		return KERN_INVALID_ADDRESS;
9562 	}
9563 	vm_map_clip_start(dst_map,
9564 	    tmp_entry,
9565 	    vm_map_trunc_page(dst_addr,
9566 	    VM_MAP_PAGE_MASK(dst_map)));
9567 	for (entry = tmp_entry;;) {
9568 		vm_map_entry_t  next = entry->vme_next;
9569 
9570 		while (entry->is_sub_map) {
9571 			vm_map_offset_t sub_start;
9572 			vm_map_offset_t sub_end;
9573 			vm_map_offset_t local_end;
9574 
9575 			if (entry->in_transition) {
9576 				/*
9577 				 * Say that we are waiting, and wait for entry.
9578 				 */
9579 				entry->needs_wakeup = TRUE;
9580 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9581 
9582 				goto start_pass_1;
9583 			}
9584 
9585 			local_end = entry->vme_end;
9586 			if (!(entry->needs_copy)) {
9587 				vm_map_t sub_map = VM_MAP_NULL;
9588 
9589 				/* if needs_copy we are a COW submap */
9590 				/* in such a case we just replace so */
9591 				/* there is no need for the follow-  */
9592 				/* ing check.                        */
9593 				encountered_sub_map = TRUE;
9594 				sub_start = VME_OFFSET(entry);
9595 
9596 				if (entry->vme_end < dst_end) {
9597 					sub_end = entry->vme_end;
9598 				} else {
9599 					sub_end = dst_end;
9600 				}
9601 				sub_end -= entry->vme_start;
9602 				sub_end += VME_OFFSET(entry);
9603 				sub_map = VME_SUBMAP(entry);
9604 				vm_map_reference(sub_map);
9605 				vm_map_unlock(dst_map);
9606 
9607 				kr = vm_map_overwrite_submap_recurse(
9608 					sub_map,
9609 					sub_start,
9610 					sub_end - sub_start);
9611 
9612 				vm_map_deallocate(sub_map);
9613 				sub_map = VM_MAP_NULL;
9614 				if (kr != KERN_SUCCESS) {
9615 					return kr;
9616 				}
9617 				vm_map_lock(dst_map);
9618 			}
9619 
9620 			if (dst_end <= entry->vme_end) {
9621 				goto start_overwrite;
9622 			}
9623 			if (!vm_map_lookup_entry(dst_map, local_end,
9624 			    &entry)) {
9625 				vm_map_unlock(dst_map);
9626 				return KERN_INVALID_ADDRESS;
9627 			}
9628 			next = entry->vme_next;
9629 		}
9630 		assert(!entry->is_sub_map);
9631 
9632 		if (!(entry->protection & VM_PROT_WRITE)) {
9633 			vm_map_unlock(dst_map);
9634 			return KERN_PROTECTION_FAILURE;
9635 		}
9636 
9637 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9638 			vm_map_unlock(dst_map);
9639 			return KERN_PROTECTION_FAILURE;
9640 		}
9641 
9642 		/*
9643 		 *	If the entry is in transition, we must wait
9644 		 *	for it to exit that state.  Anything could happen
9645 		 *	when we unlock the map, so start over.
9646 		 */
9647 		if (entry->in_transition) {
9648 			/*
9649 			 * Say that we are waiting, and wait for entry.
9650 			 */
9651 			entry->needs_wakeup = TRUE;
9652 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9653 
9654 			goto start_pass_1;
9655 		}
9656 
9657 /*
9658  *		our range is contained completely within this map entry
9659  */
9660 		if (dst_end <= entry->vme_end) {
9661 			break;
9662 		}
9663 /*
9664  *		check that range specified is contiguous region
9665  */
9666 		if ((next == vm_map_to_entry(dst_map)) ||
9667 		    (next->vme_start != entry->vme_end)) {
9668 			vm_map_unlock(dst_map);
9669 			return KERN_INVALID_ADDRESS;
9670 		}
9671 
9672 
9673 		/*
9674 		 *	Check for permanent objects in the destination.
9675 		 */
9676 		assert(!entry->is_sub_map);
9677 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9678 		    ((!VME_OBJECT(entry)->internal) ||
9679 		    (VME_OBJECT(entry)->true_share))) {
9680 			contains_permanent_objects = TRUE;
9681 		}
9682 
9683 		entry = next;
9684 	}/* for */
9685 
9686 start_overwrite:
9687 	/*
9688 	 *	If there are permanent objects in the destination, then
9689 	 *	the copy cannot be interrupted.
9690 	 */
9691 
9692 	if (interruptible && contains_permanent_objects) {
9693 		vm_map_unlock(dst_map);
9694 		return KERN_FAILURE;   /* XXX */
9695 	}
9696 
9697 	/*
9698 	 *
9699 	 *	Make a second pass, overwriting the data
9700 	 *	At the beginning of each loop iteration,
9701 	 *	the next entry to be overwritten is "tmp_entry"
9702 	 *	(initially, the value returned from the lookup above),
9703 	 *	and the starting address expected in that entry
9704 	 *	is "start".
9705 	 */
9706 
9707 	total_size = copy->size;
9708 	if (encountered_sub_map) {
9709 		copy_size = 0;
9710 		/* re-calculate tmp_entry since we've had the map */
9711 		/* unlocked */
9712 		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9713 			vm_map_unlock(dst_map);
9714 			return KERN_INVALID_ADDRESS;
9715 		}
9716 	} else {
9717 		copy_size = copy->size;
9718 	}
9719 
9720 	base_addr = dst_addr;
9721 	while (TRUE) {
9722 		/* deconstruct the copy object and do in parts */
9723 		/* only in sub_map, interruptable case */
9724 		vm_map_entry_t  copy_entry;
9725 		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
9726 		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
9727 		int             nentries;
9728 		int             remaining_entries = 0;
9729 		vm_map_offset_t new_offset = 0;
9730 
9731 		for (entry = tmp_entry; copy_size == 0;) {
9732 			vm_map_entry_t  next;
9733 
9734 			next = entry->vme_next;
9735 
9736 			/* tmp_entry and base address are moved along */
9737 			/* each time we encounter a sub-map.  Otherwise */
9738 			/* entry can outpase tmp_entry, and the copy_size */
9739 			/* may reflect the distance between them */
9740 			/* if the current entry is found to be in transition */
9741 			/* we will start over at the beginning or the last */
9742 			/* encounter of a submap as dictated by base_addr */
9743 			/* we will zero copy_size accordingly. */
9744 			if (entry->in_transition) {
9745 				/*
9746 				 * Say that we are waiting, and wait for entry.
9747 				 */
9748 				entry->needs_wakeup = TRUE;
9749 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9750 
9751 				if (!vm_map_lookup_entry(dst_map, base_addr,
9752 				    &tmp_entry)) {
9753 					vm_map_unlock(dst_map);
9754 					return KERN_INVALID_ADDRESS;
9755 				}
9756 				copy_size = 0;
9757 				entry = tmp_entry;
9758 				continue;
9759 			}
9760 			if (entry->is_sub_map) {
9761 				vm_map_offset_t sub_start;
9762 				vm_map_offset_t sub_end;
9763 				vm_map_offset_t local_end;
9764 				vm_map_t        sub_map = VM_MAP_NULL;
9765 				bool            use_pmap;
9766 
9767 				if (entry->needs_copy) {
9768 					/* if this is a COW submap */
9769 					/* just back the range with a */
9770 					/* anonymous entry */
9771 					assert(!entry->vme_permanent);
9772 					if (entry->vme_end < dst_end) {
9773 						sub_end = entry->vme_end;
9774 					} else {
9775 						sub_end = dst_end;
9776 					}
9777 					if (entry->vme_start < base_addr) {
9778 						sub_start = base_addr;
9779 					} else {
9780 						sub_start = entry->vme_start;
9781 					}
9782 					vm_map_clip_end(
9783 						dst_map, entry, sub_end);
9784 					vm_map_clip_start(
9785 						dst_map, entry, sub_start);
9786 					assert(!entry->use_pmap);
9787 					assert(!entry->iokit_acct);
9788 					entry->use_pmap = TRUE;
9789 					vm_map_deallocate(VME_SUBMAP(entry));
9790 					assert(!entry->vme_permanent);
9791 					VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9792 					VME_OFFSET_SET(entry, 0);
9793 					entry->is_shared = FALSE;
9794 					entry->needs_copy = FALSE;
9795 					entry->protection = VM_PROT_DEFAULT;
9796 					entry->max_protection = VM_PROT_ALL;
9797 					entry->wired_count = 0;
9798 					entry->user_wired_count = 0;
9799 					if (entry->inheritance
9800 					    == VM_INHERIT_SHARE) {
9801 						entry->inheritance = VM_INHERIT_COPY;
9802 					}
9803 					continue;
9804 				}
9805 				/* first take care of any non-sub_map */
9806 				/* entries to send */
9807 				if (base_addr < entry->vme_start) {
9808 					/* stuff to send */
9809 					copy_size =
9810 					    entry->vme_start - base_addr;
9811 					break;
9812 				}
9813 				sub_start = VME_OFFSET(entry);
9814 
9815 				if (entry->vme_end < dst_end) {
9816 					sub_end = entry->vme_end;
9817 				} else {
9818 					sub_end = dst_end;
9819 				}
9820 				sub_end -= entry->vme_start;
9821 				sub_end += VME_OFFSET(entry);
9822 				local_end = entry->vme_end;
9823 				use_pmap = entry->use_pmap;
9824 				sub_map = VME_SUBMAP(entry);
9825 				vm_map_reference(sub_map);
9826 				vm_map_unlock(dst_map);
9827 				copy_size = sub_end - sub_start;
9828 
9829 				/* adjust the copy object */
9830 				if (total_size > copy_size) {
9831 					vm_map_size_t   local_size = 0;
9832 					vm_map_size_t   entry_size;
9833 
9834 					nentries = 1;
9835 					new_offset = copy->offset;
9836 					copy_entry = vm_map_copy_first_entry(copy);
9837 					while (copy_entry !=
9838 					    vm_map_copy_to_entry(copy)) {
9839 						entry_size = copy_entry->vme_end -
9840 						    copy_entry->vme_start;
9841 						if ((local_size < copy_size) &&
9842 						    ((local_size + entry_size)
9843 						    >= copy_size)) {
9844 							vm_map_copy_clip_end(copy,
9845 							    copy_entry,
9846 							    copy_entry->vme_start +
9847 							    (copy_size - local_size));
9848 							entry_size = copy_entry->vme_end -
9849 							    copy_entry->vme_start;
9850 							local_size += entry_size;
9851 							new_offset += entry_size;
9852 						}
9853 						if (local_size >= copy_size) {
9854 							next_copy = copy_entry->vme_next;
9855 							copy_entry->vme_next =
9856 							    vm_map_copy_to_entry(copy);
9857 							previous_prev =
9858 							    copy->cpy_hdr.links.prev;
9859 							copy->cpy_hdr.links.prev = copy_entry;
9860 							copy->size = copy_size;
9861 							remaining_entries =
9862 							    copy->cpy_hdr.nentries;
9863 							remaining_entries -= nentries;
9864 							copy->cpy_hdr.nentries = nentries;
9865 							break;
9866 						} else {
9867 							local_size += entry_size;
9868 							new_offset += entry_size;
9869 							nentries++;
9870 						}
9871 						copy_entry = copy_entry->vme_next;
9872 					}
9873 				}
9874 
9875 				if ((use_pmap) && (pmap == NULL)) {
9876 					kr = vm_map_copy_overwrite_nested(
9877 						sub_map,
9878 						sub_start,
9879 						copy,
9880 						interruptible,
9881 						sub_map->pmap,
9882 						TRUE);
9883 				} else if (pmap != NULL) {
9884 					kr = vm_map_copy_overwrite_nested(
9885 						sub_map,
9886 						sub_start,
9887 						copy,
9888 						interruptible, pmap,
9889 						TRUE);
9890 				} else {
9891 					kr = vm_map_copy_overwrite_nested(
9892 						sub_map,
9893 						sub_start,
9894 						copy,
9895 						interruptible,
9896 						dst_map->pmap,
9897 						TRUE);
9898 				}
9899 
9900 				vm_map_deallocate(sub_map);
9901 				sub_map = VM_MAP_NULL;
9902 
9903 				if (kr != KERN_SUCCESS) {
9904 					if (next_copy != NULL) {
9905 						copy->cpy_hdr.nentries +=
9906 						    remaining_entries;
9907 						copy->cpy_hdr.links.prev->vme_next =
9908 						    next_copy;
9909 						copy->cpy_hdr.links.prev
9910 						        = previous_prev;
9911 						copy->size = total_size;
9912 					}
9913 					return kr;
9914 				}
9915 				if (dst_end <= local_end) {
9916 					return KERN_SUCCESS;
9917 				}
9918 				/* otherwise copy no longer exists, it was */
9919 				/* destroyed after successful copy_overwrite */
9920 				copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9921 				copy->offset = new_offset;
9922 				copy->cpy_hdr.page_shift = copy_page_shift;
9923 
9924 				total_size -= copy_size;
9925 				copy_size = 0;
9926 				/* put back remainder of copy in container */
9927 				if (next_copy != NULL) {
9928 					copy->cpy_hdr.nentries = remaining_entries;
9929 					copy->cpy_hdr.links.next = next_copy;
9930 					copy->cpy_hdr.links.prev = previous_prev;
9931 					copy->size = total_size;
9932 					next_copy->vme_prev =
9933 					    vm_map_copy_to_entry(copy);
9934 					next_copy = NULL;
9935 				}
9936 				base_addr = local_end;
9937 				vm_map_lock(dst_map);
9938 				if (!vm_map_lookup_entry(dst_map,
9939 				    local_end, &tmp_entry)) {
9940 					vm_map_unlock(dst_map);
9941 					return KERN_INVALID_ADDRESS;
9942 				}
9943 				entry = tmp_entry;
9944 				continue;
9945 			}
9946 			assert(!entry->is_sub_map);
9947 
9948 			if (dst_end <= entry->vme_end) {
9949 				copy_size = dst_end - base_addr;
9950 				break;
9951 			}
9952 
9953 			if ((next == vm_map_to_entry(dst_map)) ||
9954 			    (next->vme_start != entry->vme_end)) {
9955 				vm_map_unlock(dst_map);
9956 				return KERN_INVALID_ADDRESS;
9957 			}
9958 
9959 			entry = next;
9960 		}/* for */
9961 
9962 		next_copy = NULL;
9963 		nentries = 1;
9964 
9965 		/* adjust the copy object */
9966 		if (total_size > copy_size) {
9967 			vm_map_size_t   local_size = 0;
9968 			vm_map_size_t   entry_size;
9969 
9970 			new_offset = copy->offset;
9971 			copy_entry = vm_map_copy_first_entry(copy);
9972 			while (copy_entry != vm_map_copy_to_entry(copy)) {
9973 				entry_size = copy_entry->vme_end -
9974 				    copy_entry->vme_start;
9975 				if ((local_size < copy_size) &&
9976 				    ((local_size + entry_size)
9977 				    >= copy_size)) {
9978 					vm_map_copy_clip_end(copy, copy_entry,
9979 					    copy_entry->vme_start +
9980 					    (copy_size - local_size));
9981 					entry_size = copy_entry->vme_end -
9982 					    copy_entry->vme_start;
9983 					local_size += entry_size;
9984 					new_offset += entry_size;
9985 				}
9986 				if (local_size >= copy_size) {
9987 					next_copy = copy_entry->vme_next;
9988 					copy_entry->vme_next =
9989 					    vm_map_copy_to_entry(copy);
9990 					previous_prev =
9991 					    copy->cpy_hdr.links.prev;
9992 					copy->cpy_hdr.links.prev = copy_entry;
9993 					copy->size = copy_size;
9994 					remaining_entries =
9995 					    copy->cpy_hdr.nentries;
9996 					remaining_entries -= nentries;
9997 					copy->cpy_hdr.nentries = nentries;
9998 					break;
9999 				} else {
10000 					local_size += entry_size;
10001 					new_offset += entry_size;
10002 					nentries++;
10003 				}
10004 				copy_entry = copy_entry->vme_next;
10005 			}
10006 		}
10007 
10008 		if (aligned) {
10009 			pmap_t  local_pmap;
10010 
10011 			if (pmap) {
10012 				local_pmap = pmap;
10013 			} else {
10014 				local_pmap = dst_map->pmap;
10015 			}
10016 
10017 			if ((kr =  vm_map_copy_overwrite_aligned(
10018 				    dst_map, tmp_entry, copy,
10019 				    base_addr, local_pmap)) != KERN_SUCCESS) {
10020 				if (next_copy != NULL) {
10021 					copy->cpy_hdr.nentries +=
10022 					    remaining_entries;
10023 					copy->cpy_hdr.links.prev->vme_next =
10024 					    next_copy;
10025 					copy->cpy_hdr.links.prev =
10026 					    previous_prev;
10027 					copy->size += copy_size;
10028 				}
10029 				return kr;
10030 			}
10031 			vm_map_unlock(dst_map);
10032 		} else {
10033 			/*
10034 			 * Performance gain:
10035 			 *
10036 			 * if the copy and dst address are misaligned but the same
10037 			 * offset within the page we can copy_not_aligned the
10038 			 * misaligned parts and copy aligned the rest.  If they are
10039 			 * aligned but len is unaligned we simply need to copy
10040 			 * the end bit unaligned.  We'll need to split the misaligned
10041 			 * bits of the region in this case !
10042 			 */
10043 			/* ALWAYS UNLOCKS THE dst_map MAP */
10044 			kr = vm_map_copy_overwrite_unaligned(
10045 				dst_map,
10046 				tmp_entry,
10047 				copy,
10048 				base_addr,
10049 				discard_on_success);
10050 			if (kr != KERN_SUCCESS) {
10051 				if (next_copy != NULL) {
10052 					copy->cpy_hdr.nentries +=
10053 					    remaining_entries;
10054 					copy->cpy_hdr.links.prev->vme_next =
10055 					    next_copy;
10056 					copy->cpy_hdr.links.prev =
10057 					    previous_prev;
10058 					copy->size += copy_size;
10059 				}
10060 				return kr;
10061 			}
10062 		}
10063 		total_size -= copy_size;
10064 		if (total_size == 0) {
10065 			break;
10066 		}
10067 		base_addr += copy_size;
10068 		copy_size = 0;
10069 		copy->offset = new_offset;
10070 		if (next_copy != NULL) {
10071 			copy->cpy_hdr.nentries = remaining_entries;
10072 			copy->cpy_hdr.links.next = next_copy;
10073 			copy->cpy_hdr.links.prev = previous_prev;
10074 			next_copy->vme_prev = vm_map_copy_to_entry(copy);
10075 			copy->size = total_size;
10076 		}
10077 		vm_map_lock(dst_map);
10078 		while (TRUE) {
10079 			if (!vm_map_lookup_entry(dst_map,
10080 			    base_addr, &tmp_entry)) {
10081 				vm_map_unlock(dst_map);
10082 				return KERN_INVALID_ADDRESS;
10083 			}
10084 			if (tmp_entry->in_transition) {
10085 				entry->needs_wakeup = TRUE;
10086 				vm_map_entry_wait(dst_map, THREAD_UNINT);
10087 			} else {
10088 				break;
10089 			}
10090 		}
10091 		vm_map_clip_start(dst_map,
10092 		    tmp_entry,
10093 		    vm_map_trunc_page(base_addr,
10094 		    VM_MAP_PAGE_MASK(dst_map)));
10095 
10096 		entry = tmp_entry;
10097 	} /* while */
10098 
10099 	/*
10100 	 *	Throw away the vm_map_copy object
10101 	 */
10102 	if (discard_on_success) {
10103 		vm_map_copy_discard(copy);
10104 	}
10105 
10106 	return KERN_SUCCESS;
10107 }/* vm_map_copy_overwrite */
10108 
10109 static __attribute__((always_inline, warn_unused_result))
10110 kern_return_t
vm_map_copy_addr_size_sanitize(vm_map_t map,vm_map_offset_ut addr_u,vm_map_size_ut size_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * addr,vm_map_offset_t * end,vm_map_size_t * size)10111 vm_map_copy_addr_size_sanitize(
10112 	vm_map_t                map,
10113 	vm_map_offset_ut        addr_u,
10114 	vm_map_size_ut          size_u,
10115 	vm_sanitize_caller_t    vm_sanitize_caller,
10116 	vm_map_offset_t        *addr,
10117 	vm_map_offset_t        *end,
10118 	vm_map_size_t          *size)
10119 {
10120 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
10121 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
10122 	    VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
10123 
10124 	return vm_sanitize_addr_size(addr_u, size_u,
10125 	           vm_sanitize_caller, map,
10126 	           flags,
10127 	           addr, end, size);
10128 }
10129 
10130 kern_return_t
vm_map_copy_overwrite(vm_map_t dst_map,vm_map_offset_ut dst_addr_u,vm_map_copy_t copy,vm_map_size_ut copy_size_u,boolean_t interruptible)10131 vm_map_copy_overwrite(
10132 	vm_map_t                dst_map,
10133 	vm_map_offset_ut        dst_addr_u,
10134 	vm_map_copy_t           copy,
10135 	vm_map_size_ut          copy_size_u,
10136 	boolean_t               interruptible)
10137 {
10138 	vm_map_offset_t dst_addr, dst_end;
10139 	vm_map_size_t   copy_size;
10140 	vm_map_size_t   head_size, tail_size;
10141 	vm_map_copy_t   head_copy, tail_copy;
10142 	vm_map_offset_t head_addr, tail_addr;
10143 	vm_map_entry_t  entry;
10144 	kern_return_t   kr;
10145 	vm_map_offset_t effective_page_mask, effective_page_size;
10146 	uint16_t        copy_page_shift;
10147 
10148 	head_size = 0;
10149 	tail_size = 0;
10150 	head_copy = NULL;
10151 	tail_copy = NULL;
10152 	head_addr = 0;
10153 	tail_addr = 0;
10154 
10155 	/*
10156 	 *	Check for null copy object.
10157 	 */
10158 	if (copy == VM_MAP_COPY_NULL) {
10159 		return KERN_SUCCESS;
10160 	}
10161 
10162 	/*
10163 	 * Sanitize any input parameters that are addr/size/prot/inherit
10164 	 */
10165 	kr = vm_map_copy_addr_size_sanitize(
10166 		dst_map,
10167 		dst_addr_u,
10168 		copy_size_u,
10169 		VM_SANITIZE_CALLER_VM_MAP_COPY_OVERWRITE,
10170 		&dst_addr,
10171 		&dst_end,
10172 		&copy_size);
10173 	if (__improbable(kr != KERN_SUCCESS)) {
10174 		return vm_sanitize_get_kr(kr);
10175 	}
10176 
10177 	/*
10178 	 * Assert that the vm_map_copy is coming from the right
10179 	 * zone and hasn't been forged
10180 	 */
10181 	vm_map_copy_require(copy);
10182 
10183 	if (interruptible ||
10184 	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
10185 		/*
10186 		 * We can't split the "copy" map if we're interruptible
10187 		 * or if we don't have a "copy" map...
10188 		 */
10189 blunt_copy:
10190 		kr = vm_map_copy_overwrite_nested(dst_map,
10191 		    dst_addr,
10192 		    copy,
10193 		    interruptible,
10194 		    (pmap_t) NULL,
10195 		    TRUE);
10196 		if (kr) {
10197 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
10198 		}
10199 		return kr;
10200 	}
10201 
10202 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
10203 	if (copy_page_shift < PAGE_SHIFT ||
10204 	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10205 		goto blunt_copy;
10206 	}
10207 
10208 	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10209 		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
10210 	} else {
10211 		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
10212 		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
10213 		    effective_page_mask);
10214 	}
10215 	effective_page_size = effective_page_mask + 1;
10216 
10217 	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
10218 		/*
10219 		 * Too small to bother with optimizing...
10220 		 */
10221 		goto blunt_copy;
10222 	}
10223 
10224 	if ((dst_addr & effective_page_mask) !=
10225 	    (copy->offset & effective_page_mask)) {
10226 		/*
10227 		 * Incompatible mis-alignment of source and destination...
10228 		 */
10229 		goto blunt_copy;
10230 	}
10231 
10232 	/*
10233 	 * Proper alignment or identical mis-alignment at the beginning.
10234 	 * Let's try and do a small unaligned copy first (if needed)
10235 	 * and then an aligned copy for the rest.
10236 	 */
10237 	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
10238 		head_addr = dst_addr;
10239 		head_size = (effective_page_size -
10240 		    (copy->offset & effective_page_mask));
10241 		head_size = MIN(head_size, copy_size);
10242 	}
10243 	if (!vm_map_page_aligned(copy->offset + copy_size,
10244 	    effective_page_mask)) {
10245 		/*
10246 		 * Mis-alignment at the end.
10247 		 * Do an aligned copy up to the last page and
10248 		 * then an unaligned copy for the remaining bytes.
10249 		 */
10250 		tail_size = ((copy->offset + copy_size) &
10251 		    effective_page_mask);
10252 		tail_size = MIN(tail_size, copy_size);
10253 		tail_addr = dst_addr + copy_size - tail_size;
10254 		assert(tail_addr >= head_addr + head_size);
10255 	}
10256 	assert(head_size + tail_size <= copy_size);
10257 
10258 	if (head_size + tail_size == copy_size) {
10259 		/*
10260 		 * It's all unaligned, no optimization possible...
10261 		 */
10262 		goto blunt_copy;
10263 	}
10264 
10265 	/*
10266 	 * Can't optimize if there are any submaps in the
10267 	 * destination due to the way we free the "copy" map
10268 	 * progressively in vm_map_copy_overwrite_nested()
10269 	 * in that case.
10270 	 */
10271 	vm_map_lock_read(dst_map);
10272 	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
10273 		vm_map_unlock_read(dst_map);
10274 		goto blunt_copy;
10275 	}
10276 	for (;
10277 	    (entry != vm_map_to_entry(dst_map) &&
10278 	    entry->vme_start < dst_addr + copy_size);
10279 	    entry = entry->vme_next) {
10280 		if (entry->is_sub_map) {
10281 			vm_map_unlock_read(dst_map);
10282 			goto blunt_copy;
10283 		}
10284 	}
10285 	vm_map_unlock_read(dst_map);
10286 
10287 	if (head_size) {
10288 		/*
10289 		 * Unaligned copy of the first "head_size" bytes, to reach
10290 		 * a page boundary.
10291 		 */
10292 
10293 		/*
10294 		 * Extract "head_copy" out of "copy".
10295 		 */
10296 		head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10297 		head_copy->cpy_hdr.entries_pageable =
10298 		    copy->cpy_hdr.entries_pageable;
10299 		head_copy->cpy_hdr.page_shift = copy_page_shift;
10300 
10301 		entry = vm_map_copy_first_entry(copy);
10302 		if (entry->vme_end < copy->offset + head_size) {
10303 			head_size = entry->vme_end - copy->offset;
10304 		}
10305 
10306 		head_copy->offset = copy->offset;
10307 		head_copy->size = head_size;
10308 		copy->offset += head_size;
10309 		copy->size -= head_size;
10310 		copy_size -= head_size;
10311 		assert(copy_size > 0);
10312 
10313 		vm_map_copy_clip_end(copy, entry, copy->offset);
10314 		vm_map_copy_entry_unlink(copy, entry);
10315 		vm_map_copy_entry_link(head_copy,
10316 		    vm_map_copy_to_entry(head_copy),
10317 		    entry);
10318 
10319 		/*
10320 		 * Do the unaligned copy.
10321 		 */
10322 		kr = vm_map_copy_overwrite_nested(dst_map,
10323 		    head_addr,
10324 		    head_copy,
10325 		    interruptible,
10326 		    (pmap_t) NULL,
10327 		    FALSE);
10328 		if (kr != KERN_SUCCESS) {
10329 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
10330 			goto done;
10331 		}
10332 	}
10333 
10334 	if (tail_size) {
10335 		/*
10336 		 * Extract "tail_copy" out of "copy".
10337 		 */
10338 		tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10339 		tail_copy->cpy_hdr.entries_pageable =
10340 		    copy->cpy_hdr.entries_pageable;
10341 		tail_copy->cpy_hdr.page_shift = copy_page_shift;
10342 
10343 		tail_copy->offset = copy->offset + copy_size - tail_size;
10344 		tail_copy->size = tail_size;
10345 
10346 		copy->size -= tail_size;
10347 		copy_size -= tail_size;
10348 		assert(copy_size > 0);
10349 
10350 		entry = vm_map_copy_last_entry(copy);
10351 		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
10352 		entry = vm_map_copy_last_entry(copy);
10353 		vm_map_copy_entry_unlink(copy, entry);
10354 		vm_map_copy_entry_link(tail_copy,
10355 		    vm_map_copy_last_entry(tail_copy),
10356 		    entry);
10357 	}
10358 
10359 	/*
10360 	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
10361 	 * we want to avoid TOCTOU issues w.r.t copy->size but
10362 	 * we don't need to change vm_map_copy_overwrite_nested()
10363 	 * and all other vm_map_copy_overwrite variants.
10364 	 *
10365 	 * So we assign the original copy_size that was passed into
10366 	 * this routine back to copy.
10367 	 *
10368 	 * This use of local 'copy_size' passed into this routine is
10369 	 * to try and protect against TOCTOU attacks where the kernel
10370 	 * has been exploited. We don't expect this to be an issue
10371 	 * during normal system operation.
10372 	 */
10373 	assertf(copy->size == copy_size,
10374 	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
10375 	copy->size = copy_size;
10376 
10377 	/*
10378 	 * Copy most (or possibly all) of the data.
10379 	 */
10380 	kr = vm_map_copy_overwrite_nested(dst_map,
10381 	    dst_addr + head_size,
10382 	    copy,
10383 	    interruptible,
10384 	    (pmap_t) NULL,
10385 	    FALSE);
10386 	if (kr != KERN_SUCCESS) {
10387 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
10388 		goto done;
10389 	}
10390 
10391 	if (tail_size) {
10392 		kr = vm_map_copy_overwrite_nested(dst_map,
10393 		    tail_addr,
10394 		    tail_copy,
10395 		    interruptible,
10396 		    (pmap_t) NULL,
10397 		    FALSE);
10398 		if (kr) {
10399 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
10400 		}
10401 	}
10402 
10403 done:
10404 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
10405 	if (kr == KERN_SUCCESS) {
10406 		/*
10407 		 * Discard all the copy maps.
10408 		 */
10409 		if (head_copy) {
10410 			vm_map_copy_discard(head_copy);
10411 			head_copy = NULL;
10412 		}
10413 		vm_map_copy_discard(copy);
10414 		if (tail_copy) {
10415 			vm_map_copy_discard(tail_copy);
10416 			tail_copy = NULL;
10417 		}
10418 	} else {
10419 		/*
10420 		 * Re-assemble the original copy map.
10421 		 */
10422 		if (head_copy) {
10423 			entry = vm_map_copy_first_entry(head_copy);
10424 			vm_map_copy_entry_unlink(head_copy, entry);
10425 			vm_map_copy_entry_link(copy,
10426 			    vm_map_copy_to_entry(copy),
10427 			    entry);
10428 			copy->offset -= head_size;
10429 			copy->size += head_size;
10430 			vm_map_copy_discard(head_copy);
10431 			head_copy = NULL;
10432 		}
10433 		if (tail_copy) {
10434 			entry = vm_map_copy_last_entry(tail_copy);
10435 			vm_map_copy_entry_unlink(tail_copy, entry);
10436 			vm_map_copy_entry_link(copy,
10437 			    vm_map_copy_last_entry(copy),
10438 			    entry);
10439 			copy->size += tail_size;
10440 			vm_map_copy_discard(tail_copy);
10441 			tail_copy = NULL;
10442 		}
10443 	}
10444 	return kr;
10445 }
10446 
10447 
10448 /*
10449  *	Routine: vm_map_copy_overwrite_unaligned	[internal use only]
10450  *
10451  *	Decription:
10452  *	Physically copy unaligned data
10453  *
10454  *	Implementation:
10455  *	Unaligned parts of pages have to be physically copied.  We use
10456  *	a modified form of vm_fault_copy (which understands none-aligned
10457  *	page offsets and sizes) to do the copy.  We attempt to copy as
10458  *	much memory in one go as possibly, however vm_fault_copy copies
10459  *	within 1 memory object so we have to find the smaller of "amount left"
10460  *	"source object data size" and "target object data size".  With
10461  *	unaligned data we don't need to split regions, therefore the source
10462  *	(copy) object should be one map entry, the target range may be split
10463  *	over multiple map entries however.  In any event we are pessimistic
10464  *	about these assumptions.
10465  *
10466  *	Callers of this function must call vm_map_copy_require on
10467  *	previously created vm_map_copy_t or pass a newly created
10468  *	one to ensure that it hasn't been forged.
10469  *
10470  *	Assumptions:
10471  *	dst_map is locked on entry and is return locked on success,
10472  *	unlocked on error.
10473  */
10474 
10475 static kern_return_t
vm_map_copy_overwrite_unaligned(vm_map_t dst_map,vm_map_entry_t entry,vm_map_copy_t copy,vm_map_offset_t start,boolean_t discard_on_success)10476 vm_map_copy_overwrite_unaligned(
10477 	vm_map_t        dst_map,
10478 	vm_map_entry_t  entry,
10479 	vm_map_copy_t   copy,
10480 	vm_map_offset_t start,
10481 	boolean_t       discard_on_success)
10482 {
10483 	vm_map_entry_t          copy_entry;
10484 	vm_map_entry_t          copy_entry_next;
10485 	vm_map_version_t        version;
10486 	vm_object_t             dst_object;
10487 	vm_object_offset_t      dst_offset;
10488 	vm_object_offset_t      src_offset;
10489 	vm_object_offset_t      entry_offset;
10490 	vm_map_offset_t         entry_end;
10491 	vm_map_size_t           src_size,
10492 	    dst_size,
10493 	    copy_size,
10494 	    amount_left;
10495 	kern_return_t           kr = KERN_SUCCESS;
10496 
10497 
10498 	copy_entry = vm_map_copy_first_entry(copy);
10499 
10500 	vm_map_lock_write_to_read(dst_map);
10501 
10502 	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
10503 	amount_left = copy->size;
10504 /*
10505  *	unaligned so we never clipped this entry, we need the offset into
10506  *	the vm_object not just the data.
10507  */
10508 	while (amount_left > 0) {
10509 		if (entry == vm_map_to_entry(dst_map)) {
10510 			vm_map_unlock_read(dst_map);
10511 			return KERN_INVALID_ADDRESS;
10512 		}
10513 
10514 		/* "start" must be within the current map entry */
10515 		assert((start >= entry->vme_start) && (start < entry->vme_end));
10516 
10517 		/*
10518 		 *	Check protection again
10519 		 */
10520 		if (!(entry->protection & VM_PROT_WRITE)) {
10521 			vm_map_unlock_read(dst_map);
10522 			return KERN_PROTECTION_FAILURE;
10523 		}
10524 		if (entry->is_sub_map) {
10525 			/* not implemented... */
10526 			vm_map_unlock_read(dst_map);
10527 			return KERN_INVALID_ARGUMENT;
10528 		}
10529 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10530 			vm_map_unlock_read(dst_map);
10531 			return KERN_PROTECTION_FAILURE;
10532 		}
10533 		/*
10534 		 *	If the entry is in transition, we must wait
10535 		 *	for it to exit that state.  Anything could happen
10536 		 *	when we unlock the map, so start over.
10537 		 */
10538 		if (entry->in_transition) {
10539 			/*
10540 			 * Say that we are waiting, and wait for entry.
10541 			 */
10542 			entry->needs_wakeup = TRUE;
10543 			vm_map_entry_wait(dst_map, THREAD_UNINT);
10544 
10545 			goto RetryLookup;
10546 		}
10547 
10548 		dst_offset = start - entry->vme_start;
10549 
10550 		dst_size = entry->vme_end - start;
10551 
10552 		src_size = copy_entry->vme_end -
10553 		    (copy_entry->vme_start + src_offset);
10554 
10555 		if (dst_size < src_size) {
10556 /*
10557  *			we can only copy dst_size bytes before
10558  *			we have to get the next destination entry
10559  */
10560 			copy_size = dst_size;
10561 		} else {
10562 /*
10563  *			we can only copy src_size bytes before
10564  *			we have to get the next source copy entry
10565  */
10566 			copy_size = src_size;
10567 		}
10568 
10569 		if (copy_size > amount_left) {
10570 			copy_size = amount_left;
10571 		}
10572 /*
10573  *		Entry needs copy, create a shadow shadow object for
10574  *		Copy on write region.
10575  */
10576 		assert(!entry->is_sub_map);
10577 		if (entry->needs_copy) {
10578 			if (vm_map_lock_read_to_write(dst_map)) {
10579 				vm_map_lock_read(dst_map);
10580 				goto RetryLookup;
10581 			}
10582 			VME_OBJECT_SHADOW(entry,
10583 			    (vm_map_size_t)(entry->vme_end
10584 			    - entry->vme_start),
10585 			    vm_map_always_shadow(dst_map));
10586 			entry->needs_copy = FALSE;
10587 			vm_map_lock_write_to_read(dst_map);
10588 		}
10589 		dst_object = VME_OBJECT(entry);
10590 /*
10591  *		unlike with the virtual (aligned) copy we're going
10592  *		to fault on it therefore we need a target object.
10593  */
10594 		if (dst_object == VM_OBJECT_NULL) {
10595 			if (vm_map_lock_read_to_write(dst_map)) {
10596 				vm_map_lock_read(dst_map);
10597 				goto RetryLookup;
10598 			}
10599 			dst_object = vm_object_allocate((vm_map_size_t)
10600 			    entry->vme_end - entry->vme_start);
10601 			VME_OBJECT_SET(entry, dst_object, false, 0);
10602 			VME_OFFSET_SET(entry, 0);
10603 			assert(entry->use_pmap);
10604 			vm_map_lock_write_to_read(dst_map);
10605 		}
10606 /*
10607  *		Take an object reference and unlock map. The "entry" may
10608  *		disappear or change when the map is unlocked.
10609  */
10610 		vm_object_reference(dst_object);
10611 		version.main_timestamp = dst_map->timestamp;
10612 		entry_offset = VME_OFFSET(entry);
10613 		entry_end = entry->vme_end;
10614 		vm_map_unlock_read(dst_map);
10615 /*
10616  *		Copy as much as possible in one pass
10617  */
10618 		kr = vm_fault_copy(
10619 			VME_OBJECT(copy_entry),
10620 			VME_OFFSET(copy_entry) + src_offset,
10621 			&copy_size,
10622 			dst_object,
10623 			entry_offset + dst_offset,
10624 			dst_map,
10625 			&version,
10626 			THREAD_UNINT );
10627 
10628 		start += copy_size;
10629 		src_offset += copy_size;
10630 		amount_left -= copy_size;
10631 /*
10632  *		Release the object reference
10633  */
10634 		vm_object_deallocate(dst_object);
10635 /*
10636  *		If a hard error occurred, return it now
10637  */
10638 		if (kr != KERN_SUCCESS) {
10639 			return kr;
10640 		}
10641 
10642 		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
10643 		    || amount_left == 0) {
10644 /*
10645  *			all done with this copy entry, dispose.
10646  */
10647 			copy_entry_next = copy_entry->vme_next;
10648 
10649 			if (discard_on_success) {
10650 				vm_map_copy_entry_unlink(copy, copy_entry);
10651 				assert(!copy_entry->is_sub_map);
10652 				vm_object_deallocate(VME_OBJECT(copy_entry));
10653 				vm_map_copy_entry_dispose(copy_entry);
10654 			}
10655 
10656 			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
10657 			    amount_left) {
10658 /*
10659  *				not finished copying but run out of source
10660  */
10661 				return KERN_INVALID_ADDRESS;
10662 			}
10663 
10664 			copy_entry = copy_entry_next;
10665 
10666 			src_offset = 0;
10667 		}
10668 
10669 		if (amount_left == 0) {
10670 			return KERN_SUCCESS;
10671 		}
10672 
10673 		vm_map_lock_read(dst_map);
10674 		if (version.main_timestamp == dst_map->timestamp) {
10675 			if (start == entry_end) {
10676 /*
10677  *				destination region is split.  Use the version
10678  *				information to avoid a lookup in the normal
10679  *				case.
10680  */
10681 				entry = entry->vme_next;
10682 /*
10683  *				should be contiguous. Fail if we encounter
10684  *				a hole in the destination.
10685  */
10686 				if (start != entry->vme_start) {
10687 					vm_map_unlock_read(dst_map);
10688 					return KERN_INVALID_ADDRESS;
10689 				}
10690 			}
10691 		} else {
10692 /*
10693  *			Map version check failed.
10694  *			we must lookup the entry because somebody
10695  *			might have changed the map behind our backs.
10696  */
10697 RetryLookup:
10698 			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
10699 				vm_map_unlock_read(dst_map);
10700 				return KERN_INVALID_ADDRESS;
10701 			}
10702 		}
10703 	}/* while */
10704 
10705 	return KERN_SUCCESS;
10706 }/* vm_map_copy_overwrite_unaligned */
10707 
10708 /*
10709  *	Routine: vm_map_copy_overwrite_aligned	[internal use only]
10710  *
10711  *	Description:
10712  *	Does all the vm_trickery possible for whole pages.
10713  *
10714  *	Implementation:
10715  *
10716  *	If there are no permanent objects in the destination,
10717  *	and the source and destination map entry zones match,
10718  *	and the destination map entry is not shared,
10719  *	then the map entries can be deleted and replaced
10720  *	with those from the copy.  The following code is the
10721  *	basic idea of what to do, but there are lots of annoying
10722  *	little details about getting protection and inheritance
10723  *	right.  Should add protection, inheritance, and sharing checks
10724  *	to the above pass and make sure that no wiring is involved.
10725  *
10726  *	Callers of this function must call vm_map_copy_require on
10727  *	previously created vm_map_copy_t or pass a newly created
10728  *	one to ensure that it hasn't been forged.
10729  */
10730 
10731 int vm_map_copy_overwrite_aligned_src_not_internal = 0;
10732 int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
10733 int vm_map_copy_overwrite_aligned_src_large = 0;
10734 
10735 static kern_return_t
vm_map_copy_overwrite_aligned(vm_map_t dst_map,vm_map_entry_t tmp_entry,vm_map_copy_t copy,vm_map_offset_t start,__unused pmap_t pmap)10736 vm_map_copy_overwrite_aligned(
10737 	vm_map_t        dst_map,
10738 	vm_map_entry_t  tmp_entry,
10739 	vm_map_copy_t   copy,
10740 	vm_map_offset_t start,
10741 	__unused pmap_t pmap)
10742 {
10743 	vm_object_t     object;
10744 	vm_map_entry_t  copy_entry;
10745 	vm_map_size_t   copy_size;
10746 	vm_map_size_t   size;
10747 	vm_map_entry_t  entry;
10748 
10749 	while ((copy_entry = vm_map_copy_first_entry(copy))
10750 	    != vm_map_copy_to_entry(copy)) {
10751 		copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10752 
10753 		entry = tmp_entry;
10754 
10755 		if (entry->is_sub_map) {
10756 			/* unnested when clipped earlier */
10757 			assert(!entry->use_pmap);
10758 		}
10759 		if (entry == vm_map_to_entry(dst_map)) {
10760 			vm_map_unlock(dst_map);
10761 			return KERN_INVALID_ADDRESS;
10762 		}
10763 		size = (entry->vme_end - entry->vme_start);
10764 		/*
10765 		 *	Make sure that no holes popped up in the
10766 		 *	address map, and that the protection is
10767 		 *	still valid, in case the map was unlocked
10768 		 *	earlier.
10769 		 */
10770 
10771 		if ((entry->vme_start != start) || ((entry->is_sub_map)
10772 		    && !entry->needs_copy)) {
10773 			vm_map_unlock(dst_map);
10774 			return KERN_INVALID_ADDRESS;
10775 		}
10776 		assert(entry != vm_map_to_entry(dst_map));
10777 
10778 		/*
10779 		 *	Check protection again
10780 		 */
10781 
10782 		if (!(entry->protection & VM_PROT_WRITE)) {
10783 			vm_map_unlock(dst_map);
10784 			return KERN_PROTECTION_FAILURE;
10785 		}
10786 
10787 		if (entry->is_sub_map) {
10788 			/* not properly implemented */
10789 			vm_map_unlock(dst_map);
10790 			return KERN_PROTECTION_FAILURE;
10791 		}
10792 
10793 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10794 			vm_map_unlock(dst_map);
10795 			return KERN_PROTECTION_FAILURE;
10796 		}
10797 
10798 		/*
10799 		 *	If the entry is in transition, we must wait
10800 		 *	for it to exit that state.  Anything could happen
10801 		 *	when we unlock the map, so start over.
10802 		 */
10803 		if (entry->in_transition) {
10804 			/*
10805 			 * Say that we are waiting, and wait for entry.
10806 			 */
10807 			entry->needs_wakeup = TRUE;
10808 			vm_map_entry_wait(dst_map, THREAD_UNINT);
10809 
10810 			goto RetryLookup;
10811 		}
10812 
10813 		/*
10814 		 *	Adjust to source size first
10815 		 */
10816 
10817 		if (copy_size < size) {
10818 			if (entry->map_aligned &&
10819 			    !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10820 			    VM_MAP_PAGE_MASK(dst_map))) {
10821 				/* no longer map-aligned */
10822 				entry->map_aligned = FALSE;
10823 			}
10824 			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10825 			size = copy_size;
10826 		}
10827 
10828 		/*
10829 		 *	Adjust to destination size
10830 		 */
10831 
10832 		if (size < copy_size) {
10833 			vm_map_copy_clip_end(copy, copy_entry,
10834 			    copy_entry->vme_start + size);
10835 			copy_size = size;
10836 		}
10837 
10838 		assert((entry->vme_end - entry->vme_start) == size);
10839 		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10840 		assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10841 
10842 		/*
10843 		 *	If the destination contains temporary unshared memory,
10844 		 *	we can perform the copy by throwing it away and
10845 		 *	installing the source data.
10846 		 *
10847 		 *	Exceptions for mappings with special semantics:
10848 		 *	+ "permanent" entries,
10849 		 *	+ JIT regions,
10850 		 *	+ TPRO regions,
10851 		 *      + pmap-specific protection policies,
10852 		 *	+ VM objects with COPY_NONE copy strategy.
10853 		 */
10854 
10855 		object = VME_OBJECT(entry);
10856 		if ((!entry->is_shared &&
10857 		    !entry->vme_permanent &&
10858 		    !entry->used_for_jit &&
10859 #if __arm64e__
10860 		    !entry->used_for_tpro &&
10861 #endif /* __arm64e__ */
10862 		    !(entry->protection & VM_PROT_EXECUTE) &&
10863 		    !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
10864 		    ((object == VM_OBJECT_NULL) ||
10865 		    (object->internal &&
10866 		    !object->true_share &&
10867 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
10868 		    entry->needs_copy) {
10869 			vm_object_t     old_object = VME_OBJECT(entry);
10870 			vm_object_offset_t      old_offset = VME_OFFSET(entry);
10871 			vm_object_offset_t      offset;
10872 
10873 			assert(!entry->is_sub_map);
10874 			/*
10875 			 * Ensure that the source and destination aren't
10876 			 * identical
10877 			 */
10878 			if (old_object == VME_OBJECT(copy_entry) &&
10879 			    old_offset == VME_OFFSET(copy_entry)) {
10880 				vm_map_copy_entry_unlink(copy, copy_entry);
10881 				vm_map_copy_entry_dispose(copy_entry);
10882 
10883 				if (old_object != VM_OBJECT_NULL) {
10884 					vm_object_deallocate(old_object);
10885 				}
10886 
10887 				start = tmp_entry->vme_end;
10888 				tmp_entry = tmp_entry->vme_next;
10889 				continue;
10890 			}
10891 
10892 #if XNU_TARGET_OS_OSX
10893 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10894 #define __TRADEOFF1_COPY_SIZE (128 * 1024)      /* 128 KB */
10895 			if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10896 			    VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10897 			    copy_size <= __TRADEOFF1_COPY_SIZE) {
10898 				/*
10899 				 * Virtual vs. Physical copy tradeoff #1.
10900 				 *
10901 				 * Copying only a few pages out of a large
10902 				 * object:  do a physical copy instead of
10903 				 * a virtual copy, to avoid possibly keeping
10904 				 * the entire large object alive because of
10905 				 * those few copy-on-write pages.
10906 				 */
10907 				vm_map_copy_overwrite_aligned_src_large++;
10908 				goto slow_copy;
10909 			}
10910 #endif /* XNU_TARGET_OS_OSX */
10911 
10912 			if ((dst_map->pmap != kernel_pmap) &&
10913 			    (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10914 			    (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10915 				vm_object_t new_object, new_shadow;
10916 
10917 				/*
10918 				 * We're about to map something over a mapping
10919 				 * established by malloc()...
10920 				 */
10921 				new_object = VME_OBJECT(copy_entry);
10922 				if (new_object != VM_OBJECT_NULL) {
10923 					vm_object_lock_shared(new_object);
10924 				}
10925 				while (new_object != VM_OBJECT_NULL &&
10926 #if XNU_TARGET_OS_OSX
10927 				    !new_object->true_share &&
10928 				    new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10929 #endif /* XNU_TARGET_OS_OSX */
10930 				    new_object->internal) {
10931 					new_shadow = new_object->shadow;
10932 					if (new_shadow == VM_OBJECT_NULL) {
10933 						break;
10934 					}
10935 					vm_object_lock_shared(new_shadow);
10936 					vm_object_unlock(new_object);
10937 					new_object = new_shadow;
10938 				}
10939 				if (new_object != VM_OBJECT_NULL) {
10940 					if (!new_object->internal) {
10941 						/*
10942 						 * The new mapping is backed
10943 						 * by an external object.  We
10944 						 * don't want malloc'ed memory
10945 						 * to be replaced with such a
10946 						 * non-anonymous mapping, so
10947 						 * let's go off the optimized
10948 						 * path...
10949 						 */
10950 						vm_map_copy_overwrite_aligned_src_not_internal++;
10951 						vm_object_unlock(new_object);
10952 						goto slow_copy;
10953 					}
10954 #if XNU_TARGET_OS_OSX
10955 					if (new_object->true_share ||
10956 					    new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10957 						/*
10958 						 * Same if there's a "true_share"
10959 						 * object in the shadow chain, or
10960 						 * an object with a non-default
10961 						 * (SYMMETRIC) copy strategy.
10962 						 */
10963 						vm_map_copy_overwrite_aligned_src_not_symmetric++;
10964 						vm_object_unlock(new_object);
10965 						goto slow_copy;
10966 					}
10967 #endif /* XNU_TARGET_OS_OSX */
10968 					vm_object_unlock(new_object);
10969 				}
10970 				/*
10971 				 * The new mapping is still backed by
10972 				 * anonymous (internal) memory, so it's
10973 				 * OK to substitute it for the original
10974 				 * malloc() mapping.
10975 				 */
10976 			}
10977 
10978 			if (old_object != VM_OBJECT_NULL) {
10979 				assert(!entry->vme_permanent);
10980 				if (entry->is_sub_map) {
10981 					if (entry->use_pmap) {
10982 #ifndef NO_NESTED_PMAP
10983 						pmap_unnest(dst_map->pmap,
10984 						    (addr64_t)entry->vme_start,
10985 						    entry->vme_end - entry->vme_start);
10986 #endif  /* NO_NESTED_PMAP */
10987 						if (dst_map->mapped_in_other_pmaps) {
10988 							/* clean up parent */
10989 							/* map/maps */
10990 							vm_map_submap_pmap_clean(
10991 								dst_map, entry->vme_start,
10992 								entry->vme_end,
10993 								VME_SUBMAP(entry),
10994 								VME_OFFSET(entry));
10995 						}
10996 					} else {
10997 						vm_map_submap_pmap_clean(
10998 							dst_map, entry->vme_start,
10999 							entry->vme_end,
11000 							VME_SUBMAP(entry),
11001 							VME_OFFSET(entry));
11002 					}
11003 					vm_map_deallocate(VME_SUBMAP(entry));
11004 				} else {
11005 					if (dst_map->mapped_in_other_pmaps) {
11006 						vm_object_pmap_protect_options(
11007 							VME_OBJECT(entry),
11008 							VME_OFFSET(entry),
11009 							entry->vme_end
11010 							- entry->vme_start,
11011 							PMAP_NULL,
11012 							PAGE_SIZE,
11013 							entry->vme_start,
11014 							VM_PROT_NONE,
11015 							PMAP_OPTIONS_REMOVE);
11016 					} else {
11017 						pmap_remove_options(
11018 							dst_map->pmap,
11019 							(addr64_t)(entry->vme_start),
11020 							(addr64_t)(entry->vme_end),
11021 							PMAP_OPTIONS_REMOVE);
11022 					}
11023 					vm_object_deallocate(old_object);
11024 				}
11025 			}
11026 
11027 			if (entry->iokit_acct) {
11028 				/* keep using iokit accounting */
11029 				entry->use_pmap = FALSE;
11030 			} else {
11031 				/* use pmap accounting */
11032 				entry->use_pmap = TRUE;
11033 			}
11034 			assert(!entry->vme_permanent);
11035 			VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
11036 			object = VME_OBJECT(entry);
11037 			entry->needs_copy = copy_entry->needs_copy;
11038 			entry->wired_count = 0;
11039 			entry->user_wired_count = 0;
11040 			offset = VME_OFFSET(copy_entry);
11041 			VME_OFFSET_SET(entry, offset);
11042 
11043 			vm_map_copy_entry_unlink(copy, copy_entry);
11044 			vm_map_copy_entry_dispose(copy_entry);
11045 
11046 			/*
11047 			 * we could try to push pages into the pmap at this point, BUT
11048 			 * this optimization only saved on average 2 us per page if ALL
11049 			 * the pages in the source were currently mapped
11050 			 * and ALL the pages in the dest were touched, if there were fewer
11051 			 * than 2/3 of the pages touched, this optimization actually cost more cycles
11052 			 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
11053 			 */
11054 
11055 			/*
11056 			 *	Set up for the next iteration.  The map
11057 			 *	has not been unlocked, so the next
11058 			 *	address should be at the end of this
11059 			 *	entry, and the next map entry should be
11060 			 *	the one following it.
11061 			 */
11062 
11063 			start = tmp_entry->vme_end;
11064 			tmp_entry = tmp_entry->vme_next;
11065 		} else {
11066 			vm_map_version_t        version;
11067 			vm_object_t             dst_object;
11068 			vm_object_offset_t      dst_offset;
11069 			kern_return_t           r;
11070 
11071 slow_copy:
11072 			if (entry->needs_copy) {
11073 				VME_OBJECT_SHADOW(entry,
11074 				    (entry->vme_end -
11075 				    entry->vme_start),
11076 				    vm_map_always_shadow(dst_map));
11077 				entry->needs_copy = FALSE;
11078 			}
11079 
11080 			dst_object = VME_OBJECT(entry);
11081 			dst_offset = VME_OFFSET(entry);
11082 
11083 			/*
11084 			 *	Take an object reference, and record
11085 			 *	the map version information so that the
11086 			 *	map can be safely unlocked.
11087 			 */
11088 
11089 			if (dst_object == VM_OBJECT_NULL) {
11090 				/*
11091 				 * We would usually have just taken the
11092 				 * optimized path above if the destination
11093 				 * object has not been allocated yet.  But we
11094 				 * now disable that optimization if the copy
11095 				 * entry's object is not backed by anonymous
11096 				 * memory to avoid replacing malloc'ed
11097 				 * (i.e. re-usable) anonymous memory with a
11098 				 * not-so-anonymous mapping.
11099 				 * So we have to handle this case here and
11100 				 * allocate a new VM object for this map entry.
11101 				 */
11102 				dst_object = vm_object_allocate(
11103 					entry->vme_end - entry->vme_start);
11104 				dst_offset = 0;
11105 				VME_OBJECT_SET(entry, dst_object, false, 0);
11106 				VME_OFFSET_SET(entry, dst_offset);
11107 				assert(entry->use_pmap);
11108 			}
11109 
11110 			vm_object_reference(dst_object);
11111 
11112 			/* account for unlock bumping up timestamp */
11113 			version.main_timestamp = dst_map->timestamp + 1;
11114 
11115 			vm_map_unlock(dst_map);
11116 
11117 			/*
11118 			 *	Copy as much as possible in one pass
11119 			 */
11120 
11121 			copy_size = size;
11122 			r = vm_fault_copy(
11123 				VME_OBJECT(copy_entry),
11124 				VME_OFFSET(copy_entry),
11125 				&copy_size,
11126 				dst_object,
11127 				dst_offset,
11128 				dst_map,
11129 				&version,
11130 				THREAD_UNINT );
11131 
11132 			/*
11133 			 *	Release the object reference
11134 			 */
11135 
11136 			vm_object_deallocate(dst_object);
11137 
11138 			/*
11139 			 *	If a hard error occurred, return it now
11140 			 */
11141 
11142 			if (r != KERN_SUCCESS) {
11143 				return r;
11144 			}
11145 
11146 			if (copy_size != 0) {
11147 				/*
11148 				 *	Dispose of the copied region
11149 				 */
11150 
11151 				vm_map_copy_clip_end(copy, copy_entry,
11152 				    copy_entry->vme_start + copy_size);
11153 				vm_map_copy_entry_unlink(copy, copy_entry);
11154 				vm_object_deallocate(VME_OBJECT(copy_entry));
11155 				vm_map_copy_entry_dispose(copy_entry);
11156 			}
11157 
11158 			/*
11159 			 *	Pick up in the destination map where we left off.
11160 			 *
11161 			 *	Use the version information to avoid a lookup
11162 			 *	in the normal case.
11163 			 */
11164 
11165 			start += copy_size;
11166 			vm_map_lock(dst_map);
11167 			if (version.main_timestamp == dst_map->timestamp &&
11168 			    copy_size != 0) {
11169 				/* We can safely use saved tmp_entry value */
11170 
11171 				if (tmp_entry->map_aligned &&
11172 				    !VM_MAP_PAGE_ALIGNED(
11173 					    start,
11174 					    VM_MAP_PAGE_MASK(dst_map))) {
11175 					/* no longer map-aligned */
11176 					tmp_entry->map_aligned = FALSE;
11177 				}
11178 				vm_map_clip_end(dst_map, tmp_entry, start);
11179 				tmp_entry = tmp_entry->vme_next;
11180 			} else {
11181 				/* Must do lookup of tmp_entry */
11182 
11183 RetryLookup:
11184 				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
11185 					vm_map_unlock(dst_map);
11186 					return KERN_INVALID_ADDRESS;
11187 				}
11188 				if (tmp_entry->map_aligned &&
11189 				    !VM_MAP_PAGE_ALIGNED(
11190 					    start,
11191 					    VM_MAP_PAGE_MASK(dst_map))) {
11192 					/* no longer map-aligned */
11193 					tmp_entry->map_aligned = FALSE;
11194 				}
11195 				vm_map_clip_start(dst_map, tmp_entry, start);
11196 			}
11197 		}
11198 	}/* while */
11199 
11200 	return KERN_SUCCESS;
11201 }/* vm_map_copy_overwrite_aligned */
11202 
11203 /*
11204  *	Routine: vm_map_copyin_kernel_buffer [internal use only]
11205  *
11206  *	Description:
11207  *		Copy in data to a kernel buffer from space in the
11208  *		source map. The original space may be optionally
11209  *		deallocated.
11210  *
11211  *		If successful, returns a new copy object.
11212  */
11213 static kern_return_t
vm_map_copyin_kernel_buffer(vm_map_t src_map,vm_map_offset_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)11214 vm_map_copyin_kernel_buffer(
11215 	vm_map_t        src_map,
11216 	vm_map_offset_t src_addr,
11217 	vm_map_size_t   len,
11218 	boolean_t       src_destroy,
11219 	vm_map_copy_t   *copy_result)
11220 {
11221 	kern_return_t kr;
11222 	vm_map_copy_t copy;
11223 	void *kdata;
11224 
11225 	if (len > msg_ool_size_small) {
11226 		return KERN_INVALID_ARGUMENT;
11227 	}
11228 
11229 	kdata = kalloc_data(len, Z_WAITOK);
11230 	if (kdata == NULL) {
11231 		return KERN_RESOURCE_SHORTAGE;
11232 	}
11233 	kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
11234 	if (kr != KERN_SUCCESS) {
11235 		kfree_data(kdata, len);
11236 		return kr;
11237 	}
11238 
11239 	copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11240 	copy->cpy_kdata = kdata;
11241 	copy->size = len;
11242 	copy->offset = 0;
11243 
11244 	if (src_destroy) {
11245 		vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11246 
11247 		if (src_map == kernel_map) {
11248 			flags |= VM_MAP_REMOVE_KUNWIRE;
11249 		}
11250 
11251 		(void)vm_map_remove_guard(src_map,
11252 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11253 		    vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11254 		    flags, KMEM_GUARD_NONE);
11255 	}
11256 
11257 	*copy_result = copy;
11258 	return KERN_SUCCESS;
11259 }
11260 
11261 /*
11262  *	Routine: vm_map_copyout_kernel_buffer	[internal use only]
11263  *
11264  *	Description:
11265  *		Copy out data from a kernel buffer into space in the
11266  *		destination map. The space may be otpionally dynamically
11267  *		allocated.
11268  *
11269  *		If successful, consumes the copy object.
11270  *		Otherwise, the caller is responsible for it.
11271  *
11272  *		Callers of this function must call vm_map_copy_require on
11273  *		previously created vm_map_copy_t or pass a newly created
11274  *		one to ensure that it hasn't been forged.
11275  */
11276 static int vm_map_copyout_kernel_buffer_failures = 0;
11277 static kern_return_t
vm_map_copyout_kernel_buffer(vm_map_t map,vm_map_address_t * addr,vm_map_copy_t copy,vm_map_size_t copy_size,boolean_t overwrite,boolean_t consume_on_success)11278 vm_map_copyout_kernel_buffer(
11279 	vm_map_t                map,
11280 	vm_map_address_t        *addr,  /* IN/OUT */
11281 	vm_map_copy_t           copy,
11282 	vm_map_size_t           copy_size,
11283 	boolean_t               overwrite,
11284 	boolean_t               consume_on_success)
11285 {
11286 	kern_return_t kr = KERN_SUCCESS;
11287 	thread_t thread = current_thread();
11288 
11289 	assert(copy->size == copy_size);
11290 
11291 	/*
11292 	 * check for corrupted vm_map_copy structure
11293 	 */
11294 	if (copy_size > msg_ool_size_small || copy->offset) {
11295 		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
11296 		    (long long)copy->size, (long long)copy->offset);
11297 	}
11298 
11299 	if (!overwrite) {
11300 		/*
11301 		 * Allocate space in the target map for the data
11302 		 */
11303 		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11304 
11305 		if (map == kernel_map) {
11306 			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
11307 		}
11308 
11309 		*addr = 0;
11310 		kr = vm_map_enter(map,
11311 		    addr,
11312 		    vm_map_round_page(copy_size,
11313 		    VM_MAP_PAGE_MASK(map)),
11314 		    (vm_map_offset_t) 0,
11315 		    vmk_flags,
11316 		    VM_OBJECT_NULL,
11317 		    (vm_object_offset_t) 0,
11318 		    FALSE,
11319 		    VM_PROT_DEFAULT,
11320 		    VM_PROT_ALL,
11321 		    VM_INHERIT_DEFAULT);
11322 		if (kr != KERN_SUCCESS) {
11323 			return kr;
11324 		}
11325 #if KASAN
11326 		if (map->pmap == kernel_pmap) {
11327 			kasan_notify_address(*addr, copy->size);
11328 		}
11329 #endif
11330 	}
11331 
11332 	/*
11333 	 * Copyout the data from the kernel buffer to the target map.
11334 	 */
11335 	if (thread->map == map) {
11336 		/*
11337 		 * If the target map is the current map, just do
11338 		 * the copy.
11339 		 */
11340 		assert((vm_size_t)copy_size == copy_size);
11341 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11342 			kr = KERN_INVALID_ADDRESS;
11343 		}
11344 	} else {
11345 		vm_map_switch_context_t switch_ctx;
11346 
11347 		/*
11348 		 * If the target map is another map, assume the
11349 		 * target's address space identity for the duration
11350 		 * of the copy.
11351 		 */
11352 		vm_map_reference(map);
11353 		switch_ctx = vm_map_switch_to(map);
11354 
11355 		assert((vm_size_t)copy_size == copy_size);
11356 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11357 			vm_map_copyout_kernel_buffer_failures++;
11358 			kr = KERN_INVALID_ADDRESS;
11359 		}
11360 
11361 		vm_map_switch_back(switch_ctx);
11362 		vm_map_deallocate(map);
11363 	}
11364 
11365 	if (kr != KERN_SUCCESS) {
11366 		/* the copy failed, clean up */
11367 		if (!overwrite) {
11368 			/*
11369 			 * Deallocate the space we allocated in the target map.
11370 			 */
11371 			(void) vm_map_remove(map,
11372 			    vm_map_trunc_page(*addr,
11373 			    VM_MAP_PAGE_MASK(map)),
11374 			    vm_map_round_page((*addr +
11375 			    vm_map_round_page(copy_size,
11376 			    VM_MAP_PAGE_MASK(map))),
11377 			    VM_MAP_PAGE_MASK(map)));
11378 			*addr = 0;
11379 		}
11380 	} else {
11381 		/* copy was successful, dicard the copy structure */
11382 		if (consume_on_success) {
11383 			kfree_data(copy->cpy_kdata, copy_size);
11384 			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11385 		}
11386 	}
11387 
11388 	return kr;
11389 }
11390 
11391 /*
11392  *	Routine:	vm_map_copy_insert      [internal use only]
11393  *
11394  *	Description:
11395  *		Link a copy chain ("copy") into a map at the
11396  *		specified location (after "where").
11397  *
11398  *		Callers of this function must call vm_map_copy_require on
11399  *		previously created vm_map_copy_t or pass a newly created
11400  *		one to ensure that it hasn't been forged.
11401  *	Side effects:
11402  *		The copy chain is destroyed.
11403  */
11404 static void
vm_map_copy_insert(vm_map_t map,vm_map_entry_t after_where,vm_map_copy_t copy)11405 vm_map_copy_insert(
11406 	vm_map_t        map,
11407 	vm_map_entry_t  after_where,
11408 	vm_map_copy_t   copy)
11409 {
11410 	vm_map_entry_t  entry;
11411 
11412 	while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11413 		entry = vm_map_copy_first_entry(copy);
11414 		vm_map_copy_entry_unlink(copy, entry);
11415 		vm_map_store_entry_link(map, after_where, entry,
11416 		    VM_MAP_KERNEL_FLAGS_NONE);
11417 		after_where = entry;
11418 	}
11419 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11420 }
11421 
11422 /*
11423  * Callers of this function must call vm_map_copy_require on
11424  * previously created vm_map_copy_t or pass a newly created
11425  * one to ensure that it hasn't been forged.
11426  */
11427 void
vm_map_copy_remap(vm_map_t map,vm_map_entry_t where,vm_map_copy_t copy,vm_map_offset_t adjustment,vm_prot_t cur_prot,vm_prot_t max_prot,vm_inherit_t inheritance)11428 vm_map_copy_remap(
11429 	vm_map_t        map,
11430 	vm_map_entry_t  where,
11431 	vm_map_copy_t   copy,
11432 	vm_map_offset_t adjustment,
11433 	vm_prot_t       cur_prot,
11434 	vm_prot_t       max_prot,
11435 	vm_inherit_t    inheritance)
11436 {
11437 	vm_map_entry_t  copy_entry, new_entry;
11438 
11439 	for (copy_entry = vm_map_copy_first_entry(copy);
11440 	    copy_entry != vm_map_copy_to_entry(copy);
11441 	    copy_entry = copy_entry->vme_next) {
11442 		/* get a new VM map entry for the map */
11443 		new_entry = vm_map_entry_create(map);
11444 		/* copy the "copy entry" to the new entry */
11445 		vm_map_entry_copy(map, new_entry, copy_entry);
11446 		/* adjust "start" and "end" */
11447 		new_entry->vme_start += adjustment;
11448 		new_entry->vme_end += adjustment;
11449 		/* clear some attributes */
11450 		new_entry->inheritance = inheritance;
11451 		new_entry->protection = cur_prot;
11452 		new_entry->max_protection = max_prot;
11453 		new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11454 		/* take an extra reference on the entry's "object" */
11455 		if (new_entry->is_sub_map) {
11456 			assert(!new_entry->use_pmap); /* not nested */
11457 			vm_map_reference(VME_SUBMAP(new_entry));
11458 		} else {
11459 			vm_object_reference(VME_OBJECT(new_entry));
11460 		}
11461 		/* insert the new entry in the map */
11462 		vm_map_store_entry_link(map, where, new_entry,
11463 		    VM_MAP_KERNEL_FLAGS_NONE);
11464 		/* continue inserting the "copy entries" after the new entry */
11465 		where = new_entry;
11466 	}
11467 }
11468 
11469 
11470 /*
11471  * Returns true if *size matches (or is in the range of) copy->size.
11472  * Upon returning true, the *size field is updated with the actual size of the
11473  * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11474  */
11475 boolean_t
vm_map_copy_validate_size(vm_map_t dst_map,vm_map_copy_t copy,vm_map_size_t * size)11476 vm_map_copy_validate_size(
11477 	vm_map_t                dst_map,
11478 	vm_map_copy_t           copy,
11479 	vm_map_size_t           *size)
11480 {
11481 	if (copy == VM_MAP_COPY_NULL) {
11482 		return FALSE;
11483 	}
11484 
11485 	/*
11486 	 * Assert that the vm_map_copy is coming from the right
11487 	 * zone and hasn't been forged
11488 	 */
11489 	vm_map_copy_require(copy);
11490 
11491 	vm_map_size_t copy_sz = copy->size;
11492 	vm_map_size_t sz = *size;
11493 	switch (copy->type) {
11494 	case VM_MAP_COPY_KERNEL_BUFFER:
11495 		if (sz == copy_sz) {
11496 			return TRUE;
11497 		}
11498 		break;
11499 	case VM_MAP_COPY_ENTRY_LIST:
11500 		/*
11501 		 * potential page-size rounding prevents us from exactly
11502 		 * validating this flavor of vm_map_copy, but we can at least
11503 		 * assert that it's within a range.
11504 		 */
11505 		if (copy_sz >= sz &&
11506 		    copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11507 			*size = copy_sz;
11508 			return TRUE;
11509 		}
11510 		break;
11511 	default:
11512 		break;
11513 	}
11514 	return FALSE;
11515 }
11516 
11517 static kern_return_t
vm_map_copyout_internal(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_ut copy_size_u,boolean_t consume_on_success,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)11518 vm_map_copyout_internal(
11519 	vm_map_t                dst_map,
11520 	vm_map_address_t       *dst_addr,      /* OUT */
11521 	vm_map_copy_t           copy,
11522 	vm_map_size_ut          copy_size_u,
11523 	boolean_t               consume_on_success,
11524 	vm_prot_t               cur_protection,
11525 	vm_prot_t               max_protection,
11526 	vm_inherit_t            inheritance)
11527 {
11528 	vm_map_size_t           size, copy_size;
11529 	vm_map_size_t           adjustment;
11530 	vm_map_offset_t         start;
11531 	vm_object_offset_t      vm_copy_start;
11532 	vm_map_entry_t          last;
11533 	vm_map_entry_t          entry;
11534 	vm_map_copy_t           original_copy;
11535 	kern_return_t           kr;
11536 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11537 
11538 	/*
11539 	 *	Check for null copy object.
11540 	 */
11541 
11542 	if (copy == VM_MAP_COPY_NULL) {
11543 		*dst_addr = 0;
11544 		return KERN_SUCCESS;
11545 	}
11546 
11547 	/*
11548 	 * Assert that the vm_map_copy is coming from the right
11549 	 * zone and hasn't been forged
11550 	 */
11551 	vm_map_copy_require(copy);
11552 
11553 	if (!VM_SANITIZE_UNSAFE_IS_EQUAL(copy_size_u, copy->size)) {
11554 		*dst_addr = 0;
11555 		ktriage_record(thread_tid(current_thread()),
11556 		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11557 		    KDBG_TRIAGE_RESERVED,
11558 		    KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR),
11559 		    KERN_FAILURE /* arg */);
11560 		return KERN_FAILURE;
11561 	}
11562 	copy_size = copy->size;
11563 
11564 	/*
11565 	 *	Check for special kernel buffer allocated
11566 	 *	by new_ipc_kmsg_copyin.
11567 	 */
11568 
11569 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11570 		kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11571 		    copy, copy_size, FALSE,
11572 		    consume_on_success);
11573 		if (kr) {
11574 			ktriage_record(thread_tid(current_thread()),
11575 			    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11576 			    KDBG_TRIAGE_RESERVED,
11577 			    KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
11578 		}
11579 		return kr;
11580 	}
11581 
11582 
11583 	original_copy = copy;
11584 	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11585 		vm_map_copy_t target_copy;
11586 		vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11587 
11588 		target_copy = VM_MAP_COPY_NULL;
11589 		DEBUG4K_ADJUST("adjusting...\n");
11590 		kr = vm_map_copy_adjust_to_target(
11591 			copy,
11592 			0, /* offset */
11593 			copy->size, /* size */
11594 			dst_map,
11595 			TRUE, /* copy */
11596 			&target_copy,
11597 			&overmap_start,
11598 			&overmap_end,
11599 			&trimmed_start);
11600 		if (kr != KERN_SUCCESS) {
11601 			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11602 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
11603 			return kr;
11604 		}
11605 		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11606 		if (target_copy != copy) {
11607 			copy = target_copy;
11608 		}
11609 		copy_size = copy->size;
11610 	}
11611 
11612 	/*
11613 	 *	Find space for the data
11614 	 */
11615 
11616 	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11617 	    VM_MAP_COPY_PAGE_MASK(copy));
11618 	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11619 	    VM_MAP_COPY_PAGE_MASK(copy))
11620 	    - vm_copy_start;
11621 
11622 	vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map, size);
11623 
11624 	vm_map_lock(dst_map);
11625 	kr = vm_map_locate_space_anywhere(dst_map, size, 0, vmk_flags,
11626 	    &start, &last);
11627 	if (kr != KERN_SUCCESS) {
11628 		vm_map_unlock(dst_map);
11629 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
11630 		return kr;
11631 	}
11632 
11633 	adjustment = start - vm_copy_start;
11634 	if (!consume_on_success) {
11635 		/*
11636 		 * We're not allowed to consume "copy", so we'll have to
11637 		 * copy its map entries into the destination map below.
11638 		 * No need to re-allocate map entries from the correct
11639 		 * (pageable or not) zone, since we'll get new map entries
11640 		 * during the transfer.
11641 		 * We'll also adjust the map entries's "start" and "end"
11642 		 * during the transfer, to keep "copy"'s entries consistent
11643 		 * with its "offset".
11644 		 */
11645 		goto after_adjustments;
11646 	}
11647 
11648 	/*
11649 	 *	Since we're going to just drop the map
11650 	 *	entries from the copy into the destination
11651 	 *	map, they must come from the same pool.
11652 	 */
11653 
11654 	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11655 		/*
11656 		 * Mismatches occur when dealing with the default
11657 		 * pager.
11658 		 */
11659 		vm_map_entry_t  next, new;
11660 
11661 		/*
11662 		 * Find the zone that the copies were allocated from
11663 		 */
11664 
11665 		entry = vm_map_copy_first_entry(copy);
11666 
11667 		/*
11668 		 * Reinitialize the copy so that vm_map_copy_entry_link
11669 		 * will work.
11670 		 */
11671 		vm_map_store_copy_reset(copy, entry);
11672 		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11673 
11674 		/*
11675 		 * Copy each entry.
11676 		 */
11677 		while (entry != vm_map_copy_to_entry(copy)) {
11678 			new = vm_map_copy_entry_create(copy);
11679 			vm_map_entry_copy_full(new, entry);
11680 			new->vme_no_copy_on_read = FALSE;
11681 			assert(!new->iokit_acct);
11682 			if (new->is_sub_map) {
11683 				/* clr address space specifics */
11684 				new->use_pmap = FALSE;
11685 			}
11686 			vm_map_copy_entry_link(copy,
11687 			    vm_map_copy_last_entry(copy),
11688 			    new);
11689 			next = entry->vme_next;
11690 			vm_map_entry_dispose(entry);
11691 			entry = next;
11692 		}
11693 	}
11694 
11695 	/*
11696 	 *	Adjust the addresses in the copy chain, and
11697 	 *	reset the region attributes.
11698 	 */
11699 
11700 	for (entry = vm_map_copy_first_entry(copy);
11701 	    entry != vm_map_copy_to_entry(copy);
11702 	    entry = entry->vme_next) {
11703 		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11704 			/*
11705 			 * We're injecting this copy entry into a map that
11706 			 * has the standard page alignment, so clear
11707 			 * "map_aligned" (which might have been inherited
11708 			 * from the original map entry).
11709 			 */
11710 			entry->map_aligned = FALSE;
11711 		}
11712 
11713 		entry->vme_start += adjustment;
11714 		entry->vme_end += adjustment;
11715 
11716 		if (entry->map_aligned) {
11717 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11718 			    VM_MAP_PAGE_MASK(dst_map)));
11719 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11720 			    VM_MAP_PAGE_MASK(dst_map)));
11721 		}
11722 
11723 		entry->inheritance = VM_INHERIT_DEFAULT;
11724 		entry->protection = VM_PROT_DEFAULT;
11725 		entry->max_protection = VM_PROT_ALL;
11726 		entry->behavior = VM_BEHAVIOR_DEFAULT;
11727 
11728 		/*
11729 		 * If the entry is now wired,
11730 		 * map the pages into the destination map.
11731 		 */
11732 		if (entry->wired_count != 0) {
11733 			vm_map_offset_t va;
11734 			vm_object_offset_t       offset;
11735 			vm_object_t object;
11736 			vm_prot_t prot;
11737 			int     type_of_fault;
11738 			uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
11739 
11740 			/* TODO4K would need to use actual page size */
11741 			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11742 
11743 			object = VME_OBJECT(entry);
11744 			offset = VME_OFFSET(entry);
11745 			va = entry->vme_start;
11746 
11747 			pmap_pageable(dst_map->pmap,
11748 			    entry->vme_start,
11749 			    entry->vme_end,
11750 			    TRUE);
11751 
11752 			while (va < entry->vme_end) {
11753 				vm_page_t       m;
11754 				struct vm_object_fault_info fault_info = {
11755 					.interruptible = THREAD_UNINT,
11756 				};
11757 
11758 				/*
11759 				 * Look up the page in the object.
11760 				 * Assert that the page will be found in the
11761 				 * top object:
11762 				 * either
11763 				 *	the object was newly created by
11764 				 *	vm_object_copy_slowly, and has
11765 				 *	copies of all of the pages from
11766 				 *	the source object
11767 				 * or
11768 				 *	the object was moved from the old
11769 				 *	map entry; because the old map
11770 				 *	entry was wired, all of the pages
11771 				 *	were in the top-level object.
11772 				 *	(XXX not true if we wire pages for
11773 				 *	 reading)
11774 				 */
11775 				vm_object_lock(object);
11776 
11777 				m = vm_page_lookup(object, offset);
11778 				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11779 				    m->vmp_absent) {
11780 					panic("vm_map_copyout: wiring %p", m);
11781 				}
11782 
11783 				prot = entry->protection;
11784 
11785 				if (override_nx(dst_map, VME_ALIAS(entry)) &&
11786 				    prot) {
11787 					prot |= VM_PROT_EXECUTE;
11788 				}
11789 
11790 				type_of_fault = DBG_CACHE_HIT_FAULT;
11791 
11792 				fault_info.user_tag = VME_ALIAS(entry);
11793 				fault_info.pmap_options = 0;
11794 				if (entry->iokit_acct ||
11795 				    (!entry->is_sub_map && !entry->use_pmap)) {
11796 					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11797 				}
11798 				if (entry->vme_xnu_user_debug &&
11799 				    !VM_PAGE_OBJECT(m)->code_signed) {
11800 					/*
11801 					 * Modified code-signed executable
11802 					 * region: this page does not belong
11803 					 * to a code-signed VM object, so it
11804 					 * must have been copied and should
11805 					 * therefore be typed XNU_USER_DEBUG
11806 					 * rather than XNU_USER_EXEC.
11807 					 */
11808 					fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
11809 				}
11810 
11811 				vm_fault_enter(m,
11812 				    dst_map->pmap,
11813 				    va,
11814 				    PAGE_SIZE, 0,
11815 				    prot,
11816 				    prot,
11817 				    VM_PAGE_WIRED(m),
11818 				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
11819 				    &fault_info,
11820 				    NULL,             /* need_retry */
11821 				    &type_of_fault,
11822 				    &object_lock_type); /*Exclusive mode lock. Will remain unchanged.*/
11823 
11824 				vm_object_unlock(object);
11825 
11826 				offset += PAGE_SIZE_64;
11827 				va += PAGE_SIZE;
11828 			}
11829 		}
11830 	}
11831 
11832 after_adjustments:
11833 
11834 	/*
11835 	 *	Correct the page alignment for the result
11836 	 */
11837 
11838 	*dst_addr = start + (copy->offset - vm_copy_start);
11839 
11840 #if KASAN
11841 	kasan_notify_address(*dst_addr, size);
11842 #endif
11843 
11844 	/*
11845 	 *	Update the hints and the map size
11846 	 */
11847 
11848 	if (consume_on_success) {
11849 		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11850 	} else {
11851 		SAVE_HINT_MAP_WRITE(dst_map, last);
11852 	}
11853 
11854 	dst_map->size += size;
11855 
11856 	/*
11857 	 *	Link in the copy
11858 	 */
11859 
11860 	if (consume_on_success) {
11861 		vm_map_copy_insert(dst_map, last, copy);
11862 		if (copy != original_copy) {
11863 			vm_map_copy_discard(original_copy);
11864 			original_copy = VM_MAP_COPY_NULL;
11865 		}
11866 	} else {
11867 		vm_map_copy_remap(dst_map, last, copy, adjustment,
11868 		    cur_protection, max_protection,
11869 		    inheritance);
11870 		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11871 			vm_map_copy_discard(copy);
11872 			copy = original_copy;
11873 		}
11874 	}
11875 
11876 
11877 	vm_map_unlock(dst_map);
11878 
11879 	/*
11880 	 * XXX	If wiring_required, call vm_map_pageable
11881 	 */
11882 
11883 	return KERN_SUCCESS;
11884 }
11885 
11886 /*
11887  *	Routine:	vm_map_copyout_size
11888  *
11889  *	Description:
11890  *		Copy out a copy chain ("copy") into newly-allocated
11891  *		space in the destination map. Uses a prevalidated
11892  *		size for the copy object (vm_map_copy_validate_size).
11893  *
11894  *		If successful, consumes the copy object.
11895  *		Otherwise, the caller is responsible for it.
11896  */
11897 kern_return_t
vm_map_copyout_size(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_ut copy_size)11898 vm_map_copyout_size(
11899 	vm_map_t                dst_map,
11900 	vm_map_address_t       *dst_addr,      /* OUT */
11901 	vm_map_copy_t           copy,
11902 	vm_map_size_ut          copy_size)
11903 {
11904 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11905 	           TRUE,                     /* consume_on_success */
11906 	           VM_PROT_DEFAULT,
11907 	           VM_PROT_ALL,
11908 	           VM_INHERIT_DEFAULT);
11909 }
11910 
11911 /*
11912  *	Routine:	vm_map_copyout
11913  *
11914  *	Description:
11915  *		Copy out a copy chain ("copy") into newly-allocated
11916  *		space in the destination map.
11917  *
11918  *		If successful, consumes the copy object.
11919  *		Otherwise, the caller is responsible for it.
11920  */
11921 kern_return_t
vm_map_copyout(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy)11922 vm_map_copyout(
11923 	vm_map_t                dst_map,
11924 	vm_map_address_t       *dst_addr,      /* OUT */
11925 	vm_map_copy_t           copy)
11926 {
11927 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11928 	           TRUE,                     /* consume_on_success */
11929 	           VM_PROT_DEFAULT,
11930 	           VM_PROT_ALL,
11931 	           VM_INHERIT_DEFAULT);
11932 }
11933 
11934 /*
11935  *	Routine:	vm_map_copyin
11936  *
11937  *	Description:
11938  *		see vm_map_copyin_common.  Exported via Unsupported.exports.
11939  *
11940  */
11941 kern_return_t
vm_map_copyin(vm_map_t src_map,vm_map_address_ut src_addr,vm_map_size_ut len,boolean_t src_destroy,vm_map_copy_t * copy_result)11942 vm_map_copyin(
11943 	vm_map_t                src_map,
11944 	vm_map_address_ut       src_addr,
11945 	vm_map_size_ut          len,
11946 	boolean_t               src_destroy,
11947 	vm_map_copy_t          *copy_result)   /* OUT */
11948 {
11949 	return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11950 	           FALSE, copy_result, FALSE);
11951 }
11952 
11953 /*
11954  *	Routine:	vm_map_copyin_common
11955  *
11956  *	Description:
11957  *		Copy the specified region (src_addr, len) from the
11958  *		source address space (src_map), possibly removing
11959  *		the region from the source address space (src_destroy).
11960  *
11961  *	Returns:
11962  *		A vm_map_copy_t object (copy_result), suitable for
11963  *		insertion into another address space (using vm_map_copyout),
11964  *		copying over another address space region (using
11965  *		vm_map_copy_overwrite).  If the copy is unused, it
11966  *		should be destroyed (using vm_map_copy_discard).
11967  *
11968  *	In/out conditions:
11969  *		The source map should not be locked on entry.
11970  */
11971 
11972 typedef struct submap_map {
11973 	vm_map_t        parent_map;
11974 	vm_map_offset_t base_start;
11975 	vm_map_offset_t base_end;
11976 	vm_map_size_t   base_len;
11977 	struct submap_map *next;
11978 } submap_map_t;
11979 
11980 kern_return_t
vm_map_copyin_common(vm_map_t src_map,vm_map_address_ut src_addr,vm_map_size_ut len,boolean_t src_destroy,__unused boolean_t src_volatile,vm_map_copy_t * copy_result,boolean_t use_maxprot)11981 vm_map_copyin_common(
11982 	vm_map_t                src_map,
11983 	vm_map_address_ut       src_addr,
11984 	vm_map_size_ut          len,
11985 	boolean_t               src_destroy,
11986 	__unused boolean_t      src_volatile,
11987 	vm_map_copy_t          *copy_result,   /* OUT */
11988 	boolean_t               use_maxprot)
11989 {
11990 	int flags;
11991 
11992 	flags = 0;
11993 	if (src_destroy) {
11994 		flags |= VM_MAP_COPYIN_SRC_DESTROY;
11995 	}
11996 	if (use_maxprot) {
11997 		flags |= VM_MAP_COPYIN_USE_MAXPROT;
11998 	}
11999 	return vm_map_copyin_internal(src_map,
12000 	           src_addr,
12001 	           len,
12002 	           flags,
12003 	           copy_result);
12004 }
12005 
12006 static __attribute__((always_inline, warn_unused_result))
12007 kern_return_t
vm_map_copyin_sanitize(vm_map_t src_map,vm_map_address_ut src_addr_u,vm_map_size_ut len_u,vm_map_offset_t * src_start,vm_map_offset_t * src_end,vm_map_size_t * len,vm_map_offset_t * src_addr_unaligned)12008 vm_map_copyin_sanitize(
12009 	vm_map_t                src_map,
12010 	vm_map_address_ut       src_addr_u,
12011 	vm_map_size_ut          len_u,
12012 	vm_map_offset_t        *src_start,
12013 	vm_map_offset_t        *src_end,
12014 	vm_map_size_t          *len,
12015 	vm_map_offset_t        *src_addr_unaligned)
12016 {
12017 	kern_return_t   kr;
12018 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS |
12019 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
12020 	    VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
12021 
12022 #if KASAN_TBI
12023 	if (vm_kernel_map_is_kernel(src_map)) {
12024 		flags |= VM_SANITIZE_FLAGS_CANONICALIZE;
12025 	}
12026 #endif /* KASAN_TBI */
12027 
12028 	kr = vm_sanitize_addr_size(src_addr_u, len_u,
12029 	    VM_SANITIZE_CALLER_VM_MAP_COPYIN,
12030 	    src_map,
12031 	    flags,
12032 	    src_start, src_end, len);
12033 	if (__improbable(kr != KERN_SUCCESS)) {
12034 		return kr;
12035 	}
12036 
12037 	/*
12038 	 *	Compute (page aligned) start and end of region
12039 	 */
12040 	*src_addr_unaligned  = *src_start; /* remember unaligned value */
12041 	*src_start = vm_map_trunc_page(*src_addr_unaligned,
12042 	    VM_MAP_PAGE_MASK(src_map));
12043 	*src_end   = vm_map_round_page(*src_end, VM_MAP_PAGE_MASK(src_map));
12044 	return KERN_SUCCESS;
12045 }
12046 
12047 kern_return_t
vm_map_copyin_internal(vm_map_t src_map,vm_map_address_ut src_addr_u,vm_map_size_ut len_u,int flags,vm_map_copy_t * copy_result)12048 vm_map_copyin_internal(
12049 	vm_map_t                src_map,
12050 	vm_map_address_ut       src_addr_u,
12051 	vm_map_size_ut          len_u,
12052 	int                     flags,
12053 	vm_map_copy_t          *copy_result)   /* OUT */
12054 {
12055 	vm_map_entry_t  tmp_entry;      /* Result of last map lookup --
12056 	                                 * in multi-level lookup, this
12057 	                                 * entry contains the actual
12058 	                                 * vm_object/offset.
12059 	                                 */
12060 	vm_map_entry_t  new_entry = VM_MAP_ENTRY_NULL;  /* Map entry for copy */
12061 
12062 	vm_map_offset_t src_start;      /* Start of current entry --
12063 	                                 * where copy is taking place now
12064 	                                 */
12065 	vm_map_offset_t src_end;        /* End of entire region to be
12066 	                                 * copied */
12067 	vm_map_offset_t src_addr_unaligned;
12068 	vm_map_offset_t src_base;
12069 	vm_map_size_t   len;
12070 	vm_map_t        base_map = src_map;
12071 	boolean_t       map_share = FALSE;
12072 	submap_map_t    *parent_maps = NULL;
12073 
12074 	vm_map_copy_t   copy;           /* Resulting copy */
12075 	vm_map_address_t copy_addr;
12076 	vm_map_size_t   copy_size;
12077 	boolean_t       src_destroy;
12078 	boolean_t       use_maxprot;
12079 	boolean_t       preserve_purgeable;
12080 	boolean_t       entry_was_shared;
12081 	vm_map_entry_t  saved_src_entry;
12082 	kern_return_t   kr;
12083 
12084 	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
12085 		return KERN_INVALID_ARGUMENT;
12086 	}
12087 
12088 	/*
12089 	 *	Check for copies of zero bytes.
12090 	 */
12091 	if (VM_SANITIZE_UNSAFE_IS_ZERO(len_u)) {
12092 		*copy_result = VM_MAP_COPY_NULL;
12093 		return KERN_SUCCESS;
12094 	}
12095 
12096 	/*
12097 	 * Sanitize any input parameters that are addr/size/prot/inherit
12098 	 */
12099 	kr = vm_map_copyin_sanitize(
12100 		src_map,
12101 		src_addr_u,
12102 		len_u,
12103 		&src_start,
12104 		&src_end,
12105 		&len,
12106 		&src_addr_unaligned);
12107 	if (__improbable(kr != KERN_SUCCESS)) {
12108 		return vm_sanitize_get_kr(kr);
12109 	}
12110 
12111 
12112 	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
12113 	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
12114 	preserve_purgeable =
12115 	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
12116 
12117 	/*
12118 	 * If the copy is sufficiently small, use a kernel buffer instead
12119 	 * of making a virtual copy.  The theory being that the cost of
12120 	 * setting up VM (and taking C-O-W faults) dominates the copy costs
12121 	 * for small regions.
12122 	 */
12123 	if ((len <= msg_ool_size_small) &&
12124 	    !use_maxprot &&
12125 	    !preserve_purgeable &&
12126 	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
12127 	    /*
12128 	     * Since the "msg_ool_size_small" threshold was increased and
12129 	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
12130 	     * address space limits, we revert to doing a virtual copy if the
12131 	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
12132 	     * of the commpage would now fail when it used to work.
12133 	     */
12134 	    (src_start >= vm_map_min(src_map) &&
12135 	    src_start < vm_map_max(src_map) &&
12136 	    src_end >= vm_map_min(src_map) &&
12137 	    src_end < vm_map_max(src_map))) {
12138 		return vm_map_copyin_kernel_buffer(src_map, src_addr_unaligned, len,
12139 		           src_destroy, copy_result);
12140 	}
12141 
12142 	/*
12143 	 *	Allocate a header element for the list.
12144 	 *
12145 	 *	Use the start and end in the header to
12146 	 *	remember the endpoints prior to rounding.
12147 	 */
12148 
12149 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12150 	copy->cpy_hdr.entries_pageable = TRUE;
12151 	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
12152 	copy->offset = src_addr_unaligned;
12153 	copy->size = len;
12154 
12155 	new_entry = vm_map_copy_entry_create(copy);
12156 
12157 #define RETURN(x)                                               \
12158 	MACRO_BEGIN                                             \
12159 	vm_map_unlock(src_map);                                 \
12160 	if(src_map != base_map)                                 \
12161 	        vm_map_deallocate(src_map);                     \
12162 	if (new_entry != VM_MAP_ENTRY_NULL)                     \
12163 	        vm_map_copy_entry_dispose(new_entry);           \
12164 	vm_map_copy_discard(copy);                              \
12165 	{                                                       \
12166 	        submap_map_t	*_ptr;                          \
12167                                                                 \
12168 	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
12169 	                parent_maps=parent_maps->next;          \
12170 	                if (_ptr->parent_map != base_map)       \
12171 	                        vm_map_deallocate(_ptr->parent_map);    \
12172 	                kfree_type(submap_map_t, _ptr);         \
12173 	        }                                               \
12174 	}                                                       \
12175 	MACRO_RETURN(x);                                        \
12176 	MACRO_END
12177 
12178 	/*
12179 	 *	Find the beginning of the region.
12180 	 */
12181 
12182 	vm_map_lock(src_map);
12183 
12184 	/*
12185 	 * Lookup the original "src_addr_unaligned" rather than the truncated
12186 	 * "src_start", in case "src_start" falls in a non-map-aligned
12187 	 * map entry *before* the map entry that contains "src_addr_unaligned"...
12188 	 */
12189 	if (!vm_map_lookup_entry(src_map, src_addr_unaligned, &tmp_entry)) {
12190 		RETURN(KERN_INVALID_ADDRESS);
12191 	}
12192 	if (!tmp_entry->is_sub_map) {
12193 		/*
12194 		 * ... but clip to the map-rounded "src_start" rather than
12195 		 * "src_addr_unaligned" to preserve map-alignment.  We'll adjust the
12196 		 * first copy entry at the end, if needed.
12197 		 */
12198 		vm_map_clip_start(src_map, tmp_entry, src_start);
12199 	}
12200 	if (src_start < tmp_entry->vme_start) {
12201 		/*
12202 		 * Move "src_start" up to the start of the
12203 		 * first map entry to copy.
12204 		 */
12205 		src_start = tmp_entry->vme_start;
12206 	}
12207 	/* set for later submap fix-up */
12208 	copy_addr = src_start;
12209 
12210 	/*
12211 	 *	Go through entries until we get to the end.
12212 	 */
12213 
12214 	while (TRUE) {
12215 		vm_map_entry_t  src_entry = tmp_entry;  /* Top-level entry */
12216 		vm_map_size_t   src_size;               /* Size of source
12217 		                                         * map entry (in both
12218 		                                         * maps)
12219 		                                         */
12220 
12221 		vm_object_t             src_object;     /* Object to copy */
12222 		vm_object_offset_t      src_offset;
12223 
12224 		vm_object_t             new_copy_object;/* vm_object_copy_* result */
12225 
12226 		boolean_t       src_needs_copy;         /* Should source map
12227 		                                         * be made read-only
12228 		                                         * for copy-on-write?
12229 		                                         */
12230 
12231 		boolean_t       new_entry_needs_copy;   /* Will new entry be COW? */
12232 
12233 		boolean_t       was_wired;              /* Was source wired? */
12234 		boolean_t       saved_used_for_jit;     /* Saved used_for_jit. */
12235 		vm_map_version_t version;               /* Version before locks
12236 		                                         * dropped to make copy
12237 		                                         */
12238 		kern_return_t   result;                 /* Return value from
12239 		                                         * copy_strategically.
12240 		                                         */
12241 		while (tmp_entry->is_sub_map) {
12242 			vm_map_size_t submap_len;
12243 			submap_map_t *ptr;
12244 
12245 			ptr = kalloc_type(submap_map_t, Z_WAITOK);
12246 			ptr->next = parent_maps;
12247 			parent_maps = ptr;
12248 			ptr->parent_map = src_map;
12249 			ptr->base_start = src_start;
12250 			ptr->base_end = src_end;
12251 			submap_len = tmp_entry->vme_end - src_start;
12252 			if (submap_len > (src_end - src_start)) {
12253 				submap_len = src_end - src_start;
12254 			}
12255 			ptr->base_len = submap_len;
12256 
12257 			src_start -= tmp_entry->vme_start;
12258 			src_start += VME_OFFSET(tmp_entry);
12259 			src_end = src_start + submap_len;
12260 			src_map = VME_SUBMAP(tmp_entry);
12261 			vm_map_lock(src_map);
12262 			/* keep an outstanding reference for all maps in */
12263 			/* the parents tree except the base map */
12264 			vm_map_reference(src_map);
12265 			vm_map_unlock(ptr->parent_map);
12266 			if (!vm_map_lookup_entry(
12267 				    src_map, src_start, &tmp_entry)) {
12268 				RETURN(KERN_INVALID_ADDRESS);
12269 			}
12270 			map_share = TRUE;
12271 			if (!tmp_entry->is_sub_map) {
12272 				vm_map_clip_start(src_map, tmp_entry, src_start);
12273 			}
12274 			src_entry = tmp_entry;
12275 		}
12276 		/* we are now in the lowest level submap... */
12277 
12278 		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
12279 		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
12280 			/* This is not, supported for now.In future */
12281 			/* we will need to detect the phys_contig   */
12282 			/* condition and then upgrade copy_slowly   */
12283 			/* to do physical copy from the device mem  */
12284 			/* based object. We can piggy-back off of   */
12285 			/* the was wired boolean to set-up the      */
12286 			/* proper handling */
12287 			RETURN(KERN_PROTECTION_FAILURE);
12288 		}
12289 		/*
12290 		 *	Create a new address map entry to hold the result.
12291 		 *	Fill in the fields from the appropriate source entries.
12292 		 *	We must unlock the source map to do this if we need
12293 		 *	to allocate a map entry.
12294 		 */
12295 		if (new_entry == VM_MAP_ENTRY_NULL) {
12296 			version.main_timestamp = src_map->timestamp;
12297 			vm_map_unlock(src_map);
12298 
12299 			new_entry = vm_map_copy_entry_create(copy);
12300 
12301 			vm_map_lock(src_map);
12302 			if ((version.main_timestamp + 1) != src_map->timestamp) {
12303 				if (!vm_map_lookup_entry(src_map, src_start,
12304 				    &tmp_entry)) {
12305 					RETURN(KERN_INVALID_ADDRESS);
12306 				}
12307 				if (!tmp_entry->is_sub_map) {
12308 					vm_map_clip_start(src_map, tmp_entry, src_start);
12309 				}
12310 				continue; /* restart w/ new tmp_entry */
12311 			}
12312 		}
12313 
12314 		/*
12315 		 *	Verify that the region can be read.
12316 		 */
12317 		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
12318 		    !use_maxprot) ||
12319 		    (src_entry->max_protection & VM_PROT_READ) == 0) {
12320 			RETURN(KERN_PROTECTION_FAILURE);
12321 		}
12322 
12323 		src_object = VME_OBJECT(src_entry);
12324 
12325 
12326 		/*
12327 		 *	Clip against the endpoints of the entire region.
12328 		 */
12329 
12330 		vm_map_clip_end(src_map, src_entry, src_end);
12331 
12332 		src_size = src_entry->vme_end - src_start;
12333 		src_offset = VME_OFFSET(src_entry);
12334 		was_wired = (src_entry->wired_count != 0);
12335 
12336 		vm_map_entry_copy(src_map, new_entry, src_entry);
12337 		if (new_entry->is_sub_map) {
12338 			/* clr address space specifics */
12339 			new_entry->use_pmap = FALSE;
12340 		} else {
12341 			/*
12342 			 * We're dealing with a copy-on-write operation,
12343 			 * so the resulting mapping should not inherit the
12344 			 * original mapping's accounting settings.
12345 			 * "iokit_acct" should have been cleared in
12346 			 * vm_map_entry_copy().
12347 			 * "use_pmap" should be reset to its default (TRUE)
12348 			 * so that the new mapping gets accounted for in
12349 			 * the task's memory footprint.
12350 			 */
12351 			assert(!new_entry->iokit_acct);
12352 			new_entry->use_pmap = TRUE;
12353 		}
12354 
12355 		/*
12356 		 *	Attempt non-blocking copy-on-write optimizations.
12357 		 */
12358 
12359 		/*
12360 		 * If we are destroying the source, and the object
12361 		 * is internal, we could move the object reference
12362 		 * from the source to the copy.  The copy is
12363 		 * copy-on-write only if the source is.
12364 		 * We make another reference to the object, because
12365 		 * destroying the source entry will deallocate it.
12366 		 *
12367 		 * This memory transfer has to be atomic, (to prevent
12368 		 * the VM object from being shared or copied while
12369 		 * it's being moved here), so we could only do this
12370 		 * if we won't have to unlock the VM map until the
12371 		 * original mapping has been fully removed.
12372 		 */
12373 
12374 RestartCopy:
12375 		if ((src_object == VM_OBJECT_NULL ||
12376 		    (!was_wired && !map_share && !tmp_entry->is_shared
12377 		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
12378 		    vm_object_copy_quickly(
12379 			    VME_OBJECT(new_entry),
12380 			    src_offset,
12381 			    src_size,
12382 			    &src_needs_copy,
12383 			    &new_entry_needs_copy)) {
12384 			new_entry->needs_copy = new_entry_needs_copy;
12385 
12386 			/*
12387 			 *	Handle copy-on-write obligations
12388 			 */
12389 
12390 			if (src_needs_copy && !tmp_entry->needs_copy) {
12391 				vm_prot_t prot;
12392 
12393 				prot = src_entry->protection & ~VM_PROT_WRITE;
12394 
12395 				if (override_nx(src_map, VME_ALIAS(src_entry))
12396 				    && prot) {
12397 					prot |= VM_PROT_EXECUTE;
12398 				}
12399 
12400 				vm_object_pmap_protect(
12401 					src_object,
12402 					src_offset,
12403 					src_size,
12404 					(src_entry->is_shared ?
12405 					PMAP_NULL
12406 					: src_map->pmap),
12407 					VM_MAP_PAGE_SIZE(src_map),
12408 					src_entry->vme_start,
12409 					prot);
12410 
12411 				assert(tmp_entry->wired_count == 0);
12412 				tmp_entry->needs_copy = TRUE;
12413 			}
12414 
12415 			/*
12416 			 *	The map has never been unlocked, so it's safe
12417 			 *	to move to the next entry rather than doing
12418 			 *	another lookup.
12419 			 */
12420 
12421 			goto CopySuccessful;
12422 		}
12423 
12424 		entry_was_shared = tmp_entry->is_shared;
12425 
12426 		/*
12427 		 *	Take an object reference, so that we may
12428 		 *	release the map lock(s).
12429 		 */
12430 
12431 		assert(src_object != VM_OBJECT_NULL);
12432 		vm_object_reference(src_object);
12433 
12434 		/*
12435 		 *	Record the timestamp for later verification.
12436 		 *	Unlock the map.
12437 		 */
12438 
12439 		version.main_timestamp = src_map->timestamp;
12440 		vm_map_unlock(src_map); /* Increments timestamp once! */
12441 		saved_src_entry = src_entry;
12442 		tmp_entry = VM_MAP_ENTRY_NULL;
12443 		src_entry = VM_MAP_ENTRY_NULL;
12444 
12445 		/*
12446 		 *	Perform the copy
12447 		 */
12448 
12449 		if (was_wired ||
12450 		    (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
12451 		    !(flags & VM_MAP_COPYIN_FORK)) ||
12452 		    (debug4k_no_cow_copyin &&
12453 		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12454 CopySlowly:
12455 			vm_object_lock(src_object);
12456 			result = vm_object_copy_slowly(
12457 				src_object,
12458 				src_offset,
12459 				src_size,
12460 				THREAD_UNINT,
12461 				&new_copy_object);
12462 			/* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12463 			saved_used_for_jit = new_entry->used_for_jit;
12464 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12465 			new_entry->used_for_jit = saved_used_for_jit;
12466 			VME_OFFSET_SET(new_entry,
12467 			    src_offset - vm_object_trunc_page(src_offset));
12468 			new_entry->needs_copy = FALSE;
12469 		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12470 		    (entry_was_shared || map_share)) {
12471 			vm_object_t new_object;
12472 
12473 			vm_object_lock_shared(src_object);
12474 			new_object = vm_object_copy_delayed(
12475 				src_object,
12476 				src_offset,
12477 				src_size,
12478 				TRUE);
12479 			if (new_object == VM_OBJECT_NULL) {
12480 				goto CopySlowly;
12481 			}
12482 
12483 			VME_OBJECT_SET(new_entry, new_object, false, 0);
12484 			assert(new_entry->wired_count == 0);
12485 			new_entry->needs_copy = TRUE;
12486 			assert(!new_entry->iokit_acct);
12487 			assert(new_object->purgable == VM_PURGABLE_DENY);
12488 			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12489 			result = KERN_SUCCESS;
12490 		} else {
12491 			vm_object_offset_t new_offset;
12492 			new_offset = VME_OFFSET(new_entry);
12493 			result = vm_object_copy_strategically(src_object,
12494 			    src_offset,
12495 			    src_size,
12496 			    (flags & VM_MAP_COPYIN_FORK),
12497 			    &new_copy_object,
12498 			    &new_offset,
12499 			    &new_entry_needs_copy);
12500 			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12501 			saved_used_for_jit = new_entry->used_for_jit;
12502 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12503 			new_entry->used_for_jit = saved_used_for_jit;
12504 			if (new_offset != VME_OFFSET(new_entry)) {
12505 				VME_OFFSET_SET(new_entry, new_offset);
12506 			}
12507 
12508 			new_entry->needs_copy = new_entry_needs_copy;
12509 		}
12510 
12511 		if (result == KERN_SUCCESS &&
12512 		    ((preserve_purgeable &&
12513 		    src_object->purgable != VM_PURGABLE_DENY) ||
12514 		    new_entry->used_for_jit)) {
12515 			/*
12516 			 * Purgeable objects should be COPY_NONE, true share;
12517 			 * this should be propogated to the copy.
12518 			 *
12519 			 * Also force mappings the pmap specially protects to
12520 			 * be COPY_NONE; trying to COW these mappings would
12521 			 * change the effective protections, which could have
12522 			 * side effects if the pmap layer relies on the
12523 			 * specified protections.
12524 			 */
12525 
12526 			vm_object_t     new_object;
12527 
12528 			new_object = VME_OBJECT(new_entry);
12529 			assert(new_object != src_object);
12530 			vm_object_lock(new_object);
12531 			assert(os_ref_get_count_raw(&new_object->ref_count) == 1);
12532 			assert(new_object->shadow == VM_OBJECT_NULL);
12533 			assert(new_object->vo_copy == VM_OBJECT_NULL);
12534 			assert(new_object->vo_owner == NULL);
12535 
12536 			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12537 
12538 			if (preserve_purgeable &&
12539 			    src_object->purgable != VM_PURGABLE_DENY) {
12540 				VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE);
12541 
12542 				/* start as non-volatile with no owner... */
12543 				VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE);
12544 				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12545 				/* ... and move to src_object's purgeable state */
12546 				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12547 					int state;
12548 					state = src_object->purgable;
12549 					vm_object_purgable_control(
12550 						new_object,
12551 						VM_PURGABLE_SET_STATE_FROM_KERNEL,
12552 						&state);
12553 				}
12554 				/* no pmap accounting for purgeable objects */
12555 				new_entry->use_pmap = FALSE;
12556 			}
12557 
12558 			vm_object_unlock(new_object);
12559 			new_object = VM_OBJECT_NULL;
12560 		}
12561 
12562 		/*
12563 		 *	Throw away the extra reference
12564 		 */
12565 
12566 		vm_object_deallocate(src_object);
12567 
12568 		if (result != KERN_SUCCESS &&
12569 		    result != KERN_MEMORY_RESTART_COPY) {
12570 			vm_map_lock(src_map);
12571 			RETURN(result);
12572 		}
12573 
12574 		/*
12575 		 *	Verify that the map has not substantially
12576 		 *	changed while the copy was being made.
12577 		 */
12578 
12579 		vm_map_lock(src_map);
12580 
12581 		if ((version.main_timestamp + 1) == src_map->timestamp) {
12582 			/* src_map hasn't changed: src_entry is still valid */
12583 			src_entry = saved_src_entry;
12584 			goto VerificationSuccessful;
12585 		}
12586 
12587 		/*
12588 		 *	Simple version comparison failed.
12589 		 *
12590 		 *	Retry the lookup and verify that the
12591 		 *	same object/offset are still present.
12592 		 *
12593 		 *	[Note: a memory manager that colludes with
12594 		 *	the calling task can detect that we have
12595 		 *	cheated.  While the map was unlocked, the
12596 		 *	mapping could have been changed and restored.]
12597 		 */
12598 
12599 		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12600 			if (result != KERN_MEMORY_RESTART_COPY) {
12601 				vm_object_deallocate(VME_OBJECT(new_entry));
12602 				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12603 				/* reset accounting state */
12604 				new_entry->iokit_acct = FALSE;
12605 				new_entry->use_pmap = TRUE;
12606 			}
12607 			RETURN(KERN_INVALID_ADDRESS);
12608 		}
12609 
12610 		src_entry = tmp_entry;
12611 		vm_map_clip_start(src_map, src_entry, src_start);
12612 
12613 		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12614 		    !use_maxprot) ||
12615 		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12616 			goto VerificationFailed;
12617 		}
12618 
12619 		if (src_entry->vme_end < new_entry->vme_end) {
12620 			/*
12621 			 * This entry might have been shortened
12622 			 * (vm_map_clip_end) or been replaced with
12623 			 * an entry that ends closer to "src_start"
12624 			 * than before.
12625 			 * Adjust "new_entry" accordingly; copying
12626 			 * less memory would be correct but we also
12627 			 * redo the copy (see below) if the new entry
12628 			 * no longer points at the same object/offset.
12629 			 */
12630 			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12631 			    VM_MAP_COPY_PAGE_MASK(copy)));
12632 			new_entry->vme_end = src_entry->vme_end;
12633 			src_size = new_entry->vme_end - src_start;
12634 		} else if (src_entry->vme_end > new_entry->vme_end) {
12635 			/*
12636 			 * This entry might have been extended
12637 			 * (vm_map_entry_simplify() or coalesce)
12638 			 * or been replaced with an entry that ends farther
12639 			 * from "src_start" than before.
12640 			 *
12641 			 * We've called vm_object_copy_*() only on
12642 			 * the previous <start:end> range, so we can't
12643 			 * just extend new_entry.  We have to re-do
12644 			 * the copy based on the new entry as if it was
12645 			 * pointing at a different object/offset (see
12646 			 * "Verification failed" below).
12647 			 */
12648 		}
12649 
12650 		if ((VME_OBJECT(src_entry) != src_object) ||
12651 		    (VME_OFFSET(src_entry) != src_offset) ||
12652 		    (src_entry->vme_end > new_entry->vme_end)) {
12653 			/*
12654 			 *	Verification failed.
12655 			 *
12656 			 *	Start over with this top-level entry.
12657 			 */
12658 
12659 VerificationFailed:     ;
12660 
12661 			vm_object_deallocate(VME_OBJECT(new_entry));
12662 			tmp_entry = src_entry;
12663 			continue;
12664 		}
12665 
12666 		/*
12667 		 *	Verification succeeded.
12668 		 */
12669 
12670 VerificationSuccessful:;
12671 
12672 		if (result == KERN_MEMORY_RESTART_COPY) {
12673 			goto RestartCopy;
12674 		}
12675 
12676 		/*
12677 		 *	Copy succeeded.
12678 		 */
12679 
12680 CopySuccessful: ;
12681 
12682 		/*
12683 		 *	Link in the new copy entry.
12684 		 */
12685 
12686 		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12687 		    new_entry);
12688 
12689 		/*
12690 		 *	Determine whether the entire region
12691 		 *	has been copied.
12692 		 */
12693 		src_base = src_start;
12694 		src_start = new_entry->vme_end;
12695 		new_entry = VM_MAP_ENTRY_NULL;
12696 		while ((src_start >= src_end) && (src_end != 0)) {
12697 			submap_map_t    *ptr;
12698 
12699 			if (src_map == base_map) {
12700 				/* back to the top */
12701 				break;
12702 			}
12703 
12704 			ptr = parent_maps;
12705 			assert(ptr != NULL);
12706 			parent_maps = parent_maps->next;
12707 
12708 			/* fix up the damage we did in that submap */
12709 			vm_map_simplify_range(src_map,
12710 			    src_base,
12711 			    src_end);
12712 
12713 			vm_map_unlock(src_map);
12714 			vm_map_deallocate(src_map);
12715 			vm_map_lock(ptr->parent_map);
12716 			src_map = ptr->parent_map;
12717 			src_base = ptr->base_start;
12718 			src_start = ptr->base_start + ptr->base_len;
12719 			src_end = ptr->base_end;
12720 			if (!vm_map_lookup_entry(src_map,
12721 			    src_start,
12722 			    &tmp_entry) &&
12723 			    (src_end > src_start)) {
12724 				RETURN(KERN_INVALID_ADDRESS);
12725 			}
12726 			kfree_type(submap_map_t, ptr);
12727 			if (parent_maps == NULL) {
12728 				map_share = FALSE;
12729 			}
12730 			src_entry = tmp_entry->vme_prev;
12731 		}
12732 
12733 		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12734 		    (src_start >= src_addr_unaligned + len) &&
12735 		    (src_addr_unaligned + len != 0)) {
12736 			/*
12737 			 * Stop copying now, even though we haven't reached
12738 			 * "src_end".  We'll adjust the end of the last copy
12739 			 * entry at the end, if needed.
12740 			 *
12741 			 * If src_map's aligment is different from the
12742 			 * system's page-alignment, there could be
12743 			 * extra non-map-aligned map entries between
12744 			 * the original (non-rounded) "src_addr_unaligned + len"
12745 			 * and the rounded "src_end".
12746 			 * We do not want to copy those map entries since
12747 			 * they're not part of the copied range.
12748 			 */
12749 			break;
12750 		}
12751 
12752 		if ((src_start >= src_end) && (src_end != 0)) {
12753 			break;
12754 		}
12755 
12756 		/*
12757 		 *	Verify that there are no gaps in the region
12758 		 */
12759 
12760 		tmp_entry = src_entry->vme_next;
12761 		if ((tmp_entry->vme_start != src_start) ||
12762 		    (tmp_entry == vm_map_to_entry(src_map))) {
12763 			RETURN(KERN_INVALID_ADDRESS);
12764 		}
12765 	}
12766 
12767 	/*
12768 	 * If the source should be destroyed, do it now, since the
12769 	 * copy was successful.
12770 	 */
12771 	if (src_destroy) {
12772 		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12773 
12774 		if (src_map == kernel_map) {
12775 			remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12776 		}
12777 		(void)vm_map_remove_and_unlock(src_map,
12778 		    vm_map_trunc_page(src_addr_unaligned, VM_MAP_PAGE_MASK(src_map)),
12779 		    src_end,
12780 		    remove_flags,
12781 		    KMEM_GUARD_NONE);
12782 	} else {
12783 		/* fix up the damage we did in the base map */
12784 		vm_map_simplify_range(
12785 			src_map,
12786 			vm_map_trunc_page(src_addr_unaligned,
12787 			VM_MAP_PAGE_MASK(src_map)),
12788 			vm_map_round_page(src_end,
12789 			VM_MAP_PAGE_MASK(src_map)));
12790 		vm_map_unlock(src_map);
12791 	}
12792 
12793 	tmp_entry = VM_MAP_ENTRY_NULL;
12794 
12795 	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12796 	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12797 		vm_map_offset_t original_start, original_offset, original_end;
12798 
12799 		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12800 
12801 		/* adjust alignment of first copy_entry's "vme_start" */
12802 		tmp_entry = vm_map_copy_first_entry(copy);
12803 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12804 			vm_map_offset_t adjustment;
12805 
12806 			original_start = tmp_entry->vme_start;
12807 			original_offset = VME_OFFSET(tmp_entry);
12808 
12809 			/* map-align the start of the first copy entry... */
12810 			adjustment = (tmp_entry->vme_start -
12811 			    vm_map_trunc_page(
12812 				    tmp_entry->vme_start,
12813 				    VM_MAP_PAGE_MASK(src_map)));
12814 			tmp_entry->vme_start -= adjustment;
12815 			VME_OFFSET_SET(tmp_entry,
12816 			    VME_OFFSET(tmp_entry) - adjustment);
12817 			copy_addr -= adjustment;
12818 			assert(tmp_entry->vme_start < tmp_entry->vme_end);
12819 			/* ... adjust for mis-aligned start of copy range */
12820 			adjustment =
12821 			    (vm_map_trunc_page(copy->offset,
12822 			    PAGE_MASK) -
12823 			    vm_map_trunc_page(copy->offset,
12824 			    VM_MAP_PAGE_MASK(src_map)));
12825 			if (adjustment) {
12826 				assert(page_aligned(adjustment));
12827 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12828 				tmp_entry->vme_start += adjustment;
12829 				VME_OFFSET_SET(tmp_entry,
12830 				    (VME_OFFSET(tmp_entry) +
12831 				    adjustment));
12832 				copy_addr += adjustment;
12833 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12834 			}
12835 
12836 			/*
12837 			 * Assert that the adjustments haven't exposed
12838 			 * more than was originally copied...
12839 			 */
12840 			assert(tmp_entry->vme_start >= original_start);
12841 			assert(VME_OFFSET(tmp_entry) >= original_offset);
12842 			/*
12843 			 * ... and that it did not adjust outside of a
12844 			 * a single 16K page.
12845 			 */
12846 			assert(vm_map_trunc_page(tmp_entry->vme_start,
12847 			    VM_MAP_PAGE_MASK(src_map)) ==
12848 			    vm_map_trunc_page(original_start,
12849 			    VM_MAP_PAGE_MASK(src_map)));
12850 		}
12851 
12852 		/* adjust alignment of last copy_entry's "vme_end" */
12853 		tmp_entry = vm_map_copy_last_entry(copy);
12854 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12855 			vm_map_offset_t adjustment;
12856 
12857 			original_end = tmp_entry->vme_end;
12858 
12859 			/* map-align the end of the last copy entry... */
12860 			tmp_entry->vme_end =
12861 			    vm_map_round_page(tmp_entry->vme_end,
12862 			    VM_MAP_PAGE_MASK(src_map));
12863 			/* ... adjust for mis-aligned end of copy range */
12864 			adjustment =
12865 			    (vm_map_round_page((copy->offset +
12866 			    copy->size),
12867 			    VM_MAP_PAGE_MASK(src_map)) -
12868 			    vm_map_round_page((copy->offset +
12869 			    copy->size),
12870 			    PAGE_MASK));
12871 			if (adjustment) {
12872 				assert(page_aligned(adjustment));
12873 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12874 				tmp_entry->vme_end -= adjustment;
12875 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12876 			}
12877 
12878 			/*
12879 			 * Assert that the adjustments haven't exposed
12880 			 * more than was originally copied...
12881 			 */
12882 			assert(tmp_entry->vme_end <= original_end);
12883 			/*
12884 			 * ... and that it did not adjust outside of a
12885 			 * a single 16K page.
12886 			 */
12887 			assert(vm_map_round_page(tmp_entry->vme_end,
12888 			    VM_MAP_PAGE_MASK(src_map)) ==
12889 			    vm_map_round_page(original_end,
12890 			    VM_MAP_PAGE_MASK(src_map)));
12891 		}
12892 	}
12893 
12894 	/* Fix-up start and end points in copy.  This is necessary */
12895 	/* when the various entries in the copy object were picked */
12896 	/* up from different sub-maps */
12897 
12898 	tmp_entry = vm_map_copy_first_entry(copy);
12899 	copy_size = 0; /* compute actual size */
12900 	while (tmp_entry != vm_map_copy_to_entry(copy)) {
12901 		assert(VM_MAP_PAGE_ALIGNED(
12902 			    copy_addr + (tmp_entry->vme_end -
12903 			    tmp_entry->vme_start),
12904 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12905 		assert(VM_MAP_PAGE_ALIGNED(
12906 			    copy_addr,
12907 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12908 
12909 		/*
12910 		 * The copy_entries will be injected directly into the
12911 		 * destination map and might not be "map aligned" there...
12912 		 */
12913 		tmp_entry->map_aligned = FALSE;
12914 
12915 		tmp_entry->vme_end = copy_addr +
12916 		    (tmp_entry->vme_end - tmp_entry->vme_start);
12917 		tmp_entry->vme_start = copy_addr;
12918 		assert(tmp_entry->vme_start < tmp_entry->vme_end);
12919 		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12920 		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12921 		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12922 	}
12923 
12924 	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12925 	    copy_size < copy->size) {
12926 		/*
12927 		 * The actual size of the VM map copy is smaller than what
12928 		 * was requested by the caller.  This must be because some
12929 		 * PAGE_SIZE-sized pages are missing at the end of the last
12930 		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12931 		 * The caller might not have been aware of those missing
12932 		 * pages and might not want to be aware of it, which is
12933 		 * fine as long as they don't try to access (and crash on)
12934 		 * those missing pages.
12935 		 * Let's adjust the size of the "copy", to avoid failing
12936 		 * in vm_map_copyout() or vm_map_copy_overwrite().
12937 		 */
12938 		assert(vm_map_round_page(copy_size,
12939 		    VM_MAP_PAGE_MASK(src_map)) ==
12940 		    vm_map_round_page(copy->size,
12941 		    VM_MAP_PAGE_MASK(src_map)));
12942 		copy->size = copy_size;
12943 	}
12944 
12945 	*copy_result = copy;
12946 	return KERN_SUCCESS;
12947 
12948 #undef  RETURN
12949 }
12950 
12951 kern_return_t
vm_map_copy_extract(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t do_copy,vm_map_copy_t * copy_result,vm_prot_t * cur_prot,vm_prot_t * max_prot,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)12952 vm_map_copy_extract(
12953 	vm_map_t                src_map,
12954 	vm_map_address_t        src_addr,
12955 	vm_map_size_t           len,
12956 	boolean_t               do_copy,
12957 	vm_map_copy_t           *copy_result,   /* OUT */
12958 	vm_prot_t               *cur_prot,      /* IN/OUT */
12959 	vm_prot_t               *max_prot,      /* IN/OUT */
12960 	vm_inherit_t            inheritance,
12961 	vm_map_kernel_flags_t   vmk_flags)
12962 {
12963 	vm_map_copy_t   copy;
12964 	kern_return_t   kr;
12965 	vm_prot_t required_cur_prot, required_max_prot;
12966 
12967 	/*
12968 	 *	Check for copies of zero bytes.
12969 	 */
12970 
12971 	if (len == 0) {
12972 		*copy_result = VM_MAP_COPY_NULL;
12973 		return KERN_SUCCESS;
12974 	}
12975 
12976 	/*
12977 	 *	Check that the end address doesn't overflow
12978 	 */
12979 	if (src_addr + len < src_addr) {
12980 		return KERN_INVALID_ADDRESS;
12981 	}
12982 	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12983 		return KERN_INVALID_ADDRESS;
12984 	}
12985 
12986 	if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12987 		DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12988 	}
12989 
12990 	required_cur_prot = *cur_prot;
12991 	required_max_prot = *max_prot;
12992 
12993 	/*
12994 	 *	Allocate a header element for the list.
12995 	 *
12996 	 *	Use the start and end in the header to
12997 	 *	remember the endpoints prior to rounding.
12998 	 */
12999 
13000 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
13001 	copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
13002 	copy->offset = 0;
13003 	copy->size = len;
13004 
13005 	kr = vm_map_remap_extract(src_map,
13006 	    src_addr,
13007 	    len,
13008 	    do_copy,             /* copy */
13009 	    copy,
13010 	    cur_prot,            /* IN/OUT */
13011 	    max_prot,            /* IN/OUT */
13012 	    inheritance,
13013 	    vmk_flags);
13014 	if (kr != KERN_SUCCESS) {
13015 		vm_map_copy_discard(copy);
13016 		if ((kr == KERN_INVALID_ADDRESS ||
13017 		    kr == KERN_INVALID_ARGUMENT) &&
13018 		    src_map->terminated) {
13019 			/* tell the caller that this address space is gone */
13020 			kr = KERN_TERMINATED;
13021 		}
13022 		return kr;
13023 	}
13024 	if (required_cur_prot != VM_PROT_NONE) {
13025 		assert((*cur_prot & required_cur_prot) == required_cur_prot);
13026 		assert((*max_prot & required_max_prot) == required_max_prot);
13027 	}
13028 
13029 	*copy_result = copy;
13030 	return KERN_SUCCESS;
13031 }
13032 
13033 static void
vm_map_fork_share(vm_map_t old_map,vm_map_entry_t old_entry,vm_map_t new_map)13034 vm_map_fork_share(
13035 	vm_map_t        old_map,
13036 	vm_map_entry_t  old_entry,
13037 	vm_map_t        new_map)
13038 {
13039 	vm_object_t     object;
13040 	vm_map_entry_t  new_entry;
13041 
13042 	/*
13043 	 *	New sharing code.  New map entry
13044 	 *	references original object.  Internal
13045 	 *	objects use asynchronous copy algorithm for
13046 	 *	future copies.  First make sure we have
13047 	 *	the right object.  If we need a shadow,
13048 	 *	or someone else already has one, then
13049 	 *	make a new shadow and share it.
13050 	 */
13051 
13052 	if (!old_entry->is_sub_map) {
13053 		object = VME_OBJECT(old_entry);
13054 	}
13055 
13056 	if (old_entry->is_sub_map) {
13057 		assert(old_entry->wired_count == 0);
13058 #ifndef NO_NESTED_PMAP
13059 #if !PMAP_FORK_NEST
13060 		if (old_entry->use_pmap) {
13061 			kern_return_t   result;
13062 
13063 			result = pmap_nest(new_map->pmap,
13064 			    (VME_SUBMAP(old_entry))->pmap,
13065 			    (addr64_t)old_entry->vme_start,
13066 			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
13067 			if (result) {
13068 				panic("vm_map_fork_share: pmap_nest failed!");
13069 			}
13070 		}
13071 #endif /* !PMAP_FORK_NEST */
13072 #endif  /* NO_NESTED_PMAP */
13073 	} else if (object == VM_OBJECT_NULL) {
13074 		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
13075 		    old_entry->vme_start));
13076 		VME_OFFSET_SET(old_entry, 0);
13077 		VME_OBJECT_SET(old_entry, object, false, 0);
13078 		old_entry->use_pmap = TRUE;
13079 //		assert(!old_entry->needs_copy);
13080 	} else if (object->copy_strategy !=
13081 	    MEMORY_OBJECT_COPY_SYMMETRIC) {
13082 		/*
13083 		 *	We are already using an asymmetric
13084 		 *	copy, and therefore we already have
13085 		 *	the right object.
13086 		 */
13087 
13088 		assert(!old_entry->needs_copy);
13089 	} else if (old_entry->needs_copy ||       /* case 1 */
13090 	    object->shadowed ||                 /* case 2 */
13091 	    (!object->true_share &&             /* case 3 */
13092 	    !old_entry->is_shared &&
13093 	    (object->vo_size >
13094 	    (vm_map_size_t)(old_entry->vme_end -
13095 	    old_entry->vme_start)))) {
13096 		bool is_writable;
13097 
13098 		/*
13099 		 *	We need to create a shadow.
13100 		 *	There are three cases here.
13101 		 *	In the first case, we need to
13102 		 *	complete a deferred symmetrical
13103 		 *	copy that we participated in.
13104 		 *	In the second and third cases,
13105 		 *	we need to create the shadow so
13106 		 *	that changes that we make to the
13107 		 *	object do not interfere with
13108 		 *	any symmetrical copies which
13109 		 *	have occured (case 2) or which
13110 		 *	might occur (case 3).
13111 		 *
13112 		 *	The first case is when we had
13113 		 *	deferred shadow object creation
13114 		 *	via the entry->needs_copy mechanism.
13115 		 *	This mechanism only works when
13116 		 *	only one entry points to the source
13117 		 *	object, and we are about to create
13118 		 *	a second entry pointing to the
13119 		 *	same object. The problem is that
13120 		 *	there is no way of mapping from
13121 		 *	an object to the entries pointing
13122 		 *	to it. (Deferred shadow creation
13123 		 *	works with one entry because occurs
13124 		 *	at fault time, and we walk from the
13125 		 *	entry to the object when handling
13126 		 *	the fault.)
13127 		 *
13128 		 *	The second case is when the object
13129 		 *	to be shared has already been copied
13130 		 *	with a symmetric copy, but we point
13131 		 *	directly to the object without
13132 		 *	needs_copy set in our entry. (This
13133 		 *	can happen because different ranges
13134 		 *	of an object can be pointed to by
13135 		 *	different entries. In particular,
13136 		 *	a single entry pointing to an object
13137 		 *	can be split by a call to vm_inherit,
13138 		 *	which, combined with task_create, can
13139 		 *	result in the different entries
13140 		 *	having different needs_copy values.)
13141 		 *	The shadowed flag in the object allows
13142 		 *	us to detect this case. The problem
13143 		 *	with this case is that if this object
13144 		 *	has or will have shadows, then we
13145 		 *	must not perform an asymmetric copy
13146 		 *	of this object, since such a copy
13147 		 *	allows the object to be changed, which
13148 		 *	will break the previous symmetrical
13149 		 *	copies (which rely upon the object
13150 		 *	not changing). In a sense, the shadowed
13151 		 *	flag says "don't change this object".
13152 		 *	We fix this by creating a shadow
13153 		 *	object for this object, and sharing
13154 		 *	that. This works because we are free
13155 		 *	to change the shadow object (and thus
13156 		 *	to use an asymmetric copy strategy);
13157 		 *	this is also semantically correct,
13158 		 *	since this object is temporary, and
13159 		 *	therefore a copy of the object is
13160 		 *	as good as the object itself. (This
13161 		 *	is not true for permanent objects,
13162 		 *	since the pager needs to see changes,
13163 		 *	which won't happen if the changes
13164 		 *	are made to a copy.)
13165 		 *
13166 		 *	The third case is when the object
13167 		 *	to be shared has parts sticking
13168 		 *	outside of the entry we're working
13169 		 *	with, and thus may in the future
13170 		 *	be subject to a symmetrical copy.
13171 		 *	(This is a preemptive version of
13172 		 *	case 2.)
13173 		 */
13174 		VME_OBJECT_SHADOW(old_entry,
13175 		    (vm_map_size_t) (old_entry->vme_end -
13176 		    old_entry->vme_start),
13177 		    vm_map_always_shadow(old_map));
13178 
13179 		/*
13180 		 *	If we're making a shadow for other than
13181 		 *	copy on write reasons, then we have
13182 		 *	to remove write permission.
13183 		 */
13184 
13185 		is_writable = false;
13186 		if (old_entry->protection & VM_PROT_WRITE) {
13187 			is_writable = true;
13188 #if __arm64e__
13189 		} else if (old_entry->used_for_tpro) {
13190 			is_writable = true;
13191 #endif /* __arm64e__ */
13192 		}
13193 		if (!old_entry->needs_copy && is_writable) {
13194 			vm_prot_t prot;
13195 
13196 			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13197 				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13198 				    __FUNCTION__, old_map, old_map->pmap,
13199 				    old_entry,
13200 				    (uint64_t)old_entry->vme_start,
13201 				    (uint64_t)old_entry->vme_end,
13202 				    old_entry->protection);
13203 			}
13204 
13205 			prot = old_entry->protection & ~VM_PROT_WRITE;
13206 
13207 			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13208 				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13209 				    __FUNCTION__, old_map, old_map->pmap,
13210 				    old_entry,
13211 				    (uint64_t)old_entry->vme_start,
13212 				    (uint64_t)old_entry->vme_end,
13213 				    prot);
13214 			}
13215 
13216 			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
13217 				prot |= VM_PROT_EXECUTE;
13218 			}
13219 
13220 
13221 			if (old_map->mapped_in_other_pmaps) {
13222 				vm_object_pmap_protect(
13223 					VME_OBJECT(old_entry),
13224 					VME_OFFSET(old_entry),
13225 					(old_entry->vme_end -
13226 					old_entry->vme_start),
13227 					PMAP_NULL,
13228 					PAGE_SIZE,
13229 					old_entry->vme_start,
13230 					prot);
13231 			} else {
13232 				pmap_protect(old_map->pmap,
13233 				    old_entry->vme_start,
13234 				    old_entry->vme_end,
13235 				    prot);
13236 			}
13237 		}
13238 
13239 		old_entry->needs_copy = FALSE;
13240 		object = VME_OBJECT(old_entry);
13241 	}
13242 
13243 
13244 	/*
13245 	 *	If object was using a symmetric copy strategy,
13246 	 *	change its copy strategy to the default
13247 	 *	asymmetric copy strategy, which is copy_delay
13248 	 *	in the non-norma case and copy_call in the
13249 	 *	norma case. Bump the reference count for the
13250 	 *	new entry.
13251 	 */
13252 
13253 	if (old_entry->is_sub_map) {
13254 		vm_map_reference(VME_SUBMAP(old_entry));
13255 	} else {
13256 		vm_object_lock(object);
13257 		vm_object_reference_locked(object);
13258 		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
13259 			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
13260 		}
13261 		vm_object_unlock(object);
13262 	}
13263 
13264 	/*
13265 	 *	Clone the entry, using object ref from above.
13266 	 *	Mark both entries as shared.
13267 	 */
13268 
13269 	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
13270 	vm_map_entry_copy(old_map, new_entry, old_entry);
13271 	old_entry->is_shared = TRUE;
13272 	new_entry->is_shared = TRUE;
13273 
13274 	/*
13275 	 * We're dealing with a shared mapping, so the resulting mapping
13276 	 * should inherit some of the original mapping's accounting settings.
13277 	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
13278 	 * "use_pmap" should stay the same as before (if it hasn't been reset
13279 	 * to TRUE when we cleared "iokit_acct").
13280 	 */
13281 	assert(!new_entry->iokit_acct);
13282 
13283 	/*
13284 	 *	If old entry's inheritence is VM_INHERIT_NONE,
13285 	 *	the new entry is for corpse fork, remove the
13286 	 *	write permission from the new entry.
13287 	 */
13288 	if (old_entry->inheritance == VM_INHERIT_NONE) {
13289 		new_entry->protection &= ~VM_PROT_WRITE;
13290 		new_entry->max_protection &= ~VM_PROT_WRITE;
13291 	}
13292 
13293 	/*
13294 	 *	Insert the entry into the new map -- we
13295 	 *	know we're inserting at the end of the new
13296 	 *	map.
13297 	 */
13298 
13299 	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
13300 	    VM_MAP_KERNEL_FLAGS_NONE);
13301 
13302 	/*
13303 	 *	Update the physical map
13304 	 */
13305 
13306 	if (old_entry->is_sub_map) {
13307 		/* Bill Angell pmap support goes here */
13308 	} else {
13309 		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
13310 		    old_entry->vme_end - old_entry->vme_start,
13311 		    old_entry->vme_start);
13312 	}
13313 }
13314 
13315 static boolean_t
vm_map_fork_copy(vm_map_t old_map,vm_map_entry_t * old_entry_p,vm_map_t new_map,int vm_map_copyin_flags)13316 vm_map_fork_copy(
13317 	vm_map_t        old_map,
13318 	vm_map_entry_t  *old_entry_p,
13319 	vm_map_t        new_map,
13320 	int             vm_map_copyin_flags)
13321 {
13322 	vm_map_entry_t old_entry = *old_entry_p;
13323 	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
13324 	vm_map_offset_t start = old_entry->vme_start;
13325 	vm_map_copy_t copy;
13326 	vm_map_entry_t last = vm_map_last_entry(new_map);
13327 
13328 	vm_map_unlock(old_map);
13329 	/*
13330 	 *	Use maxprot version of copyin because we
13331 	 *	care about whether this memory can ever
13332 	 *	be accessed, not just whether it's accessible
13333 	 *	right now.
13334 	 */
13335 	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
13336 	if (vm_map_copyin_internal(old_map, start, entry_size,
13337 	    vm_map_copyin_flags, &copy)
13338 	    != KERN_SUCCESS) {
13339 		/*
13340 		 *	The map might have changed while it
13341 		 *	was unlocked, check it again.  Skip
13342 		 *	any blank space or permanently
13343 		 *	unreadable region.
13344 		 */
13345 		vm_map_lock(old_map);
13346 		if (!vm_map_lookup_entry(old_map, start, &last) ||
13347 		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
13348 			last = last->vme_next;
13349 		}
13350 		*old_entry_p = last;
13351 
13352 		/*
13353 		 * XXX	For some error returns, want to
13354 		 * XXX	skip to the next element.  Note
13355 		 *	that INVALID_ADDRESS and
13356 		 *	PROTECTION_FAILURE are handled above.
13357 		 */
13358 
13359 		return FALSE;
13360 	}
13361 
13362 	/*
13363 	 * Assert that the vm_map_copy is coming from the right
13364 	 * zone and hasn't been forged
13365 	 */
13366 	vm_map_copy_require(copy);
13367 
13368 	/*
13369 	 *	Insert the copy into the new map
13370 	 */
13371 	vm_map_copy_insert(new_map, last, copy);
13372 
13373 	/*
13374 	 *	Pick up the traversal at the end of
13375 	 *	the copied region.
13376 	 */
13377 
13378 	vm_map_lock(old_map);
13379 	start += entry_size;
13380 	if (!vm_map_lookup_entry(old_map, start, &last)) {
13381 		last = last->vme_next;
13382 	} else {
13383 		if (last->vme_start == start) {
13384 			/*
13385 			 * No need to clip here and we don't
13386 			 * want to cause any unnecessary
13387 			 * unnesting...
13388 			 */
13389 		} else {
13390 			vm_map_clip_start(old_map, last, start);
13391 		}
13392 	}
13393 	*old_entry_p = last;
13394 
13395 	return TRUE;
13396 }
13397 
13398 #if PMAP_FORK_NEST
13399 #define PMAP_FORK_NEST_DEBUG 0
13400 static inline void
vm_map_fork_unnest(pmap_t new_pmap,vm_map_offset_t pre_nested_start,vm_map_offset_t pre_nested_end,vm_map_offset_t start,vm_map_offset_t end)13401 vm_map_fork_unnest(
13402 	pmap_t new_pmap,
13403 	vm_map_offset_t pre_nested_start,
13404 	vm_map_offset_t pre_nested_end,
13405 	vm_map_offset_t start,
13406 	vm_map_offset_t end)
13407 {
13408 	kern_return_t kr;
13409 	vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13410 
13411 	assertf(pre_nested_start <= pre_nested_end,
13412 	    "pre_nested start 0x%llx end 0x%llx",
13413 	    (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13414 	assertf(start <= end,
13415 	    "start 0x%llx end 0x%llx",
13416 	    (uint64_t) start, (uint64_t)end);
13417 
13418 	if (pre_nested_start == pre_nested_end) {
13419 		/* nothing was pre-nested: done */
13420 		return;
13421 	}
13422 	if (end <= pre_nested_start) {
13423 		/* fully before pre-nested range: done */
13424 		return;
13425 	}
13426 	if (start >= pre_nested_end) {
13427 		/* fully after pre-nested range: done */
13428 		return;
13429 	}
13430 	/* ignore parts of range outside of pre_nested range */
13431 	if (start < pre_nested_start) {
13432 		start = pre_nested_start;
13433 	}
13434 	if (end > pre_nested_end) {
13435 		end = pre_nested_end;
13436 	}
13437 	nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13438 	start_unnest = start & ~nesting_mask;
13439 	end_unnest = (end + nesting_mask) & ~nesting_mask;
13440 	kr = pmap_unnest(new_pmap,
13441 	    (addr64_t)start_unnest,
13442 	    (uint64_t)(end_unnest - start_unnest));
13443 #if PMAP_FORK_NEST_DEBUG
13444 	printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13445 #endif /* PMAP_FORK_NEST_DEBUG */
13446 	assertf(kr == KERN_SUCCESS,
13447 	    "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13448 	    (uint64_t)start, (uint64_t)end, new_pmap,
13449 	    (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13450 	    kr);
13451 }
13452 #endif /* PMAP_FORK_NEST */
13453 
13454 void
vm_map_inherit_limits(vm_map_t new_map,const struct _vm_map * old_map)13455 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13456 {
13457 	new_map->size_limit = old_map->size_limit;
13458 	new_map->data_limit = old_map->data_limit;
13459 	new_map->user_wire_limit = old_map->user_wire_limit;
13460 	new_map->reserved_regions = old_map->reserved_regions;
13461 }
13462 
13463 /*
13464  *	vm_map_fork:
13465  *
13466  *	Create and return a new map based on the old
13467  *	map, according to the inheritance values on the
13468  *	regions in that map and the options.
13469  *
13470  *	The source map must not be locked.
13471  */
13472 vm_map_t
vm_map_fork(ledger_t ledger,vm_map_t old_map,int options)13473 vm_map_fork(
13474 	ledger_t        ledger,
13475 	vm_map_t        old_map,
13476 	int             options)
13477 {
13478 	pmap_t          new_pmap;
13479 	vm_map_t        new_map;
13480 	vm_map_entry_t  old_entry;
13481 	vm_map_size_t   new_size = 0, entry_size;
13482 	vm_map_entry_t  new_entry;
13483 	boolean_t       src_needs_copy;
13484 	boolean_t       new_entry_needs_copy;
13485 	boolean_t       pmap_is64bit;
13486 	int             vm_map_copyin_flags;
13487 	vm_inherit_t    old_entry_inheritance;
13488 	int             map_create_options;
13489 	kern_return_t   footprint_collect_kr;
13490 
13491 	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13492 	    VM_MAP_FORK_PRESERVE_PURGEABLE |
13493 	    VM_MAP_FORK_CORPSE_FOOTPRINT |
13494 	    VM_MAP_FORK_SHARE_IF_OWNED)) {
13495 		/* unsupported option */
13496 		return VM_MAP_NULL;
13497 	}
13498 
13499 	pmap_is64bit =
13500 #if defined(__i386__) || defined(__x86_64__)
13501 	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13502 #elif defined(__arm64__)
13503 	    old_map->pmap->is_64bit;
13504 #else
13505 #error Unknown architecture.
13506 #endif
13507 
13508 	unsigned int pmap_flags = 0;
13509 	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13510 #if defined(HAS_APPLE_PAC)
13511 	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13512 #endif
13513 #if CONFIG_ROSETTA
13514 	pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13515 #endif
13516 #if PMAP_CREATE_FORCE_4K_PAGES
13517 	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13518 	    PAGE_SIZE != FOURK_PAGE_SIZE) {
13519 		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13520 	}
13521 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
13522 	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13523 	if (new_pmap == NULL) {
13524 		return VM_MAP_NULL;
13525 	}
13526 
13527 	vm_map_reference(old_map);
13528 	vm_map_lock(old_map);
13529 
13530 	map_create_options = 0;
13531 	if (old_map->hdr.entries_pageable) {
13532 		map_create_options |= VM_MAP_CREATE_PAGEABLE;
13533 	}
13534 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13535 		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13536 		footprint_collect_kr = KERN_SUCCESS;
13537 	}
13538 	new_map = vm_map_create_options(new_pmap,
13539 	    old_map->min_offset,
13540 	    old_map->max_offset,
13541 	    map_create_options);
13542 
13543 	/* inherit cs_enforcement */
13544 	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13545 
13546 	vm_map_lock(new_map);
13547 	vm_commit_pagezero_status(new_map);
13548 	/* inherit the parent map's page size */
13549 	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13550 
13551 	/* inherit the parent rlimits */
13552 	vm_map_inherit_limits(new_map, old_map);
13553 
13554 #if CONFIG_MAP_RANGES
13555 	/* inherit the parent map's VM ranges */
13556 	vm_map_range_fork(new_map, old_map);
13557 #endif
13558 
13559 #if CODE_SIGNING_MONITOR
13560 	/* Prepare the monitor for the fork */
13561 	csm_fork_prepare(old_map->pmap, new_pmap);
13562 #endif
13563 
13564 #if PMAP_FORK_NEST
13565 	/*
13566 	 * Pre-nest the shared region's pmap.
13567 	 */
13568 	vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13569 	pmap_fork_nest(old_map->pmap, new_pmap,
13570 	    &pre_nested_start, &pre_nested_end);
13571 #if PMAP_FORK_NEST_DEBUG
13572 	printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13573 #endif /* PMAP_FORK_NEST_DEBUG */
13574 #endif /* PMAP_FORK_NEST */
13575 
13576 	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13577 		/*
13578 		 * Abort any corpse collection if the system is shutting down.
13579 		 */
13580 		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13581 		    get_system_inshutdown()) {
13582 #if PMAP_FORK_NEST
13583 			new_entry = vm_map_last_entry(new_map);
13584 			if (new_entry == vm_map_to_entry(new_map)) {
13585 				/* unnest all that was pre-nested */
13586 				vm_map_fork_unnest(new_pmap,
13587 				    pre_nested_start, pre_nested_end,
13588 				    vm_map_min(new_map), vm_map_max(new_map));
13589 			} else if (new_entry->vme_end < vm_map_max(new_map)) {
13590 				/* unnest hole at the end, if pre-nested */
13591 				vm_map_fork_unnest(new_pmap,
13592 				    pre_nested_start, pre_nested_end,
13593 				    new_entry->vme_end, vm_map_max(new_map));
13594 			}
13595 #endif /* PMAP_FORK_NEST */
13596 			vm_map_corpse_footprint_collect_done(new_map);
13597 			vm_map_unlock(new_map);
13598 			vm_map_unlock(old_map);
13599 			vm_map_deallocate(new_map);
13600 			vm_map_deallocate(old_map);
13601 			printf("Aborting corpse map due to system shutdown\n");
13602 			return VM_MAP_NULL;
13603 		}
13604 
13605 		entry_size = old_entry->vme_end - old_entry->vme_start;
13606 
13607 #if PMAP_FORK_NEST
13608 		/*
13609 		 * Undo any unnecessary pre-nesting.
13610 		 */
13611 		vm_map_offset_t prev_end;
13612 		if (old_entry == vm_map_first_entry(old_map)) {
13613 			prev_end = vm_map_min(old_map);
13614 		} else {
13615 			prev_end = old_entry->vme_prev->vme_end;
13616 		}
13617 		if (prev_end < old_entry->vme_start) {
13618 			/* unnest hole before this entry, if pre-nested */
13619 			vm_map_fork_unnest(new_pmap,
13620 			    pre_nested_start, pre_nested_end,
13621 			    prev_end, old_entry->vme_start);
13622 		}
13623 		if (old_entry->is_sub_map && old_entry->use_pmap) {
13624 			/* keep this entry nested in the child */
13625 #if PMAP_FORK_NEST_DEBUG
13626 			printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13627 #endif /* PMAP_FORK_NEST_DEBUG */
13628 		} else {
13629 			/* undo nesting for this entry, if pre-nested */
13630 			vm_map_fork_unnest(new_pmap,
13631 			    pre_nested_start, pre_nested_end,
13632 			    old_entry->vme_start, old_entry->vme_end);
13633 		}
13634 #endif /* PMAP_FORK_NEST */
13635 
13636 		old_entry_inheritance = old_entry->inheritance;
13637 
13638 		/*
13639 		 * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option
13640 		 * share VM_INHERIT_NONE entries that are not backed by a
13641 		 * device pager.
13642 		 */
13643 		if (old_entry_inheritance == VM_INHERIT_NONE &&
13644 		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13645 		    (old_entry->protection & VM_PROT_READ) &&
13646 		    !(!old_entry->is_sub_map &&
13647 		    VME_OBJECT(old_entry) != NULL &&
13648 		    VME_OBJECT(old_entry)->pager != NULL &&
13649 		    is_device_pager_ops(
13650 			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13651 			old_entry_inheritance = VM_INHERIT_SHARE;
13652 		}
13653 		if (old_entry_inheritance == VM_INHERIT_COPY &&
13654 		    (options & VM_MAP_FORK_SHARE_IF_OWNED) &&
13655 		    !old_entry->is_sub_map &&
13656 		    VME_OBJECT(old_entry) != VM_OBJECT_NULL) {
13657 			vm_object_t object;
13658 			task_t owner;
13659 			object = VME_OBJECT(old_entry);
13660 			owner = VM_OBJECT_OWNER(object);
13661 			if (owner != TASK_NULL &&
13662 			    owner->map == old_map) {
13663 				/*
13664 				 * This mapping points at a VM object owned
13665 				 * by the task being forked.
13666 				 * Some tools reporting memory accounting
13667 				 * info rely on the object ID, so share this
13668 				 * mapping instead of copying, to make the
13669 				 * corpse look exactly like the original
13670 				 * task in that respect.
13671 				 */
13672 				assert(object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC);
13673 				old_entry_inheritance = VM_INHERIT_SHARE;
13674 			}
13675 		}
13676 
13677 		if (old_entry_inheritance != VM_INHERIT_NONE &&
13678 		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13679 		    footprint_collect_kr == KERN_SUCCESS) {
13680 			/*
13681 			 * The corpse won't have old_map->pmap to query
13682 			 * footprint information, so collect that data now
13683 			 * and store it in new_map->vmmap_corpse_footprint
13684 			 * for later autopsy.
13685 			 */
13686 			footprint_collect_kr =
13687 			    vm_map_corpse_footprint_collect(old_map,
13688 			    old_entry,
13689 			    new_map);
13690 		}
13691 
13692 		switch (old_entry_inheritance) {
13693 		case VM_INHERIT_NONE:
13694 			break;
13695 
13696 		case VM_INHERIT_SHARE:
13697 			vm_map_fork_share(old_map, old_entry, new_map);
13698 			new_size += entry_size;
13699 			break;
13700 
13701 		case VM_INHERIT_COPY:
13702 
13703 			/*
13704 			 *	Inline the copy_quickly case;
13705 			 *	upon failure, fall back on call
13706 			 *	to vm_map_fork_copy.
13707 			 */
13708 
13709 			if (old_entry->is_sub_map) {
13710 				break;
13711 			}
13712 			if ((old_entry->wired_count != 0) ||
13713 			    ((VME_OBJECT(old_entry) != NULL) &&
13714 			    (VME_OBJECT(old_entry)->true_share))) {
13715 				goto slow_vm_map_fork_copy;
13716 			}
13717 
13718 			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13719 			vm_map_entry_copy(old_map, new_entry, old_entry);
13720 			if (old_entry->vme_permanent) {
13721 				/* inherit "permanent" on fork() */
13722 				new_entry->vme_permanent = TRUE;
13723 			}
13724 
13725 			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13726 				new_map->jit_entry_exists = TRUE;
13727 			}
13728 
13729 			if (new_entry->is_sub_map) {
13730 				/* clear address space specifics */
13731 				new_entry->use_pmap = FALSE;
13732 			} else {
13733 				/*
13734 				 * We're dealing with a copy-on-write operation,
13735 				 * so the resulting mapping should not inherit
13736 				 * the original mapping's accounting settings.
13737 				 * "iokit_acct" should have been cleared in
13738 				 * vm_map_entry_copy().
13739 				 * "use_pmap" should be reset to its default
13740 				 * (TRUE) so that the new mapping gets
13741 				 * accounted for in the task's memory footprint.
13742 				 */
13743 				assert(!new_entry->iokit_acct);
13744 				new_entry->use_pmap = TRUE;
13745 			}
13746 
13747 			if (!vm_object_copy_quickly(
13748 				    VME_OBJECT(new_entry),
13749 				    VME_OFFSET(old_entry),
13750 				    (old_entry->vme_end -
13751 				    old_entry->vme_start),
13752 				    &src_needs_copy,
13753 				    &new_entry_needs_copy)) {
13754 				vm_map_entry_dispose(new_entry);
13755 				goto slow_vm_map_fork_copy;
13756 			}
13757 
13758 			/*
13759 			 *	Handle copy-on-write obligations
13760 			 */
13761 
13762 			if (src_needs_copy && !old_entry->needs_copy) {
13763 				vm_prot_t prot;
13764 
13765 				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13766 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13767 					    __FUNCTION__,
13768 					    old_map, old_map->pmap, old_entry,
13769 					    (uint64_t)old_entry->vme_start,
13770 					    (uint64_t)old_entry->vme_end,
13771 					    old_entry->protection);
13772 				}
13773 
13774 				prot = old_entry->protection & ~VM_PROT_WRITE;
13775 
13776 				if (override_nx(old_map, VME_ALIAS(old_entry))
13777 				    && prot) {
13778 					prot |= VM_PROT_EXECUTE;
13779 				}
13780 
13781 				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13782 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13783 					    __FUNCTION__,
13784 					    old_map, old_map->pmap, old_entry,
13785 					    (uint64_t)old_entry->vme_start,
13786 					    (uint64_t)old_entry->vme_end,
13787 					    prot);
13788 				}
13789 
13790 				vm_object_pmap_protect(
13791 					VME_OBJECT(old_entry),
13792 					VME_OFFSET(old_entry),
13793 					(old_entry->vme_end -
13794 					old_entry->vme_start),
13795 					((old_entry->is_shared
13796 					|| old_map->mapped_in_other_pmaps)
13797 					? PMAP_NULL :
13798 					old_map->pmap),
13799 					VM_MAP_PAGE_SIZE(old_map),
13800 					old_entry->vme_start,
13801 					prot);
13802 
13803 				assert(old_entry->wired_count == 0);
13804 				old_entry->needs_copy = TRUE;
13805 			}
13806 			new_entry->needs_copy = new_entry_needs_copy;
13807 
13808 			/*
13809 			 *	Insert the entry at the end
13810 			 *	of the map.
13811 			 */
13812 
13813 			vm_map_store_entry_link(new_map,
13814 			    vm_map_last_entry(new_map),
13815 			    new_entry,
13816 			    VM_MAP_KERNEL_FLAGS_NONE);
13817 			new_size += entry_size;
13818 			break;
13819 
13820 slow_vm_map_fork_copy:
13821 			vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
13822 			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13823 				vm_map_copyin_flags |=
13824 				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13825 			}
13826 			if (vm_map_fork_copy(old_map,
13827 			    &old_entry,
13828 			    new_map,
13829 			    vm_map_copyin_flags)) {
13830 				new_size += entry_size;
13831 			}
13832 			continue;
13833 		}
13834 		old_entry = old_entry->vme_next;
13835 	}
13836 
13837 #if PMAP_FORK_NEST
13838 	new_entry = vm_map_last_entry(new_map);
13839 	if (new_entry == vm_map_to_entry(new_map)) {
13840 		/* unnest all that was pre-nested */
13841 		vm_map_fork_unnest(new_pmap,
13842 		    pre_nested_start, pre_nested_end,
13843 		    vm_map_min(new_map), vm_map_max(new_map));
13844 	} else if (new_entry->vme_end < vm_map_max(new_map)) {
13845 		/* unnest hole at the end, if pre-nested */
13846 		vm_map_fork_unnest(new_pmap,
13847 		    pre_nested_start, pre_nested_end,
13848 		    new_entry->vme_end, vm_map_max(new_map));
13849 	}
13850 #endif /* PMAP_FORK_NEST */
13851 
13852 #if defined(__arm64__)
13853 	pmap_insert_commpage(new_map->pmap);
13854 #endif /* __arm64__ */
13855 
13856 	new_map->size = new_size;
13857 
13858 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13859 		vm_map_corpse_footprint_collect_done(new_map);
13860 	}
13861 
13862 	/* Propagate JIT entitlement for the pmap layer. */
13863 	if (pmap_get_jit_entitled(old_map->pmap)) {
13864 		/* Tell the pmap that it supports JIT. */
13865 		pmap_set_jit_entitled(new_map->pmap);
13866 	}
13867 
13868 	/* Propagate TPRO settings for the pmap layer */
13869 	if (pmap_get_tpro(old_map->pmap)) {
13870 		/* Tell the pmap that it supports TPRO */
13871 		pmap_set_tpro(new_map->pmap);
13872 	}
13873 
13874 
13875 	vm_map_unlock(new_map);
13876 	vm_map_unlock(old_map);
13877 	vm_map_deallocate(old_map);
13878 
13879 	return new_map;
13880 }
13881 
13882 /*
13883  * vm_map_exec:
13884  *
13885  *      Setup the "new_map" with the proper execution environment according
13886  *	to the type of executable (platform, 64bit, chroot environment).
13887  *	Map the comm page and shared region, etc...
13888  */
13889 kern_return_t
vm_map_exec(vm_map_t new_map,task_t task,boolean_t is64bit,void * fsroot,cpu_type_t cpu,cpu_subtype_t cpu_subtype,boolean_t reslide,boolean_t is_driverkit,uint32_t rsr_version)13890 vm_map_exec(
13891 	vm_map_t        new_map,
13892 	task_t          task,
13893 	boolean_t       is64bit,
13894 	void            *fsroot,
13895 	cpu_type_t      cpu,
13896 	cpu_subtype_t   cpu_subtype,
13897 	boolean_t       reslide,
13898 	boolean_t       is_driverkit,
13899 	uint32_t        rsr_version)
13900 {
13901 	SHARED_REGION_TRACE_DEBUG(
13902 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13903 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13904 		(void *)VM_KERNEL_ADDRPERM(new_map),
13905 		(void *)VM_KERNEL_ADDRPERM(task),
13906 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13907 		cpu,
13908 		cpu_subtype));
13909 	(void) vm_commpage_enter(new_map, task, is64bit);
13910 
13911 	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13912 
13913 	SHARED_REGION_TRACE_DEBUG(
13914 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13915 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13916 		(void *)VM_KERNEL_ADDRPERM(new_map),
13917 		(void *)VM_KERNEL_ADDRPERM(task),
13918 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13919 		cpu,
13920 		cpu_subtype));
13921 
13922 	/*
13923 	 * Some devices have region(s) of memory that shouldn't get allocated by
13924 	 * user processes. The following code creates dummy vm_map_entry_t's for each
13925 	 * of the regions that needs to be reserved to prevent any allocations in
13926 	 * those regions.
13927 	 */
13928 	kern_return_t kr = KERN_FAILURE;
13929 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13930 	vmk_flags.vmkf_beyond_max = true;
13931 
13932 	const struct vm_reserved_region *regions = NULL;
13933 	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13934 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13935 
13936 	for (size_t i = 0; i < num_regions; ++i) {
13937 		vm_map_offset_t address = regions[i].vmrr_addr;
13938 
13939 		kr = vm_map_enter(
13940 			new_map,
13941 			&address,
13942 			regions[i].vmrr_size,
13943 			(vm_map_offset_t)0,
13944 			vmk_flags,
13945 			VM_OBJECT_NULL,
13946 			(vm_object_offset_t)0,
13947 			FALSE,
13948 			VM_PROT_NONE,
13949 			VM_PROT_NONE,
13950 			VM_INHERIT_COPY);
13951 
13952 		if (kr != KERN_SUCCESS) {
13953 			os_log_error(OS_LOG_DEFAULT, "Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13954 			return KERN_FAILURE;
13955 		}
13956 	}
13957 
13958 	new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13959 
13960 	return KERN_SUCCESS;
13961 }
13962 
13963 uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
13964 uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
13965 uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
13966 uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
13967 uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
13968 uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
13969 uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
13970 uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
13971 uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
13972 uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
13973 uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
13974 uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
13975 uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13976 /*
13977  *	vm_map_lookup_and_lock_object:
13978  *
13979  *	Finds the VM object, offset, and
13980  *	protection for a given virtual address in the
13981  *	specified map, assuming a page fault of the
13982  *	type specified.
13983  *
13984  *	Returns the (object, offset, protection) for
13985  *	this address, whether it is wired down, and whether
13986  *	this map has the only reference to the data in question.
13987  *	In order to later verify this lookup, a "version"
13988  *	is returned.
13989  *	If contended != NULL, *contended will be set to
13990  *	true iff the thread had to spin or block to acquire
13991  *	an exclusive lock.
13992  *
13993  *	The map MUST be locked by the caller and WILL be
13994  *	locked on exit.  In order to guarantee the
13995  *	existence of the returned object, it is returned
13996  *	locked.
13997  *
13998  *	If a lookup is requested with "write protection"
13999  *	specified, the map may be changed to perform virtual
14000  *	copying operations, although the data referenced will
14001  *	remain the same.
14002  *
14003  *  If fault_info is provided, then the information is
14004  *  initialized according to the properties of the map entry
14005  *  NB: only properties of the entry are initialized,
14006  *  namely:
14007  *    - user_tag
14008  *    - pmap_options
14009  *    - iokit_acct
14010  *    - behavior
14011  *    - lo_offset
14012  *    - hi_offset
14013  *    - no_cache
14014  *    - cs_bypass
14015  *    - csm_associated
14016  *    - resilient_media
14017  *    - vme_xnu_user_debug
14018  *    - vme_no_copy_on_read
14019  *    - used_for_tpro
14020  */
14021 kern_return_t
vm_map_lookup_and_lock_object(vm_map_t * var_map,vm_map_offset_t vaddr,vm_prot_t fault_type,int object_lock_type,vm_map_version_t * out_version,vm_object_t * object,vm_object_offset_t * offset,vm_prot_t * out_prot,boolean_t * wired,vm_object_fault_info_t fault_info,vm_map_t * real_map,bool * contended)14022 vm_map_lookup_and_lock_object(
14023 	vm_map_t                *var_map,       /* IN/OUT */
14024 	vm_map_offset_t         vaddr,
14025 	vm_prot_t               fault_type,
14026 	int                     object_lock_type,
14027 	vm_map_version_t        *out_version,   /* OUT */
14028 	vm_object_t             *object,        /* OUT */
14029 	vm_object_offset_t      *offset,        /* OUT */
14030 	vm_prot_t               *out_prot,      /* OUT */
14031 	boolean_t               *wired,         /* OUT */
14032 	vm_object_fault_info_t  fault_info,     /* OUT */
14033 	vm_map_t                *real_map,      /* OUT */
14034 	bool                    *contended)     /* OUT */
14035 {
14036 	vm_map_entry_t                  entry;
14037 	vm_map_t                        map = *var_map;
14038 	vm_map_t                        old_map = *var_map;
14039 	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
14040 	vm_map_offset_t                 cow_parent_vaddr = 0;
14041 	vm_map_offset_t                 old_start = 0;
14042 	vm_map_offset_t                 old_end = 0;
14043 	vm_prot_t                       prot;
14044 	boolean_t                       mask_protections;
14045 	boolean_t                       force_copy;
14046 	boolean_t                       no_force_copy_if_executable;
14047 	boolean_t                       submap_needed_copy;
14048 	vm_prot_t                       original_fault_type;
14049 	vm_map_size_t                   fault_page_mask;
14050 
14051 	/*
14052 	 * VM_PROT_MASK means that the caller wants us to use "fault_type"
14053 	 * as a mask against the mapping's actual protections, not as an
14054 	 * absolute value.
14055 	 */
14056 	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
14057 	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
14058 	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
14059 	fault_type &= VM_PROT_ALL;
14060 	original_fault_type = fault_type;
14061 	if (contended) {
14062 		*contended = false;
14063 	}
14064 
14065 	*real_map = map;
14066 
14067 	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
14068 	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
14069 
14070 RetryLookup:
14071 	fault_type = original_fault_type;
14072 
14073 	/*
14074 	 *	If the map has an interesting hint, try it before calling
14075 	 *	full blown lookup routine.
14076 	 */
14077 	entry = map->hint;
14078 
14079 	if ((entry == vm_map_to_entry(map)) ||
14080 	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
14081 		vm_map_entry_t  tmp_entry;
14082 
14083 		/*
14084 		 *	Entry was either not a valid hint, or the vaddr
14085 		 *	was not contained in the entry, so do a full lookup.
14086 		 */
14087 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
14088 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14089 				vm_map_unlock(cow_sub_map_parent);
14090 			}
14091 			if ((*real_map != map)
14092 			    && (*real_map != cow_sub_map_parent)) {
14093 				vm_map_unlock(*real_map);
14094 			}
14095 			return KERN_INVALID_ADDRESS;
14096 		}
14097 
14098 		entry = tmp_entry;
14099 	}
14100 	if (map == old_map) {
14101 		old_start = entry->vme_start;
14102 		old_end = entry->vme_end;
14103 	}
14104 
14105 	/*
14106 	 *	Handle submaps.  Drop lock on upper map, submap is
14107 	 *	returned locked.
14108 	 */
14109 
14110 	submap_needed_copy = FALSE;
14111 submap_recurse:
14112 	if (entry->is_sub_map) {
14113 		vm_map_offset_t         local_vaddr;
14114 		vm_map_offset_t         end_delta;
14115 		vm_map_offset_t         start_delta;
14116 		vm_map_offset_t         top_entry_saved_start;
14117 		vm_object_offset_t      top_entry_saved_offset;
14118 		vm_map_entry_t          submap_entry, saved_submap_entry;
14119 		vm_object_offset_t      submap_entry_offset;
14120 		vm_object_size_t        submap_entry_size;
14121 		vm_prot_t               subentry_protection;
14122 		vm_prot_t               subentry_max_protection;
14123 		boolean_t               subentry_no_copy_on_read;
14124 		boolean_t               subentry_permanent;
14125 		boolean_t               subentry_csm_associated;
14126 #if __arm64e__
14127 		boolean_t               subentry_used_for_tpro;
14128 #endif /* __arm64e__ */
14129 		boolean_t               mapped_needs_copy = FALSE;
14130 		vm_map_version_t        version;
14131 
14132 		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
14133 		    "map %p (%d) entry %p submap %p (%d)\n",
14134 		    map, VM_MAP_PAGE_SHIFT(map), entry,
14135 		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
14136 
14137 		local_vaddr = vaddr;
14138 		top_entry_saved_start = entry->vme_start;
14139 		top_entry_saved_offset = VME_OFFSET(entry);
14140 
14141 		if ((entry->use_pmap &&
14142 		    !((fault_type & VM_PROT_WRITE) ||
14143 		    force_copy))) {
14144 			/* if real_map equals map we unlock below */
14145 			if ((*real_map != map) &&
14146 			    (*real_map != cow_sub_map_parent)) {
14147 				vm_map_unlock(*real_map);
14148 			}
14149 			*real_map = VME_SUBMAP(entry);
14150 		}
14151 
14152 		if (entry->needs_copy &&
14153 		    ((fault_type & VM_PROT_WRITE) ||
14154 		    force_copy)) {
14155 			if (!mapped_needs_copy) {
14156 				if (vm_map_lock_read_to_write(map)) {
14157 					vm_map_lock_read(map);
14158 					*real_map = map;
14159 					goto RetryLookup;
14160 				}
14161 				vm_map_lock_read(VME_SUBMAP(entry));
14162 				*var_map = VME_SUBMAP(entry);
14163 				cow_sub_map_parent = map;
14164 				/* reset base to map before cow object */
14165 				/* this is the map which will accept   */
14166 				/* the new cow object */
14167 				old_start = entry->vme_start;
14168 				old_end = entry->vme_end;
14169 				cow_parent_vaddr = vaddr;
14170 				mapped_needs_copy = TRUE;
14171 			} else {
14172 				vm_map_lock_read(VME_SUBMAP(entry));
14173 				*var_map = VME_SUBMAP(entry);
14174 				if ((cow_sub_map_parent != map) &&
14175 				    (*real_map != map)) {
14176 					vm_map_unlock(map);
14177 				}
14178 			}
14179 		} else {
14180 			if (entry->needs_copy) {
14181 				submap_needed_copy = TRUE;
14182 			}
14183 			vm_map_lock_read(VME_SUBMAP(entry));
14184 			*var_map = VME_SUBMAP(entry);
14185 			/* leave map locked if it is a target */
14186 			/* cow sub_map above otherwise, just  */
14187 			/* follow the maps down to the object */
14188 			/* here we unlock knowing we are not  */
14189 			/* revisiting the map.  */
14190 			if ((*real_map != map) && (map != cow_sub_map_parent)) {
14191 				vm_map_unlock_read(map);
14192 			}
14193 		}
14194 
14195 		entry = NULL;
14196 		map = *var_map;
14197 
14198 		/* calculate the offset in the submap for vaddr */
14199 		local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
14200 		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
14201 		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
14202 		    (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
14203 
14204 RetrySubMap:
14205 		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
14206 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14207 				vm_map_unlock(cow_sub_map_parent);
14208 			}
14209 			if ((*real_map != map)
14210 			    && (*real_map != cow_sub_map_parent)) {
14211 				vm_map_unlock(*real_map);
14212 			}
14213 			*real_map = map;
14214 			return KERN_INVALID_ADDRESS;
14215 		}
14216 
14217 		/* find the attenuated shadow of the underlying object */
14218 		/* on our target map */
14219 
14220 		/* in english the submap object may extend beyond the     */
14221 		/* region mapped by the entry or, may only fill a portion */
14222 		/* of it.  For our purposes, we only care if the object   */
14223 		/* doesn't fill.  In this case the area which will        */
14224 		/* ultimately be clipped in the top map will only need    */
14225 		/* to be as big as the portion of the underlying entry    */
14226 		/* which is mapped */
14227 		start_delta = submap_entry->vme_start > top_entry_saved_offset ?
14228 		    submap_entry->vme_start - top_entry_saved_offset : 0;
14229 
14230 		end_delta =
14231 		    (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
14232 		    submap_entry->vme_end ?
14233 		    0 : (top_entry_saved_offset +
14234 		    (old_end - old_start))
14235 		    - submap_entry->vme_end;
14236 
14237 		old_start += start_delta;
14238 		old_end -= end_delta;
14239 
14240 		if (submap_entry->is_sub_map) {
14241 			entry = submap_entry;
14242 			vaddr = local_vaddr;
14243 			goto submap_recurse;
14244 		}
14245 
14246 		if (((fault_type & VM_PROT_WRITE) ||
14247 		    force_copy)
14248 		    && cow_sub_map_parent) {
14249 			vm_object_t     sub_object, copy_object;
14250 			vm_object_offset_t copy_offset;
14251 			vm_map_offset_t local_start;
14252 			vm_map_offset_t local_end;
14253 			boolean_t       object_copied = FALSE;
14254 			vm_object_offset_t object_copied_offset = 0;
14255 			boolean_t       object_copied_needs_copy = FALSE;
14256 			kern_return_t   kr = KERN_SUCCESS;
14257 
14258 			if (vm_map_lock_read_to_write(map)) {
14259 				vm_map_lock_read(map);
14260 				old_start -= start_delta;
14261 				old_end += end_delta;
14262 				goto RetrySubMap;
14263 			}
14264 
14265 
14266 			sub_object = VME_OBJECT(submap_entry);
14267 			if (sub_object == VM_OBJECT_NULL) {
14268 				sub_object =
14269 				    vm_object_allocate(
14270 					(vm_map_size_t)
14271 					(submap_entry->vme_end -
14272 					submap_entry->vme_start));
14273 				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
14274 				VME_OFFSET_SET(submap_entry, 0);
14275 				assert(!submap_entry->is_sub_map);
14276 				assert(submap_entry->use_pmap);
14277 			}
14278 			local_start =  local_vaddr -
14279 			    (cow_parent_vaddr - old_start);
14280 			local_end = local_vaddr +
14281 			    (old_end - cow_parent_vaddr);
14282 			vm_map_clip_start(map, submap_entry, local_start);
14283 			vm_map_clip_end(map, submap_entry, local_end);
14284 			if (submap_entry->is_sub_map) {
14285 				/* unnesting was done when clipping */
14286 				assert(!submap_entry->use_pmap);
14287 			}
14288 
14289 			/* This is the COW case, lets connect */
14290 			/* an entry in our space to the underlying */
14291 			/* object in the submap, bypassing the  */
14292 			/* submap. */
14293 			submap_entry_offset = VME_OFFSET(submap_entry);
14294 			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
14295 
14296 			if ((submap_entry->wired_count != 0 ||
14297 			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
14298 			    (submap_entry->protection & VM_PROT_EXECUTE) &&
14299 			    no_force_copy_if_executable) {
14300 //				printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
14301 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14302 					vm_map_unlock(cow_sub_map_parent);
14303 				}
14304 				if ((*real_map != map)
14305 				    && (*real_map != cow_sub_map_parent)) {
14306 					vm_map_unlock(*real_map);
14307 				}
14308 				*real_map = map;
14309 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
14310 				vm_map_lock_write_to_read(map);
14311 				kr = KERN_PROTECTION_FAILURE;
14312 				DTRACE_VM4(submap_no_copy_executable,
14313 				    vm_map_t, map,
14314 				    vm_object_offset_t, submap_entry_offset,
14315 				    vm_object_size_t, submap_entry_size,
14316 				    int, kr);
14317 				return kr;
14318 			}
14319 
14320 			if (submap_entry->wired_count != 0) {
14321 				vm_object_reference(sub_object);
14322 
14323 				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
14324 				    "submap_entry %p offset 0x%llx\n",
14325 				    submap_entry, VME_OFFSET(submap_entry));
14326 
14327 				DTRACE_VM6(submap_copy_slowly,
14328 				    vm_map_t, cow_sub_map_parent,
14329 				    vm_map_offset_t, vaddr,
14330 				    vm_map_t, map,
14331 				    vm_object_size_t, submap_entry_size,
14332 				    int, submap_entry->wired_count,
14333 				    int, sub_object->copy_strategy);
14334 
14335 				saved_submap_entry = submap_entry;
14336 				version.main_timestamp = map->timestamp;
14337 				vm_map_unlock(map); /* Increments timestamp by 1 */
14338 				submap_entry = VM_MAP_ENTRY_NULL;
14339 
14340 				vm_object_lock(sub_object);
14341 				kr = vm_object_copy_slowly(sub_object,
14342 				    submap_entry_offset,
14343 				    submap_entry_size,
14344 				    FALSE, /* interruptible */
14345 				    &copy_object);
14346 				object_copied = TRUE;
14347 				object_copied_offset = 0;
14348 				/* 4k: account for extra offset in physical page */
14349 				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
14350 				object_copied_needs_copy = FALSE;
14351 				vm_object_deallocate(sub_object);
14352 
14353 				vm_map_lock(map);
14354 
14355 				if (kr != KERN_SUCCESS &&
14356 				    kr != KERN_MEMORY_RESTART_COPY) {
14357 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14358 						vm_map_unlock(cow_sub_map_parent);
14359 					}
14360 					if ((*real_map != map)
14361 					    && (*real_map != cow_sub_map_parent)) {
14362 						vm_map_unlock(*real_map);
14363 					}
14364 					*real_map = map;
14365 					vm_object_deallocate(copy_object);
14366 					copy_object = VM_OBJECT_NULL;
14367 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
14368 					vm_map_lock_write_to_read(map);
14369 					DTRACE_VM4(submap_copy_error_slowly,
14370 					    vm_object_t, sub_object,
14371 					    vm_object_offset_t, submap_entry_offset,
14372 					    vm_object_size_t, submap_entry_size,
14373 					    int, kr);
14374 					vm_map_lookup_and_lock_object_copy_slowly_error++;
14375 					return kr;
14376 				}
14377 
14378 				if ((kr == KERN_SUCCESS) &&
14379 				    (version.main_timestamp + 1) == map->timestamp) {
14380 					submap_entry = saved_submap_entry;
14381 				} else {
14382 					saved_submap_entry = NULL;
14383 					old_start -= start_delta;
14384 					old_end += end_delta;
14385 					vm_object_deallocate(copy_object);
14386 					copy_object = VM_OBJECT_NULL;
14387 					vm_map_lock_write_to_read(map);
14388 					vm_map_lookup_and_lock_object_copy_slowly_restart++;
14389 					goto RetrySubMap;
14390 				}
14391 				vm_map_lookup_and_lock_object_copy_slowly_count++;
14392 				vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
14393 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
14394 					vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
14395 				}
14396 			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
14397 				submap_entry_offset = VME_OFFSET(submap_entry);
14398 				copy_object = VM_OBJECT_NULL;
14399 				object_copied_offset = submap_entry_offset;
14400 				object_copied_needs_copy = FALSE;
14401 				DTRACE_VM6(submap_copy_strategically,
14402 				    vm_map_t, cow_sub_map_parent,
14403 				    vm_map_offset_t, vaddr,
14404 				    vm_map_t, map,
14405 				    vm_object_size_t, submap_entry_size,
14406 				    int, submap_entry->wired_count,
14407 				    int, sub_object->copy_strategy);
14408 				kr = vm_object_copy_strategically(
14409 					sub_object,
14410 					submap_entry_offset,
14411 					submap_entry->vme_end - submap_entry->vme_start,
14412 					false, /* forking */
14413 					&copy_object,
14414 					&object_copied_offset,
14415 					&object_copied_needs_copy);
14416 				if (kr == KERN_MEMORY_RESTART_COPY) {
14417 					old_start -= start_delta;
14418 					old_end += end_delta;
14419 					vm_object_deallocate(copy_object);
14420 					copy_object = VM_OBJECT_NULL;
14421 					vm_map_lock_write_to_read(map);
14422 					vm_map_lookup_and_lock_object_copy_strategically_restart++;
14423 					goto RetrySubMap;
14424 				}
14425 				if (kr != KERN_SUCCESS) {
14426 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14427 						vm_map_unlock(cow_sub_map_parent);
14428 					}
14429 					if ((*real_map != map)
14430 					    && (*real_map != cow_sub_map_parent)) {
14431 						vm_map_unlock(*real_map);
14432 					}
14433 					*real_map = map;
14434 					vm_object_deallocate(copy_object);
14435 					copy_object = VM_OBJECT_NULL;
14436 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14437 					vm_map_lock_write_to_read(map);
14438 					DTRACE_VM4(submap_copy_error_strategically,
14439 					    vm_object_t, sub_object,
14440 					    vm_object_offset_t, submap_entry_offset,
14441 					    vm_object_size_t, submap_entry_size,
14442 					    int, kr);
14443 					vm_map_lookup_and_lock_object_copy_strategically_error++;
14444 					return kr;
14445 				}
14446 				assert(copy_object != VM_OBJECT_NULL);
14447 				assert(copy_object != sub_object);
14448 				object_copied = TRUE;
14449 				vm_map_lookup_and_lock_object_copy_strategically_count++;
14450 				vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14451 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14452 					vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14453 				}
14454 			} else {
14455 				/* set up shadow object */
14456 				object_copied = FALSE;
14457 				copy_object = sub_object;
14458 				vm_object_lock(sub_object);
14459 				vm_object_reference_locked(sub_object);
14460 				VM_OBJECT_SET_SHADOWED(sub_object, TRUE);
14461 				vm_object_unlock(sub_object);
14462 
14463 				assert(submap_entry->wired_count == 0);
14464 				submap_entry->needs_copy = TRUE;
14465 
14466 				prot = submap_entry->protection;
14467 				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14468 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14469 					    __FUNCTION__,
14470 					    map, map->pmap, submap_entry,
14471 					    (uint64_t)submap_entry->vme_start,
14472 					    (uint64_t)submap_entry->vme_end,
14473 					    prot);
14474 				}
14475 				prot = prot & ~VM_PROT_WRITE;
14476 				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14477 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14478 					    __FUNCTION__,
14479 					    map, map->pmap, submap_entry,
14480 					    (uint64_t)submap_entry->vme_start,
14481 					    (uint64_t)submap_entry->vme_end,
14482 					    prot);
14483 				}
14484 
14485 				if (override_nx(old_map,
14486 				    VME_ALIAS(submap_entry))
14487 				    && prot) {
14488 					prot |= VM_PROT_EXECUTE;
14489 				}
14490 
14491 				vm_object_pmap_protect(
14492 					sub_object,
14493 					VME_OFFSET(submap_entry),
14494 					submap_entry->vme_end -
14495 					submap_entry->vme_start,
14496 					(submap_entry->is_shared
14497 					|| map->mapped_in_other_pmaps) ?
14498 					PMAP_NULL : map->pmap,
14499 					VM_MAP_PAGE_SIZE(map),
14500 					submap_entry->vme_start,
14501 					prot);
14502 				vm_map_lookup_and_lock_object_copy_shadow_count++;
14503 				vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14504 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14505 					vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14506 				}
14507 			}
14508 
14509 			/*
14510 			 * Adjust the fault offset to the submap entry.
14511 			 */
14512 			copy_offset = (local_vaddr -
14513 			    submap_entry->vme_start +
14514 			    VME_OFFSET(submap_entry));
14515 
14516 			/* This works diffently than the   */
14517 			/* normal submap case. We go back  */
14518 			/* to the parent of the cow map and*/
14519 			/* clip out the target portion of  */
14520 			/* the sub_map, substituting the   */
14521 			/* new copy object,                */
14522 
14523 			subentry_protection = submap_entry->protection;
14524 			subentry_max_protection = submap_entry->max_protection;
14525 			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14526 			subentry_permanent = submap_entry->vme_permanent;
14527 			subentry_csm_associated = submap_entry->csm_associated;
14528 #if __arm64e__
14529 			subentry_used_for_tpro = submap_entry->used_for_tpro;
14530 #endif // __arm64e__
14531 			vm_map_unlock(map);
14532 			submap_entry = NULL; /* not valid after map unlock */
14533 
14534 			local_start = old_start;
14535 			local_end = old_end;
14536 			map = cow_sub_map_parent;
14537 			*var_map = cow_sub_map_parent;
14538 			vaddr = cow_parent_vaddr;
14539 			cow_sub_map_parent = NULL;
14540 
14541 			if (!vm_map_lookup_entry(map,
14542 			    vaddr, &entry)) {
14543 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14544 					vm_map_unlock(cow_sub_map_parent);
14545 				}
14546 				if ((*real_map != map)
14547 				    && (*real_map != cow_sub_map_parent)) {
14548 					vm_map_unlock(*real_map);
14549 				}
14550 				*real_map = map;
14551 				vm_object_deallocate(
14552 					copy_object);
14553 				copy_object = VM_OBJECT_NULL;
14554 				vm_map_lock_write_to_read(map);
14555 				DTRACE_VM4(submap_lookup_post_unlock,
14556 				    uint64_t, (uint64_t)entry->vme_start,
14557 				    uint64_t, (uint64_t)entry->vme_end,
14558 				    vm_map_offset_t, vaddr,
14559 				    int, object_copied);
14560 				return KERN_INVALID_ADDRESS;
14561 			}
14562 
14563 			/* clip out the portion of space */
14564 			/* mapped by the sub map which   */
14565 			/* corresponds to the underlying */
14566 			/* object */
14567 
14568 			/*
14569 			 * Clip (and unnest) the smallest nested chunk
14570 			 * possible around the faulting address...
14571 			 */
14572 			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14573 			local_end = local_start + pmap_shared_region_size_min(map->pmap);
14574 			/*
14575 			 * ... but don't go beyond the "old_start" to "old_end"
14576 			 * range, to avoid spanning over another VM region
14577 			 * with a possibly different VM object and/or offset.
14578 			 */
14579 			if (local_start < old_start) {
14580 				local_start = old_start;
14581 			}
14582 			if (local_end > old_end) {
14583 				local_end = old_end;
14584 			}
14585 			/*
14586 			 * Adjust copy_offset to the start of the range.
14587 			 */
14588 			copy_offset -= (vaddr - local_start);
14589 
14590 			vm_map_clip_start(map, entry, local_start);
14591 			vm_map_clip_end(map, entry, local_end);
14592 			if (entry->is_sub_map) {
14593 				/* unnesting was done when clipping */
14594 				assert(!entry->use_pmap);
14595 			}
14596 
14597 			/* substitute copy object for */
14598 			/* shared map entry           */
14599 			vm_map_deallocate(VME_SUBMAP(entry));
14600 			assert(!entry->iokit_acct);
14601 			entry->use_pmap = TRUE;
14602 			VME_OBJECT_SET(entry, copy_object, false, 0);
14603 
14604 			/* propagate the submap entry's protections */
14605 			if (entry->protection != VM_PROT_READ) {
14606 				/*
14607 				 * Someone has already altered the top entry's
14608 				 * protections via vm_protect(VM_PROT_COPY).
14609 				 * Respect these new values and ignore the
14610 				 * submap entry's protections.
14611 				 */
14612 			} else {
14613 				/*
14614 				 * Regular copy-on-write: propagate the submap
14615 				 * entry's protections to the top map entry.
14616 				 */
14617 				entry->protection |= subentry_protection;
14618 			}
14619 			entry->max_protection |= subentry_max_protection;
14620 			/* propagate some attributes from subentry */
14621 			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14622 			entry->vme_permanent = subentry_permanent;
14623 			entry->csm_associated = subentry_csm_associated;
14624 #if __arm64e__
14625 			/* propagate TPRO iff the destination map has TPRO enabled */
14626 			if (subentry_used_for_tpro) {
14627 				if (vm_map_tpro(map)) {
14628 					entry->used_for_tpro = subentry_used_for_tpro;
14629 				} else {
14630 					/* "permanent" came from being TPRO */
14631 					entry->vme_permanent = FALSE;
14632 				}
14633 			}
14634 #endif /* __arm64e */
14635 			if ((entry->protection & VM_PROT_WRITE) &&
14636 			    (entry->protection & VM_PROT_EXECUTE) &&
14637 #if XNU_TARGET_OS_OSX
14638 			    map->pmap != kernel_pmap &&
14639 			    (vm_map_cs_enforcement(map)
14640 #if __arm64__
14641 			    || !VM_MAP_IS_EXOTIC(map)
14642 #endif /* __arm64__ */
14643 			    ) &&
14644 #endif /* XNU_TARGET_OS_OSX */
14645 #if CODE_SIGNING_MONITOR
14646 			    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14647 #endif
14648 			    !(entry->used_for_jit) &&
14649 			    VM_MAP_POLICY_WX_STRIP_X(map)) {
14650 				DTRACE_VM3(cs_wx,
14651 				    uint64_t, (uint64_t)entry->vme_start,
14652 				    uint64_t, (uint64_t)entry->vme_end,
14653 				    vm_prot_t, entry->protection);
14654 				printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14655 				    proc_selfpid(),
14656 				    (get_bsdtask_info(current_task())
14657 				    ? proc_name_address(get_bsdtask_info(current_task()))
14658 				    : "?"),
14659 				    __FUNCTION__, __LINE__,
14660 #if DEVELOPMENT || DEBUG
14661 				    (uint64_t)entry->vme_start,
14662 				    (uint64_t)entry->vme_end,
14663 #else /* DEVELOPMENT || DEBUG */
14664 				    (uint64_t)0,
14665 				    (uint64_t)0,
14666 #endif /* DEVELOPMENT || DEBUG */
14667 				    entry->protection);
14668 				entry->protection &= ~VM_PROT_EXECUTE;
14669 			}
14670 
14671 			if (object_copied) {
14672 				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14673 				entry->needs_copy = object_copied_needs_copy;
14674 				entry->is_shared = FALSE;
14675 			} else {
14676 				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14677 				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14678 				assert(entry->wired_count == 0);
14679 				VME_OFFSET_SET(entry, copy_offset);
14680 				entry->needs_copy = TRUE;
14681 				if (map != old_map) {
14682 					entry->is_shared = TRUE;
14683 				}
14684 			}
14685 			if (entry->inheritance == VM_INHERIT_SHARE) {
14686 				entry->inheritance = VM_INHERIT_COPY;
14687 			}
14688 
14689 			vm_map_lock_write_to_read(map);
14690 		} else {
14691 			if ((cow_sub_map_parent)
14692 			    && (cow_sub_map_parent != *real_map)
14693 			    && (cow_sub_map_parent != map)) {
14694 				vm_map_unlock(cow_sub_map_parent);
14695 			}
14696 			entry = submap_entry;
14697 			vaddr = local_vaddr;
14698 		}
14699 	}
14700 
14701 	/*
14702 	 *	Check whether this task is allowed to have
14703 	 *	this page.
14704 	 */
14705 
14706 	prot = entry->protection;
14707 
14708 	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14709 		/*
14710 		 * HACK -- if not a stack, then allow execution
14711 		 */
14712 		prot |= VM_PROT_EXECUTE;
14713 	}
14714 
14715 #if __arm64e__
14716 	/*
14717 	 * If the entry we're dealing with is TPRO and we have a write
14718 	 * fault, inject VM_PROT_WRITE into protections. This allows us
14719 	 * to maintain RO permissions when not marked as TPRO.
14720 	 */
14721 	if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14722 		prot |= VM_PROT_WRITE;
14723 	}
14724 #endif /* __arm64e__ */
14725 	if (mask_protections) {
14726 		fault_type &= prot;
14727 		if (fault_type == VM_PROT_NONE) {
14728 			goto protection_failure;
14729 		}
14730 	}
14731 	if (((fault_type & prot) != fault_type)
14732 #if __arm64__
14733 	    /* prefetch abort in execute-only page */
14734 	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14735 #elif defined(__x86_64__)
14736 	    /* Consider the UEXEC bit when handling an EXECUTE fault */
14737 	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14738 #endif
14739 	    ) {
14740 protection_failure:
14741 		if (*real_map != map) {
14742 			vm_map_unlock(*real_map);
14743 		}
14744 		*real_map = map;
14745 
14746 		if ((fault_type & VM_PROT_EXECUTE) && prot) {
14747 			log_stack_execution_failure((addr64_t)vaddr, prot);
14748 		}
14749 
14750 		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14751 		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14752 		/*
14753 		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14754 		 *
14755 		 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14756 		 */
14757 		return KERN_PROTECTION_FAILURE;
14758 	}
14759 
14760 	/*
14761 	 *	If this page is not pageable, we have to get
14762 	 *	it for all possible accesses.
14763 	 */
14764 
14765 	*wired = (entry->wired_count != 0);
14766 	if (*wired) {
14767 		fault_type = prot;
14768 	}
14769 
14770 	/*
14771 	 *	If the entry was copy-on-write, we either ...
14772 	 */
14773 
14774 	if (entry->needs_copy) {
14775 		/*
14776 		 *	If we want to write the page, we may as well
14777 		 *	handle that now since we've got the map locked.
14778 		 *
14779 		 *	If we don't need to write the page, we just
14780 		 *	demote the permissions allowed.
14781 		 */
14782 
14783 		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14784 			/*
14785 			 *	Make a new object, and place it in the
14786 			 *	object chain.  Note that no new references
14787 			 *	have appeared -- one just moved from the
14788 			 *	map to the new object.
14789 			 */
14790 
14791 			if (vm_map_lock_read_to_write(map)) {
14792 				vm_map_lock_read(map);
14793 				goto RetryLookup;
14794 			}
14795 
14796 			if (VME_OBJECT(entry)->shadowed == FALSE) {
14797 				vm_object_lock(VME_OBJECT(entry));
14798 				VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE);
14799 				vm_object_unlock(VME_OBJECT(entry));
14800 			}
14801 			VME_OBJECT_SHADOW(entry,
14802 			    (vm_map_size_t) (entry->vme_end -
14803 			    entry->vme_start),
14804 			    vm_map_always_shadow(map));
14805 			entry->needs_copy = FALSE;
14806 
14807 			vm_map_lock_write_to_read(map);
14808 		}
14809 		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14810 			/*
14811 			 *	We're attempting to read a copy-on-write
14812 			 *	page -- don't allow writes.
14813 			 */
14814 
14815 			prot &= (~VM_PROT_WRITE);
14816 		}
14817 	}
14818 
14819 	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14820 		/*
14821 		 * We went through a "needs_copy" submap without triggering
14822 		 * a copy, so granting write access to the page would bypass
14823 		 * that submap's "needs_copy".
14824 		 */
14825 		assert(!(fault_type & VM_PROT_WRITE));
14826 		assert(!*wired);
14827 		assert(!force_copy);
14828 		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14829 		prot &= ~VM_PROT_WRITE;
14830 	}
14831 
14832 	/*
14833 	 *	Create an object if necessary.
14834 	 */
14835 	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14836 		if (vm_map_lock_read_to_write(map)) {
14837 			vm_map_lock_read(map);
14838 			goto RetryLookup;
14839 		}
14840 
14841 		VME_OBJECT_SET(entry,
14842 		    vm_object_allocate(
14843 			    (vm_map_size_t)(entry->vme_end -
14844 			    entry->vme_start)), false, 0);
14845 		VME_OFFSET_SET(entry, 0);
14846 		assert(entry->use_pmap);
14847 		vm_map_lock_write_to_read(map);
14848 	}
14849 
14850 	/*
14851 	 *	Return the object/offset from this entry.  If the entry
14852 	 *	was copy-on-write or empty, it has been fixed up.  Also
14853 	 *	return the protection.
14854 	 */
14855 
14856 	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14857 	*object = VME_OBJECT(entry);
14858 	*out_prot = prot;
14859 	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14860 
14861 	if (fault_info) {
14862 		/*
14863 		 * Initialize fault information according to the entry being faulted
14864 		 * from.
14865 		 */
14866 		fault_info->user_tag = VME_ALIAS(entry);
14867 		fault_info->pmap_options = 0;
14868 		if (entry->iokit_acct ||
14869 		    (!entry->is_sub_map && !entry->use_pmap)) {
14870 			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14871 		}
14872 		if (fault_info->behavior == VM_BEHAVIOR_DEFAULT) {
14873 			fault_info->behavior = entry->behavior;
14874 		}
14875 		fault_info->lo_offset = VME_OFFSET(entry);
14876 		fault_info->hi_offset =
14877 		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14878 		fault_info->no_cache  = entry->no_cache;
14879 		fault_info->io_sync = FALSE;
14880 		fault_info->cs_bypass = (entry->used_for_jit ||
14881 #if CODE_SIGNING_MONITOR
14882 		    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14883 #endif
14884 		    entry->vme_resilient_codesign);
14885 		fault_info->mark_zf_absent = FALSE;
14886 		fault_info->batch_pmap_op = FALSE;
14887 		/*
14888 		 * The pmap layer will validate this page
14889 		 * before allowing it to be executed from.
14890 		 */
14891 #if CODE_SIGNING_MONITOR
14892 		fault_info->csm_associated = entry->csm_associated;
14893 #else
14894 		fault_info->csm_associated = FALSE;
14895 #endif
14896 
14897 		fault_info->resilient_media = entry->vme_resilient_media;
14898 		fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14899 		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14900 #if __arm64e__
14901 		fault_info->fi_used_for_tpro = entry->used_for_tpro;
14902 #else /* __arm64e__ */
14903 		fault_info->fi_used_for_tpro = FALSE;
14904 #endif
14905 		if (entry->translated_allow_execute) {
14906 			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14907 		}
14908 	}
14909 
14910 	/*
14911 	 *	Lock the object to prevent it from disappearing
14912 	 */
14913 	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14914 		if (contended == NULL) {
14915 			vm_object_lock(*object);
14916 		} else {
14917 			*contended = vm_object_lock_check_contended(*object);
14918 		}
14919 	} else {
14920 		vm_object_lock_shared(*object);
14921 	}
14922 
14923 	/*
14924 	 *	Save the version number
14925 	 */
14926 
14927 	out_version->main_timestamp = map->timestamp;
14928 
14929 	return KERN_SUCCESS;
14930 }
14931 
14932 
14933 /*
14934  *	vm_map_verify:
14935  *
14936  *	Verifies that the map in question has not changed
14937  *	since the given version. The map has to be locked
14938  *	("shared" mode is fine) before calling this function
14939  *	and it will be returned locked too.
14940  */
14941 boolean_t
vm_map_verify(vm_map_t map,vm_map_version_t * version)14942 vm_map_verify(
14943 	vm_map_t                map,
14944 	vm_map_version_t        *version)       /* REF */
14945 {
14946 	boolean_t       result;
14947 
14948 	vm_map_lock_assert_held(map);
14949 	result = (map->timestamp == version->main_timestamp);
14950 
14951 	return result;
14952 }
14953 
14954 
14955 /*
14956  *	TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14957  *	Goes away after regular vm_region_recurse function migrates to
14958  *	64 bits
14959  *	vm_region_recurse: A form of vm_region which follows the
14960  *	submaps in a target map
14961  *
14962  */
14963 
14964 kern_return_t
vm_map_region_recurse_64(vm_map_t map,vm_map_offset_ut * address_u,vm_map_size_ut * size_u,natural_t * nesting_depth,vm_region_submap_info_64_t submap_info,mach_msg_type_number_t * count)14965 vm_map_region_recurse_64(
14966 	vm_map_t                map,
14967 	vm_map_offset_ut       *address_u,      /* IN/OUT */
14968 	vm_map_size_ut         *size_u,         /* OUT */
14969 	natural_t              *nesting_depth,  /* IN/OUT */
14970 	vm_region_submap_info_64_t submap_info, /* IN/OUT */
14971 	mach_msg_type_number_t *count)          /* IN/OUT */
14972 {
14973 	mach_msg_type_number_t  original_count;
14974 	vm_region_extended_info_data_t  extended;
14975 	vm_map_entry_t                  tmp_entry;
14976 	vm_map_offset_t                 user_address;
14977 	unsigned int                    user_max_depth;
14978 
14979 	/*
14980 	 * "curr_entry" is the VM map entry preceding or including the
14981 	 * address we're looking for.
14982 	 * "curr_map" is the map or sub-map containing "curr_entry".
14983 	 * "curr_address" is the equivalent of the top map's "user_address"
14984 	 * in the current map.
14985 	 * "curr_offset" is the cumulated offset of "curr_map" in the
14986 	 * target task's address space.
14987 	 * "curr_depth" is the depth of "curr_map" in the chain of
14988 	 * sub-maps.
14989 	 *
14990 	 * "curr_max_below" and "curr_max_above" limit the range (around
14991 	 * "curr_address") we should take into account in the current (sub)map.
14992 	 * They limit the range to what's visible through the map entries
14993 	 * we've traversed from the top map to the current map.
14994 	 *
14995 	 */
14996 	vm_map_entry_t                  curr_entry;
14997 	vm_map_t                        curr_entry_submap;
14998 	vm_map_address_t                curr_entry_start;
14999 	vm_object_offset_t              curr_entry_offset;
15000 	vm_map_address_t                curr_address;
15001 	vm_map_offset_t                 curr_offset;
15002 	vm_map_t                        curr_map;
15003 	unsigned int                    curr_depth;
15004 	vm_map_offset_t                 curr_max_below, curr_max_above;
15005 	vm_map_offset_t                 curr_skip;
15006 
15007 	/*
15008 	 * "next_" is the same as "curr_" but for the VM region immediately
15009 	 * after the address we're looking for.  We need to keep track of this
15010 	 * too because we want to return info about that region if the
15011 	 * address we're looking for is not mapped.
15012 	 */
15013 	vm_map_entry_t                  next_entry;
15014 	vm_map_offset_t                 next_offset;
15015 	vm_map_offset_t                 next_address;
15016 	vm_map_t                        next_map;
15017 	unsigned int                    next_depth;
15018 	vm_map_offset_t                 next_max_below, next_max_above;
15019 	vm_map_offset_t                 next_skip;
15020 
15021 	boolean_t                       look_for_pages;
15022 	vm_region_submap_short_info_64_t short_info;
15023 	boolean_t                       do_region_footprint;
15024 	int                             effective_page_size, effective_page_shift;
15025 	boolean_t                       submap_needed_copy;
15026 
15027 	if (map == VM_MAP_NULL) {
15028 		/* no address space to work on */
15029 		return KERN_INVALID_ARGUMENT;
15030 	}
15031 
15032 	user_address = vm_sanitize_addr(map, *address_u);
15033 
15034 	effective_page_shift = vm_self_region_page_shift(map);
15035 	effective_page_size = (1 << effective_page_shift);
15036 
15037 	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
15038 		/*
15039 		 * "info" structure is not big enough and
15040 		 * would overflow
15041 		 */
15042 		return KERN_INVALID_ARGUMENT;
15043 	}
15044 
15045 	do_region_footprint = task_self_region_footprint();
15046 	original_count = *count;
15047 
15048 	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
15049 		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
15050 		look_for_pages = FALSE;
15051 		short_info = (vm_region_submap_short_info_64_t) submap_info;
15052 		submap_info = NULL;
15053 	} else {
15054 		look_for_pages = TRUE;
15055 		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
15056 		short_info = NULL;
15057 
15058 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15059 			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
15060 		}
15061 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15062 			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
15063 		}
15064 	}
15065 
15066 	user_max_depth = *nesting_depth;
15067 	submap_needed_copy = FALSE;
15068 
15069 	if (not_in_kdp) {
15070 		vm_map_lock_read(map);
15071 	}
15072 
15073 recurse_again:
15074 	curr_entry = NULL;
15075 	curr_map = map;
15076 	curr_address = user_address;
15077 	curr_offset = 0;
15078 	curr_skip = 0;
15079 	curr_depth = 0;
15080 	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
15081 	curr_max_below = curr_address;
15082 
15083 	next_entry = NULL;
15084 	next_map = NULL;
15085 	next_address = 0;
15086 	next_offset = 0;
15087 	next_skip = 0;
15088 	next_depth = 0;
15089 	next_max_above = (vm_map_offset_t) -1;
15090 	next_max_below = (vm_map_offset_t) -1;
15091 
15092 	for (;;) {
15093 		if (vm_map_lookup_entry(curr_map,
15094 		    curr_address,
15095 		    &tmp_entry)) {
15096 			/* tmp_entry contains the address we're looking for */
15097 			curr_entry = tmp_entry;
15098 		} else {
15099 			vm_map_offset_t skip;
15100 			/*
15101 			 * The address is not mapped.  "tmp_entry" is the
15102 			 * map entry preceding the address.  We want the next
15103 			 * one, if it exists.
15104 			 */
15105 			curr_entry = tmp_entry->vme_next;
15106 
15107 			if (curr_entry == vm_map_to_entry(curr_map) ||
15108 			    (curr_entry->vme_start >=
15109 			    curr_address + curr_max_above)) {
15110 				/* no next entry at this level: stop looking */
15111 				if (not_in_kdp) {
15112 					vm_map_unlock_read(curr_map);
15113 				}
15114 				curr_entry = NULL;
15115 				curr_map = NULL;
15116 				curr_skip = 0;
15117 				curr_offset = 0;
15118 				curr_depth = 0;
15119 				curr_max_above = 0;
15120 				curr_max_below = 0;
15121 				break;
15122 			}
15123 
15124 			/* adjust current address and offset */
15125 			skip = curr_entry->vme_start - curr_address;
15126 			curr_address = curr_entry->vme_start;
15127 			curr_skip += skip;
15128 			curr_offset += skip;
15129 			curr_max_above -= skip;
15130 			curr_max_below = 0;
15131 		}
15132 
15133 		/*
15134 		 * Is the next entry at this level closer to the address (or
15135 		 * deeper in the submap chain) than the one we had
15136 		 * so far ?
15137 		 */
15138 		tmp_entry = curr_entry->vme_next;
15139 		if (tmp_entry == vm_map_to_entry(curr_map)) {
15140 			/* no next entry at this level */
15141 		} else if (tmp_entry->vme_start >=
15142 		    curr_address + curr_max_above) {
15143 			/*
15144 			 * tmp_entry is beyond the scope of what we mapped of
15145 			 * this submap in the upper level: ignore it.
15146 			 */
15147 		} else if ((next_entry == NULL) ||
15148 		    (tmp_entry->vme_start + curr_offset <=
15149 		    next_entry->vme_start + next_offset)) {
15150 			/*
15151 			 * We didn't have a "next_entry" or this one is
15152 			 * closer to the address we're looking for:
15153 			 * use this "tmp_entry" as the new "next_entry".
15154 			 */
15155 			if (next_entry != NULL) {
15156 				/* unlock the last "next_map" */
15157 				if (next_map != curr_map && not_in_kdp) {
15158 					vm_map_unlock_read(next_map);
15159 				}
15160 			}
15161 			next_entry = tmp_entry;
15162 			next_map = curr_map;
15163 			next_depth = curr_depth;
15164 			next_address = next_entry->vme_start;
15165 			next_skip = curr_skip;
15166 			next_skip += (next_address - curr_address);
15167 			next_offset = curr_offset;
15168 			next_offset += (next_address - curr_address);
15169 			next_max_above = MIN(next_max_above, curr_max_above);
15170 			next_max_above = MIN(next_max_above,
15171 			    next_entry->vme_end - next_address);
15172 			next_max_below = MIN(next_max_below, curr_max_below);
15173 			next_max_below = MIN(next_max_below,
15174 			    next_address - next_entry->vme_start);
15175 		}
15176 
15177 		/*
15178 		 * "curr_max_{above,below}" allow us to keep track of the
15179 		 * portion of the submap that is actually mapped at this level:
15180 		 * the rest of that submap is irrelevant to us, since it's not
15181 		 * mapped here.
15182 		 * The relevant portion of the map starts at
15183 		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
15184 		 */
15185 		curr_max_above = MIN(curr_max_above,
15186 		    curr_entry->vme_end - curr_address);
15187 		curr_max_below = MIN(curr_max_below,
15188 		    curr_address - curr_entry->vme_start);
15189 
15190 		if (!curr_entry->is_sub_map ||
15191 		    curr_depth >= user_max_depth) {
15192 			/*
15193 			 * We hit a leaf map or we reached the maximum depth
15194 			 * we could, so stop looking.  Keep the current map
15195 			 * locked.
15196 			 */
15197 			break;
15198 		}
15199 
15200 		/*
15201 		 * Get down to the next submap level.
15202 		 */
15203 
15204 		if (curr_entry->needs_copy) {
15205 			/* everything below this is effectively copy-on-write */
15206 			submap_needed_copy = TRUE;
15207 		}
15208 
15209 		/*
15210 		 * Lock the next level and unlock the current level,
15211 		 * unless we need to keep it locked to access the "next_entry"
15212 		 * later.
15213 		 */
15214 		curr_entry_submap = VME_SUBMAP(curr_entry);
15215 		curr_entry_start = curr_entry->vme_start;
15216 		curr_entry_offset = VME_OFFSET(curr_entry);
15217 		curr_entry = VM_MAP_ENTRY_NULL; /* no longer valid after unlocking the map */
15218 		if (not_in_kdp) {
15219 			vm_map_lock_read(curr_entry_submap);
15220 		}
15221 		if (curr_map == next_map) {
15222 			/* keep "next_map" locked in case we need it */
15223 		} else {
15224 			/* release this map */
15225 			if (not_in_kdp) {
15226 				vm_map_unlock_read(curr_map);
15227 			}
15228 		}
15229 
15230 		/*
15231 		 * Adjust the offset.  "curr_entry" mapped the submap
15232 		 * at relative address "curr_entry_start" in the
15233 		 * curr_map but skips the first "curr_entry_offset"
15234 		 * bytes of the submap.
15235 		 * "curr_offset" always represents the offset of a virtual
15236 		 * address in the curr_map relative to the absolute address
15237 		 * space (i.e. the top-level VM map).
15238 		 */
15239 		curr_offset += curr_entry_offset - curr_entry_start;
15240 		curr_address = user_address + curr_offset;
15241 		/* switch to the submap */
15242 		curr_map = curr_entry_submap;
15243 		curr_depth++;
15244 	}
15245 
15246 // LP64todo: all the current tools are 32bit, obviously never worked for 64b
15247 // so probably should be a real 32b ID vs. ptr.
15248 // Current users just check for equality
15249 
15250 	if (curr_entry == NULL) {
15251 		/* no VM region contains the address... */
15252 
15253 		if (do_region_footprint && /* we want footprint numbers */
15254 		    next_entry == NULL && /* & there are no more regions */
15255 		    /* & we haven't already provided our fake region: */
15256 		    user_address <= vm_map_last_entry(map)->vme_end) {
15257 			ledger_amount_t ledger_resident, ledger_compressed;
15258 
15259 			/*
15260 			 * Add a fake memory region to account for
15261 			 * purgeable and/or ledger-tagged memory that
15262 			 * counts towards this task's memory footprint,
15263 			 * i.e. the resident/compressed pages of non-volatile
15264 			 * objects owned by that task.
15265 			 */
15266 			task_ledgers_footprint(map->pmap->ledger,
15267 			    &ledger_resident,
15268 			    &ledger_compressed);
15269 			if (ledger_resident + ledger_compressed == 0) {
15270 				/* no purgeable memory usage to report */
15271 				return KERN_INVALID_ADDRESS;
15272 			}
15273 			/* fake region to show nonvolatile footprint */
15274 			if (look_for_pages) {
15275 				submap_info->protection = VM_PROT_DEFAULT;
15276 				submap_info->max_protection = VM_PROT_DEFAULT;
15277 				submap_info->inheritance = VM_INHERIT_DEFAULT;
15278 				submap_info->offset = 0;
15279 				submap_info->user_tag = -1;
15280 				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
15281 				submap_info->pages_shared_now_private = 0;
15282 				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
15283 				submap_info->pages_dirtied = submap_info->pages_resident;
15284 				submap_info->ref_count = 1;
15285 				submap_info->shadow_depth = 0;
15286 				submap_info->external_pager = 0;
15287 				submap_info->share_mode = SM_PRIVATE;
15288 				if (submap_needed_copy) {
15289 					submap_info->share_mode = SM_COW;
15290 				}
15291 				submap_info->is_submap = 0;
15292 				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
15293 				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15294 				submap_info->user_wired_count = 0;
15295 				submap_info->pages_reusable = 0;
15296 			} else {
15297 				short_info->user_tag = -1;
15298 				short_info->offset = 0;
15299 				short_info->protection = VM_PROT_DEFAULT;
15300 				short_info->inheritance = VM_INHERIT_DEFAULT;
15301 				short_info->max_protection = VM_PROT_DEFAULT;
15302 				short_info->behavior = VM_BEHAVIOR_DEFAULT;
15303 				short_info->user_wired_count = 0;
15304 				short_info->is_submap = 0;
15305 				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15306 				short_info->external_pager = 0;
15307 				short_info->shadow_depth = 0;
15308 				short_info->share_mode = SM_PRIVATE;
15309 				if (submap_needed_copy) {
15310 					short_info->share_mode = SM_COW;
15311 				}
15312 				short_info->ref_count = 1;
15313 			}
15314 			*nesting_depth = 0;
15315 			*address_u = vm_sanitize_wrap_addr(vm_map_last_entry(map)->vme_end);
15316 			*size_u    = vm_sanitize_wrap_size(ledger_resident + ledger_compressed);
15317 			return KERN_SUCCESS;
15318 		}
15319 
15320 		if (next_entry == NULL) {
15321 			/* ... and no VM region follows it either */
15322 			return KERN_INVALID_ADDRESS;
15323 		}
15324 		/* ... gather info about the next VM region */
15325 		curr_entry = next_entry;
15326 		curr_map = next_map;    /* still locked ... */
15327 		curr_address = next_address;
15328 		curr_skip = next_skip;
15329 		curr_offset = next_offset;
15330 		curr_depth = next_depth;
15331 		curr_max_above = next_max_above;
15332 		curr_max_below = next_max_below;
15333 	} else {
15334 		/* we won't need "next_entry" after all */
15335 		if (next_entry != NULL) {
15336 			/* release "next_map" */
15337 			if (next_map != curr_map && not_in_kdp) {
15338 				vm_map_unlock_read(next_map);
15339 			}
15340 		}
15341 	}
15342 	next_entry = NULL;
15343 	next_map = NULL;
15344 	next_offset = 0;
15345 	next_skip = 0;
15346 	next_depth = 0;
15347 	next_max_below = -1;
15348 	next_max_above = -1;
15349 
15350 	if (curr_entry->is_sub_map &&
15351 	    curr_depth < user_max_depth) {
15352 		/*
15353 		 * We're not as deep as we could be:  we must have
15354 		 * gone back up after not finding anything mapped
15355 		 * below the original top-level map entry's.
15356 		 * Let's move "curr_address" forward and recurse again.
15357 		 */
15358 		user_address = curr_address;
15359 		goto recurse_again;
15360 	}
15361 
15362 	*nesting_depth = curr_depth;
15363 	*address_u = vm_sanitize_wrap_addr(
15364 		user_address + curr_skip - curr_max_below);
15365 	*size_u    = vm_sanitize_wrap_size(curr_max_above + curr_max_below);
15366 
15367 	if (look_for_pages) {
15368 		submap_info->user_tag = VME_ALIAS(curr_entry);
15369 		submap_info->offset = VME_OFFSET(curr_entry);
15370 		submap_info->protection = curr_entry->protection;
15371 		submap_info->inheritance = curr_entry->inheritance;
15372 		submap_info->max_protection = curr_entry->max_protection;
15373 		submap_info->behavior = curr_entry->behavior;
15374 		submap_info->user_wired_count = curr_entry->user_wired_count;
15375 		submap_info->is_submap = curr_entry->is_sub_map;
15376 		if (curr_entry->is_sub_map) {
15377 			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15378 		} else {
15379 			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15380 		}
15381 	} else {
15382 		short_info->user_tag = VME_ALIAS(curr_entry);
15383 		short_info->offset = VME_OFFSET(curr_entry);
15384 		short_info->protection = curr_entry->protection;
15385 		short_info->inheritance = curr_entry->inheritance;
15386 		short_info->max_protection = curr_entry->max_protection;
15387 		short_info->behavior = curr_entry->behavior;
15388 		short_info->user_wired_count = curr_entry->user_wired_count;
15389 		short_info->is_submap = curr_entry->is_sub_map;
15390 		if (curr_entry->is_sub_map) {
15391 			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15392 		} else {
15393 			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15394 		}
15395 	}
15396 
15397 	extended.pages_resident = 0;
15398 	extended.pages_swapped_out = 0;
15399 	extended.pages_shared_now_private = 0;
15400 	extended.pages_dirtied = 0;
15401 	extended.pages_reusable = 0;
15402 	extended.external_pager = 0;
15403 	extended.shadow_depth = 0;
15404 	extended.share_mode = SM_EMPTY;
15405 	extended.ref_count = 0;
15406 
15407 	if (not_in_kdp) {
15408 		if (!curr_entry->is_sub_map) {
15409 			vm_map_offset_t range_start, range_end;
15410 			range_start = MAX((curr_address - curr_max_below),
15411 			    curr_entry->vme_start);
15412 			range_end = MIN((curr_address + curr_max_above),
15413 			    curr_entry->vme_end);
15414 			vm_map_region_walk(curr_map,
15415 			    range_start,
15416 			    curr_entry,
15417 			    (VME_OFFSET(curr_entry) +
15418 			    (range_start -
15419 			    curr_entry->vme_start)),
15420 			    range_end - range_start,
15421 			    &extended,
15422 			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
15423 			if (submap_needed_copy) {
15424 				extended.share_mode = SM_COW;
15425 			}
15426 		} else {
15427 			if (curr_entry->use_pmap) {
15428 				extended.share_mode = SM_TRUESHARED;
15429 			} else {
15430 				extended.share_mode = SM_PRIVATE;
15431 			}
15432 			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
15433 		}
15434 	}
15435 
15436 	if (look_for_pages) {
15437 		submap_info->pages_resident = extended.pages_resident;
15438 		submap_info->pages_swapped_out = extended.pages_swapped_out;
15439 		submap_info->pages_shared_now_private =
15440 		    extended.pages_shared_now_private;
15441 		submap_info->pages_dirtied = extended.pages_dirtied;
15442 		submap_info->external_pager = extended.external_pager;
15443 		submap_info->shadow_depth = extended.shadow_depth;
15444 		submap_info->share_mode = extended.share_mode;
15445 		submap_info->ref_count = extended.ref_count;
15446 
15447 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15448 			submap_info->pages_reusable = extended.pages_reusable;
15449 		}
15450 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15451 			if (curr_entry->is_sub_map) {
15452 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry));
15453 			} else if (VME_OBJECT(curr_entry)) {
15454 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry));
15455 			} else {
15456 				submap_info->object_id_full = 0ull;
15457 			}
15458 		}
15459 	} else {
15460 		short_info->external_pager = extended.external_pager;
15461 		short_info->shadow_depth = extended.shadow_depth;
15462 		short_info->share_mode = extended.share_mode;
15463 		short_info->ref_count = extended.ref_count;
15464 	}
15465 
15466 	if (not_in_kdp) {
15467 		vm_map_unlock_read(curr_map);
15468 	}
15469 
15470 	return KERN_SUCCESS;
15471 }
15472 
15473 /*
15474  *	vm_region:
15475  *
15476  *	User call to obtain information about a region in
15477  *	a task's address map. Currently, only one flavor is
15478  *	supported.
15479  *
15480  *	XXX The reserved and behavior fields cannot be filled
15481  *	    in until the vm merge from the IK is completed, and
15482  *	    vm_reserve is implemented.
15483  */
15484 
15485 kern_return_t
vm_map_region(vm_map_t map,vm_map_offset_ut * address_u,vm_map_size_ut * size_u,vm_region_flavor_t flavor,vm_region_info_t info,mach_msg_type_number_t * count,mach_port_t * object_name)15486 vm_map_region(
15487 	vm_map_t                map,
15488 	vm_map_offset_ut       *address_u,      /* IN/OUT */
15489 	vm_map_size_ut         *size_u,         /* OUT */
15490 	vm_region_flavor_t      flavor,         /* IN */
15491 	vm_region_info_t        info,           /* OUT */
15492 	mach_msg_type_number_t *count,          /* IN/OUT */
15493 	mach_port_t            *object_name)    /* OUT */
15494 {
15495 	vm_map_entry_t          tmp_entry;
15496 	vm_map_entry_t          entry;
15497 	vm_map_offset_t         start;
15498 
15499 	if (map == VM_MAP_NULL) {
15500 		return KERN_INVALID_ARGUMENT;
15501 	}
15502 
15503 	start = vm_sanitize_addr(map, *address_u);
15504 
15505 	switch (flavor) {
15506 	case VM_REGION_BASIC_INFO:
15507 		/* legacy for old 32-bit objects info */
15508 	{
15509 		vm_region_basic_info_t  basic;
15510 
15511 		if (*count < VM_REGION_BASIC_INFO_COUNT) {
15512 			return KERN_INVALID_ARGUMENT;
15513 		}
15514 
15515 		basic = (vm_region_basic_info_t) info;
15516 		*count = VM_REGION_BASIC_INFO_COUNT;
15517 
15518 		vm_map_lock_read(map);
15519 
15520 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15521 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15522 				vm_map_unlock_read(map);
15523 				return KERN_INVALID_ADDRESS;
15524 			}
15525 		} else {
15526 			entry = tmp_entry;
15527 		}
15528 
15529 		start = entry->vme_start;
15530 
15531 		basic->offset = (uint32_t)VME_OFFSET(entry);
15532 		basic->protection = entry->protection;
15533 		basic->inheritance = entry->inheritance;
15534 		basic->max_protection = entry->max_protection;
15535 		basic->behavior = entry->behavior;
15536 		basic->user_wired_count = entry->user_wired_count;
15537 		basic->reserved = entry->is_sub_map;
15538 
15539 		*address_u = vm_sanitize_wrap_addr(start);
15540 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15541 
15542 		if (object_name) {
15543 			*object_name = IP_NULL;
15544 		}
15545 		if (entry->is_sub_map) {
15546 			basic->shared = FALSE;
15547 		} else {
15548 			basic->shared = entry->is_shared;
15549 		}
15550 
15551 		vm_map_unlock_read(map);
15552 		return KERN_SUCCESS;
15553 	}
15554 
15555 	case VM_REGION_BASIC_INFO_64:
15556 	{
15557 		vm_region_basic_info_64_t       basic;
15558 
15559 		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15560 			return KERN_INVALID_ARGUMENT;
15561 		}
15562 
15563 		basic = (vm_region_basic_info_64_t) info;
15564 		*count = VM_REGION_BASIC_INFO_COUNT_64;
15565 
15566 		vm_map_lock_read(map);
15567 
15568 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15569 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15570 				vm_map_unlock_read(map);
15571 				return KERN_INVALID_ADDRESS;
15572 			}
15573 		} else {
15574 			entry = tmp_entry;
15575 		}
15576 
15577 		start = entry->vme_start;
15578 
15579 		basic->offset = VME_OFFSET(entry);
15580 		basic->protection = entry->protection;
15581 		basic->inheritance = entry->inheritance;
15582 		basic->max_protection = entry->max_protection;
15583 		basic->behavior = entry->behavior;
15584 		basic->user_wired_count = entry->user_wired_count;
15585 		basic->reserved = entry->is_sub_map;
15586 
15587 		*address_u = vm_sanitize_wrap_addr(start);
15588 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15589 
15590 		if (object_name) {
15591 			*object_name = IP_NULL;
15592 		}
15593 		if (entry->is_sub_map) {
15594 			basic->shared = FALSE;
15595 		} else {
15596 			basic->shared = entry->is_shared;
15597 		}
15598 
15599 		vm_map_unlock_read(map);
15600 		return KERN_SUCCESS;
15601 	}
15602 	case VM_REGION_EXTENDED_INFO:
15603 		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15604 			return KERN_INVALID_ARGUMENT;
15605 		}
15606 		OS_FALLTHROUGH;
15607 	case VM_REGION_EXTENDED_INFO__legacy:
15608 	{
15609 		vm_region_extended_info_t       extended;
15610 		mach_msg_type_number_t original_count;
15611 		int effective_page_size, effective_page_shift;
15612 
15613 		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15614 			return KERN_INVALID_ARGUMENT;
15615 		}
15616 
15617 		extended = (vm_region_extended_info_t) info;
15618 
15619 		effective_page_shift = vm_self_region_page_shift(map);
15620 		effective_page_size = (1 << effective_page_shift);
15621 
15622 		vm_map_lock_read(map);
15623 
15624 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15625 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15626 				vm_map_unlock_read(map);
15627 				return KERN_INVALID_ADDRESS;
15628 			}
15629 		} else {
15630 			entry = tmp_entry;
15631 		}
15632 		start = entry->vme_start;
15633 
15634 		extended->protection = entry->protection;
15635 		extended->user_tag = VME_ALIAS(entry);
15636 		extended->pages_resident = 0;
15637 		extended->pages_swapped_out = 0;
15638 		extended->pages_shared_now_private = 0;
15639 		extended->pages_dirtied = 0;
15640 		extended->external_pager = 0;
15641 		extended->shadow_depth = 0;
15642 
15643 		original_count = *count;
15644 		if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15645 			*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15646 		} else {
15647 			extended->pages_reusable = 0;
15648 			*count = VM_REGION_EXTENDED_INFO_COUNT;
15649 		}
15650 
15651 		vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15652 
15653 		if (object_name) {
15654 			*object_name = IP_NULL;
15655 		}
15656 
15657 		*address_u = vm_sanitize_wrap_addr(start);
15658 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15659 
15660 		vm_map_unlock_read(map);
15661 		return KERN_SUCCESS;
15662 	}
15663 	case VM_REGION_TOP_INFO:
15664 	{
15665 		vm_region_top_info_t    top;
15666 
15667 		if (*count < VM_REGION_TOP_INFO_COUNT) {
15668 			return KERN_INVALID_ARGUMENT;
15669 		}
15670 
15671 		top = (vm_region_top_info_t) info;
15672 		*count = VM_REGION_TOP_INFO_COUNT;
15673 
15674 		vm_map_lock_read(map);
15675 
15676 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15677 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15678 				vm_map_unlock_read(map);
15679 				return KERN_INVALID_ADDRESS;
15680 			}
15681 		} else {
15682 			entry = tmp_entry;
15683 		}
15684 		start = entry->vme_start;
15685 
15686 		top->private_pages_resident = 0;
15687 		top->shared_pages_resident = 0;
15688 
15689 		vm_map_region_top_walk(entry, top);
15690 
15691 		if (object_name) {
15692 			*object_name = IP_NULL;
15693 		}
15694 
15695 		*address_u = vm_sanitize_wrap_addr(start);
15696 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15697 
15698 		vm_map_unlock_read(map);
15699 		return KERN_SUCCESS;
15700 	}
15701 	default:
15702 		return KERN_INVALID_ARGUMENT;
15703 	}
15704 }
15705 
15706 #define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
15707 	MIN((entry_size),                                               \
15708 	    ((obj)->all_reusable ?                                      \
15709 	     (obj)->wired_page_count :                                  \
15710 	     (obj)->resident_page_count - (obj)->reusable_page_count))
15711 
15712 void
vm_map_region_top_walk(vm_map_entry_t entry,vm_region_top_info_t top)15713 vm_map_region_top_walk(
15714 	vm_map_entry_t             entry,
15715 	vm_region_top_info_t       top)
15716 {
15717 	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
15718 		top->share_mode = SM_EMPTY;
15719 		top->ref_count = 0;
15720 		top->obj_id = 0;
15721 		return;
15722 	}
15723 
15724 	{
15725 		struct  vm_object *obj, *tmp_obj;
15726 		int             ref_count;
15727 		uint32_t        entry_size;
15728 
15729 		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
15730 
15731 		obj = VME_OBJECT(entry);
15732 
15733 		vm_object_lock(obj);
15734 
15735 		if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15736 		    obj->paging_in_progress) {
15737 			ref_count--;
15738 		}
15739 
15740 		assert(obj->reusable_page_count <= obj->resident_page_count);
15741 		if (obj->shadow) {
15742 			if (ref_count == 1) {
15743 				top->private_pages_resident =
15744 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15745 			} else {
15746 				top->shared_pages_resident =
15747 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15748 			}
15749 			top->ref_count  = ref_count;
15750 			top->share_mode = SM_COW;
15751 
15752 			while ((tmp_obj = obj->shadow)) {
15753 				vm_object_lock(tmp_obj);
15754 				vm_object_unlock(obj);
15755 				obj = tmp_obj;
15756 
15757 				if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15758 				    obj->paging_in_progress) {
15759 					ref_count--;
15760 				}
15761 
15762 				assert(obj->reusable_page_count <= obj->resident_page_count);
15763 				top->shared_pages_resident +=
15764 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15765 				top->ref_count += ref_count - 1;
15766 			}
15767 		} else {
15768 			if (entry->superpage_size) {
15769 				top->share_mode = SM_LARGE_PAGE;
15770 				top->shared_pages_resident = 0;
15771 				top->private_pages_resident = entry_size;
15772 			} else if (entry->needs_copy) {
15773 				top->share_mode = SM_COW;
15774 				top->shared_pages_resident =
15775 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15776 			} else {
15777 				if (ref_count == 1 ||
15778 				    (ref_count == 2 && obj->named)) {
15779 					top->share_mode = SM_PRIVATE;
15780 					top->private_pages_resident =
15781 					    OBJ_RESIDENT_COUNT(obj,
15782 					    entry_size);
15783 				} else {
15784 					top->share_mode = SM_SHARED;
15785 					top->shared_pages_resident =
15786 					    OBJ_RESIDENT_COUNT(obj,
15787 					    entry_size);
15788 				}
15789 			}
15790 			top->ref_count = ref_count;
15791 		}
15792 
15793 		vm_object_unlock(obj);
15794 
15795 		/* XXX K64: obj_id will be truncated */
15796 		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj);
15797 	}
15798 }
15799 
15800 void
vm_map_region_walk(vm_map_t map,vm_map_offset_t va,vm_map_entry_t entry,vm_object_offset_t offset,vm_object_size_t range,vm_region_extended_info_t extended,boolean_t look_for_pages,mach_msg_type_number_t count)15801 vm_map_region_walk(
15802 	vm_map_t                        map,
15803 	vm_map_offset_t                 va,
15804 	vm_map_entry_t                  entry,
15805 	vm_object_offset_t              offset,
15806 	vm_object_size_t                range,
15807 	vm_region_extended_info_t       extended,
15808 	boolean_t                       look_for_pages,
15809 	mach_msg_type_number_t count)
15810 {
15811 	struct vm_object *obj, *tmp_obj;
15812 	vm_map_offset_t       last_offset;
15813 	int               i;
15814 	int               ref_count;
15815 	struct vm_object        *shadow_object;
15816 	unsigned short          shadow_depth;
15817 	boolean_t         do_region_footprint;
15818 	int                     effective_page_size, effective_page_shift;
15819 	vm_map_offset_t         effective_page_mask;
15820 
15821 	do_region_footprint = task_self_region_footprint();
15822 
15823 	if ((entry->is_sub_map) ||
15824 	    (VME_OBJECT(entry) == 0) ||
15825 	    (VME_OBJECT(entry)->phys_contiguous &&
15826 	    !entry->superpage_size)) {
15827 		extended->share_mode = SM_EMPTY;
15828 		extended->ref_count = 0;
15829 		return;
15830 	}
15831 
15832 	if (entry->superpage_size) {
15833 		extended->shadow_depth = 0;
15834 		extended->share_mode = SM_LARGE_PAGE;
15835 		extended->ref_count = 1;
15836 		extended->external_pager = 0;
15837 
15838 		/* TODO4K: Superpage in 4k mode? */
15839 		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15840 		extended->shadow_depth = 0;
15841 		return;
15842 	}
15843 
15844 	effective_page_shift = vm_self_region_page_shift(map);
15845 	effective_page_size = (1 << effective_page_shift);
15846 	effective_page_mask = effective_page_size - 1;
15847 
15848 	offset = vm_map_trunc_page(offset, effective_page_mask);
15849 
15850 	obj = VME_OBJECT(entry);
15851 
15852 	vm_object_lock(obj);
15853 
15854 	if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15855 	    obj->paging_in_progress) {
15856 		ref_count--;
15857 	}
15858 
15859 	if (look_for_pages) {
15860 		for (last_offset = offset + range;
15861 		    offset < last_offset;
15862 		    offset += effective_page_size, va += effective_page_size) {
15863 			if (do_region_footprint) {
15864 				int disp;
15865 
15866 				disp = 0;
15867 				if (map->has_corpse_footprint) {
15868 					/*
15869 					 * Query the page info data we saved
15870 					 * while forking the corpse.
15871 					 */
15872 					vm_map_corpse_footprint_query_page_info(
15873 						map,
15874 						va,
15875 						&disp);
15876 				} else {
15877 					/*
15878 					 * Query the pmap.
15879 					 */
15880 					vm_map_footprint_query_page_info(
15881 						map,
15882 						entry,
15883 						va,
15884 						&disp);
15885 				}
15886 				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15887 					extended->pages_resident++;
15888 				}
15889 				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15890 					extended->pages_reusable++;
15891 				}
15892 				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15893 					extended->pages_dirtied++;
15894 				}
15895 				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15896 					extended->pages_swapped_out++;
15897 				}
15898 				continue;
15899 			}
15900 
15901 			vm_map_region_look_for_page(map, va, obj,
15902 			    vm_object_trunc_page(offset), ref_count,
15903 			    0, extended, count);
15904 		}
15905 
15906 		if (do_region_footprint) {
15907 			goto collect_object_info;
15908 		}
15909 	} else {
15910 collect_object_info:
15911 		shadow_object = obj->shadow;
15912 		shadow_depth = 0;
15913 
15914 		if (!(obj->internal)) {
15915 			extended->external_pager = 1;
15916 		}
15917 
15918 		if (shadow_object != VM_OBJECT_NULL) {
15919 			vm_object_lock(shadow_object);
15920 			for (;
15921 			    shadow_object != VM_OBJECT_NULL;
15922 			    shadow_depth++) {
15923 				vm_object_t     next_shadow;
15924 
15925 				if (!(shadow_object->internal)) {
15926 					extended->external_pager = 1;
15927 				}
15928 
15929 				next_shadow = shadow_object->shadow;
15930 				if (next_shadow) {
15931 					vm_object_lock(next_shadow);
15932 				}
15933 				vm_object_unlock(shadow_object);
15934 				shadow_object = next_shadow;
15935 			}
15936 		}
15937 		extended->shadow_depth = shadow_depth;
15938 	}
15939 
15940 	if (extended->shadow_depth || entry->needs_copy) {
15941 		extended->share_mode = SM_COW;
15942 	} else {
15943 		if (ref_count == 1) {
15944 			extended->share_mode = SM_PRIVATE;
15945 		} else {
15946 			if (obj->true_share) {
15947 				extended->share_mode = SM_TRUESHARED;
15948 			} else {
15949 				extended->share_mode = SM_SHARED;
15950 			}
15951 		}
15952 	}
15953 	extended->ref_count = ref_count - extended->shadow_depth;
15954 
15955 	for (i = 0; i < extended->shadow_depth; i++) {
15956 		if ((tmp_obj = obj->shadow) == 0) {
15957 			break;
15958 		}
15959 		vm_object_lock(tmp_obj);
15960 		vm_object_unlock(obj);
15961 
15962 		if ((ref_count = os_ref_get_count_raw(&tmp_obj->ref_count)) > 1 &&
15963 		    tmp_obj->paging_in_progress) {
15964 			ref_count--;
15965 		}
15966 
15967 		extended->ref_count += ref_count;
15968 		obj = tmp_obj;
15969 	}
15970 	vm_object_unlock(obj);
15971 
15972 	if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
15973 		extended->share_mode = SM_PRIVATE;
15974 	} else if (extended->share_mode == SM_SHARED && !(task_self_region_info_flags() & VM_REGION_INFO_FLAGS_NO_ALIASED)) {
15975 		vm_map_entry_t       cur;
15976 		vm_map_entry_t       last;
15977 		int      my_refs;
15978 
15979 		obj = VME_OBJECT(entry);
15980 		last = vm_map_to_entry(map);
15981 		my_refs = 0;
15982 
15983 		if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15984 		    obj->paging_in_progress) {
15985 			ref_count--;
15986 		}
15987 		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
15988 			if (vm_map_region_has_obj_ref(cur, obj)) {
15989 				my_refs++;
15990 			}
15991 		}
15992 
15993 		if (my_refs == ref_count) {
15994 			extended->share_mode = SM_PRIVATE_ALIASED;
15995 		} else if (my_refs > 1) {
15996 			extended->share_mode = SM_SHARED_ALIASED;
15997 		}
15998 	}
15999 }
16000 
16001 
16002 /* object is locked on entry and locked on return */
16003 
16004 
16005 static void
vm_map_region_look_for_page(__unused vm_map_t map,__unused vm_map_offset_t va,vm_object_t object,vm_object_offset_t offset,int max_refcnt,unsigned short depth,vm_region_extended_info_t extended,mach_msg_type_number_t count)16006 vm_map_region_look_for_page(
16007 	__unused vm_map_t               map,
16008 	__unused vm_map_offset_t        va,
16009 	vm_object_t                     object,
16010 	vm_object_offset_t              offset,
16011 	int                             max_refcnt,
16012 	unsigned short                  depth,
16013 	vm_region_extended_info_t       extended,
16014 	mach_msg_type_number_t count)
16015 {
16016 	vm_page_t       p;
16017 	vm_object_t     shadow;
16018 	int             ref_count;
16019 	vm_object_t     caller_object;
16020 
16021 	shadow = object->shadow;
16022 	caller_object = object;
16023 
16024 
16025 	while (TRUE) {
16026 		if (!(object->internal)) {
16027 			extended->external_pager = 1;
16028 		}
16029 
16030 		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
16031 			if (shadow && (max_refcnt == 1)) {
16032 				extended->pages_shared_now_private++;
16033 			}
16034 
16035 			if (!vm_page_is_fictitious(p) &&
16036 			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
16037 				extended->pages_dirtied++;
16038 			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
16039 				if (p->vmp_reusable || object->all_reusable) {
16040 					extended->pages_reusable++;
16041 				}
16042 			}
16043 
16044 			extended->pages_resident++;
16045 
16046 			if (object != caller_object) {
16047 				vm_object_unlock(object);
16048 			}
16049 
16050 			return;
16051 		}
16052 		if (object->internal &&
16053 		    object->alive &&
16054 		    !object->terminating &&
16055 		    object->pager_ready) {
16056 			if (vm_object_compressor_pager_state_get(object, offset)
16057 			    == VM_EXTERNAL_STATE_EXISTS) {
16058 				/* the pager has that page */
16059 				extended->pages_swapped_out++;
16060 				if (object != caller_object) {
16061 					vm_object_unlock(object);
16062 				}
16063 				return;
16064 			}
16065 		}
16066 
16067 		if (shadow) {
16068 			vm_object_lock(shadow);
16069 			if ((ref_count = os_ref_get_count_raw(&shadow->ref_count)) > 1 &&
16070 			    shadow->paging_in_progress) {
16071 				ref_count--;
16072 			}
16073 
16074 			if (++depth > extended->shadow_depth) {
16075 				extended->shadow_depth = depth;
16076 			}
16077 
16078 			if (ref_count > max_refcnt) {
16079 				max_refcnt = ref_count;
16080 			}
16081 
16082 			if (object != caller_object) {
16083 				vm_object_unlock(object);
16084 			}
16085 
16086 			offset = offset + object->vo_shadow_offset;
16087 			object = shadow;
16088 			shadow = object->shadow;
16089 			continue;
16090 		}
16091 		if (object != caller_object) {
16092 			vm_object_unlock(object);
16093 		}
16094 		break;
16095 	}
16096 }
16097 
16098 static inline boolean_t
vm_map_region_has_obj_ref(vm_map_entry_t entry,vm_object_t object)16099 vm_map_region_has_obj_ref(
16100 	vm_map_entry_t    entry,
16101 	vm_object_t       object)
16102 {
16103 	vm_object_t cur_obj;
16104 	vm_object_t shadow_obj;
16105 
16106 	if (entry->is_sub_map) {
16107 		return FALSE;
16108 	}
16109 
16110 	cur_obj = VME_OBJECT(entry);
16111 	if (cur_obj == VM_OBJECT_NULL) {
16112 		return FALSE;
16113 	} else if (cur_obj == object) {
16114 		return TRUE;
16115 	}
16116 
16117 	/*
16118 	 * Avoid locks for first shadow check, otherwise diagnostic tools will
16119 	 * spend most of their time obtaining locks in this function when analyzing
16120 	 * processes with many VM entries which may commonly have no shadow chain.
16121 	 *
16122 	 * This is acceptable because:
16123 	 *  - Shadow's fields are not accessed outside of its lock
16124 	 *  - Objects are unlikely to be modified due to:
16125 	 *	  - Many diagnostic tools suspend the task
16126 	 *	  - VM map is locked
16127 	 *	- The rare incorrect return from this function turns a guess into a
16128 	 *	  slightly worse guess
16129 	 *	- Entire shadow chain is not locked as a whole, so can still change
16130 	 *	  while traversing, resulting in incorrect guess even with locking
16131 	 */
16132 	shadow_obj = cur_obj->shadow;
16133 	if (shadow_obj == VM_OBJECT_NULL) {
16134 		return FALSE;
16135 	} else if (shadow_obj == object) {
16136 		return TRUE;
16137 	}
16138 
16139 	vm_object_lock(cur_obj);
16140 
16141 	while ((shadow_obj = cur_obj->shadow)) {
16142 		/* check if object was found before grabbing a lock */
16143 		if (shadow_obj == object) {
16144 			vm_object_unlock(cur_obj);
16145 			return TRUE;
16146 		}
16147 
16148 		vm_object_lock(shadow_obj);
16149 		vm_object_unlock(cur_obj);
16150 		cur_obj = shadow_obj;
16151 	}
16152 
16153 	/* exhausted the shadow chain */
16154 	vm_object_unlock(cur_obj);
16155 	return FALSE;
16156 }
16157 
16158 
16159 /*
16160  *	Routine:	vm_map_simplify
16161  *
16162  *	Description:
16163  *		Attempt to simplify the map representation in
16164  *		the vicinity of the given starting address.
16165  *	Note:
16166  *		This routine is intended primarily to keep the
16167  *		kernel maps more compact -- they generally don't
16168  *		benefit from the "expand a map entry" technology
16169  *		at allocation time because the adjacent entry
16170  *		is often wired down.
16171  */
16172 void
vm_map_simplify_entry(vm_map_t map,vm_map_entry_t this_entry)16173 vm_map_simplify_entry(
16174 	vm_map_t        map,
16175 	vm_map_entry_t  this_entry)
16176 {
16177 	vm_map_entry_t  prev_entry;
16178 
16179 	prev_entry = this_entry->vme_prev;
16180 
16181 	if ((this_entry != vm_map_to_entry(map)) &&
16182 	    (prev_entry != vm_map_to_entry(map)) &&
16183 
16184 	    (prev_entry->vme_end == this_entry->vme_start) &&
16185 
16186 	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
16187 	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
16188 	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
16189 	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
16190 	    prev_entry->vme_start))
16191 	    == VME_OFFSET(this_entry)) &&
16192 
16193 	    (prev_entry->behavior == this_entry->behavior) &&
16194 	    (prev_entry->needs_copy == this_entry->needs_copy) &&
16195 	    (prev_entry->protection == this_entry->protection) &&
16196 	    (prev_entry->max_protection == this_entry->max_protection) &&
16197 	    (prev_entry->inheritance == this_entry->inheritance) &&
16198 	    (prev_entry->use_pmap == this_entry->use_pmap) &&
16199 	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
16200 	    (prev_entry->no_cache == this_entry->no_cache) &&
16201 	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
16202 	    (prev_entry->map_aligned == this_entry->map_aligned) &&
16203 	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
16204 	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
16205 #if __arm64e__
16206 	    (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
16207 #endif
16208 	    (prev_entry->csm_associated == this_entry->csm_associated) &&
16209 	    (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
16210 	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
16211 	    (prev_entry->vme_resilient_codesign ==
16212 	    this_entry->vme_resilient_codesign) &&
16213 	    (prev_entry->vme_resilient_media ==
16214 	    this_entry->vme_resilient_media) &&
16215 	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
16216 	    (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&
16217 
16218 	    (prev_entry->wired_count == this_entry->wired_count) &&
16219 	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&
16220 
16221 	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
16222 	    (prev_entry->in_transition == FALSE) &&
16223 	    (this_entry->in_transition == FALSE) &&
16224 	    (prev_entry->needs_wakeup == FALSE) &&
16225 	    (this_entry->needs_wakeup == FALSE) &&
16226 	    (prev_entry->is_shared == this_entry->is_shared) &&
16227 	    (prev_entry->superpage_size == FALSE) &&
16228 	    (this_entry->superpage_size == FALSE)
16229 	    ) {
16230 		if (prev_entry->vme_permanent) {
16231 			assert(this_entry->vme_permanent);
16232 			prev_entry->vme_permanent = false;
16233 		}
16234 		vm_map_store_entry_unlink(map, prev_entry, true);
16235 		assert(prev_entry->vme_start < this_entry->vme_end);
16236 		if (prev_entry->map_aligned) {
16237 			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
16238 			    VM_MAP_PAGE_MASK(map)));
16239 		}
16240 		this_entry->vme_start = prev_entry->vme_start;
16241 		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));
16242 
16243 		if (map->holelistenabled) {
16244 			vm_map_store_update_first_free(map, this_entry, TRUE);
16245 		}
16246 
16247 		if (prev_entry->is_sub_map) {
16248 			vm_map_deallocate(VME_SUBMAP(prev_entry));
16249 		} else {
16250 			vm_object_deallocate(VME_OBJECT(prev_entry));
16251 		}
16252 		vm_map_entry_dispose(prev_entry);
16253 		SAVE_HINT_MAP_WRITE(map, this_entry);
16254 	}
16255 }
16256 
16257 void
vm_map_simplify(vm_map_t map,vm_map_offset_t start)16258 vm_map_simplify(
16259 	vm_map_t        map,
16260 	vm_map_offset_t start)
16261 {
16262 	vm_map_entry_t  this_entry;
16263 
16264 	vm_map_lock(map);
16265 	if (vm_map_lookup_entry(map, start, &this_entry)) {
16266 		vm_map_simplify_entry(map, this_entry);
16267 		vm_map_simplify_entry(map, this_entry->vme_next);
16268 	}
16269 	vm_map_unlock(map);
16270 }
16271 
16272 static void
vm_map_simplify_range(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16273 vm_map_simplify_range(
16274 	vm_map_t        map,
16275 	vm_map_offset_t start,
16276 	vm_map_offset_t end)
16277 {
16278 	vm_map_entry_t  entry;
16279 
16280 	/*
16281 	 * The map should be locked (for "write") by the caller.
16282 	 */
16283 
16284 	if (start >= end) {
16285 		/* invalid address range */
16286 		return;
16287 	}
16288 
16289 	start = vm_map_trunc_page(start,
16290 	    VM_MAP_PAGE_MASK(map));
16291 	end = vm_map_round_page(end,
16292 	    VM_MAP_PAGE_MASK(map));
16293 
16294 	if (!vm_map_lookup_entry(map, start, &entry)) {
16295 		/* "start" is not mapped and "entry" ends before "start" */
16296 		if (entry == vm_map_to_entry(map)) {
16297 			/* start with first entry in the map */
16298 			entry = vm_map_first_entry(map);
16299 		} else {
16300 			/* start with next entry */
16301 			entry = entry->vme_next;
16302 		}
16303 	}
16304 
16305 	while (entry != vm_map_to_entry(map) &&
16306 	    entry->vme_start <= end) {
16307 		/* try and coalesce "entry" with its previous entry */
16308 		vm_map_simplify_entry(map, entry);
16309 		entry = entry->vme_next;
16310 	}
16311 }
16312 
16313 static __attribute__((always_inline, warn_unused_result))
16314 kern_return_t
vm_map_machine_attribute_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,mach_vm_offset_t * start,mach_vm_offset_t * end,vm_map_size_t * size)16315 vm_map_machine_attribute_sanitize(
16316 	vm_map_t                map,
16317 	vm_map_offset_ut        start_u,
16318 	vm_map_offset_ut        end_u,
16319 	mach_vm_offset_t       *start,
16320 	mach_vm_offset_t       *end,
16321 	vm_map_size_t          *size)
16322 {
16323 	return vm_sanitize_addr_end(start_u, end_u,
16324 	           VM_SANITIZE_CALLER_VM_MAP_MACHINE_ATTRIBUTE, map,
16325 	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
16326 	           size);
16327 }
16328 
16329 
16330 /*
16331  *	Routine:	vm_map_machine_attribute
16332  *	Purpose:
16333  *		Provide machine-specific attributes to mappings,
16334  *		such as cachability etc. for machines that provide
16335  *		them.  NUMA architectures and machines with big/strange
16336  *		caches will use this.
16337  *	Note:
16338  *		Responsibilities for locking and checking are handled here,
16339  *		everything else in the pmap module. If any non-volatile
16340  *		information must be kept, the pmap module should handle
16341  *		it itself. [This assumes that attributes do not
16342  *		need to be inherited, which seems ok to me]
16343  */
16344 kern_return_t
vm_map_machine_attribute(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_machine_attribute_t attribute,vm_machine_attribute_val_t * value)16345 vm_map_machine_attribute(
16346 	vm_map_t                map,
16347 	vm_map_offset_ut        start_u,
16348 	vm_map_offset_ut        end_u,
16349 	vm_machine_attribute_t  attribute,
16350 	vm_machine_attribute_val_t *value) /* IN/OUT */
16351 {
16352 	mach_vm_offset_t start, end;
16353 	vm_map_size_t    sync_size;
16354 	kern_return_t    ret;
16355 	vm_map_entry_t   entry;
16356 
16357 	ret = vm_map_machine_attribute_sanitize(map,
16358 	    start_u,
16359 	    end_u,
16360 	    &start,
16361 	    &end,
16362 	    &sync_size);
16363 	if (__improbable(ret != KERN_SUCCESS)) {
16364 		return vm_sanitize_get_kr(ret);
16365 	}
16366 
16367 	if (start < vm_map_min(map) || end > vm_map_max(map)) {
16368 		return KERN_INVALID_ADDRESS;
16369 	}
16370 
16371 	vm_map_lock(map);
16372 
16373 	if (attribute != MATTR_CACHE) {
16374 		/* If we don't have to find physical addresses, we */
16375 		/* don't have to do an explicit traversal here.    */
16376 		ret = pmap_attribute(map->pmap, start, end - start,
16377 		    attribute, value);
16378 		vm_map_unlock(map);
16379 		return ret;
16380 	}
16381 
16382 	ret = KERN_SUCCESS;                                                                             /* Assume it all worked */
16383 
16384 	while (sync_size) {
16385 		if (vm_map_lookup_entry(map, start, &entry)) {
16386 			vm_map_size_t   sub_size;
16387 			if ((entry->vme_end - start) > sync_size) {
16388 				sub_size = sync_size;
16389 				sync_size = 0;
16390 			} else {
16391 				sub_size = entry->vme_end - start;
16392 				sync_size -= sub_size;
16393 			}
16394 			if (entry->is_sub_map) {
16395 				vm_map_offset_t sub_start;
16396 				vm_map_offset_t sub_end;
16397 
16398 				sub_start = (start - entry->vme_start)
16399 				    + VME_OFFSET(entry);
16400 				sub_end = sub_start + sub_size;
16401 				vm_map_machine_attribute(
16402 					VME_SUBMAP(entry),
16403 					sub_start,
16404 					sub_end,
16405 					attribute, value);
16406 			} else if (VME_OBJECT(entry)) {
16407 				vm_page_t               m;
16408 				vm_object_t             object;
16409 				vm_object_t             base_object;
16410 				vm_object_t             last_object;
16411 				vm_object_offset_t      offset;
16412 				vm_object_offset_t      base_offset;
16413 				vm_map_size_t           range;
16414 				range = sub_size;
16415 				offset = (start - entry->vme_start)
16416 				    + VME_OFFSET(entry);
16417 				offset = vm_object_trunc_page(offset);
16418 				base_offset = offset;
16419 				object = VME_OBJECT(entry);
16420 				base_object = object;
16421 				last_object = NULL;
16422 
16423 				vm_object_lock(object);
16424 
16425 				while (range) {
16426 					m = vm_page_lookup(
16427 						object, offset);
16428 
16429 					if (m && !vm_page_is_fictitious(m)) {
16430 						ret =
16431 						    pmap_attribute_cache_sync(
16432 							VM_PAGE_GET_PHYS_PAGE(m),
16433 							PAGE_SIZE,
16434 							attribute, value);
16435 					} else if (object->shadow) {
16436 						offset = offset + object->vo_shadow_offset;
16437 						last_object = object;
16438 						object = object->shadow;
16439 						vm_object_lock(last_object->shadow);
16440 						vm_object_unlock(last_object);
16441 						continue;
16442 					}
16443 					if (range < PAGE_SIZE) {
16444 						range = 0;
16445 					} else {
16446 						range -= PAGE_SIZE;
16447 					}
16448 
16449 					if (base_object != object) {
16450 						vm_object_unlock(object);
16451 						vm_object_lock(base_object);
16452 						object = base_object;
16453 					}
16454 					/* Bump to the next page */
16455 					base_offset += PAGE_SIZE;
16456 					offset = base_offset;
16457 				}
16458 				vm_object_unlock(object);
16459 			}
16460 			start += sub_size;
16461 		} else {
16462 			vm_map_unlock(map);
16463 			return KERN_FAILURE;
16464 		}
16465 	}
16466 
16467 	vm_map_unlock(map);
16468 
16469 	return ret;
16470 }
16471 
16472 /*
16473  *	vm_map_behavior_set:
16474  *
16475  *	Sets the paging reference behavior of the specified address
16476  *	range in the target map.  Paging reference behavior affects
16477  *	how pagein operations resulting from faults on the map will be
16478  *	clustered.
16479  */
16480 kern_return_t
vm_map_behavior_set(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_behavior_t new_behavior)16481 vm_map_behavior_set(
16482 	vm_map_t        map,
16483 	vm_map_offset_t start,
16484 	vm_map_offset_t end,
16485 	vm_behavior_t   new_behavior)
16486 {
16487 	vm_map_entry_t  entry;
16488 	vm_map_entry_t  temp_entry;
16489 
16490 	if (start > end ||
16491 	    start < vm_map_min(map) ||
16492 	    end > vm_map_max(map)) {
16493 		return KERN_NO_SPACE;
16494 	}
16495 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16496 		return KERN_INVALID_ADDRESS;
16497 	}
16498 
16499 	switch (new_behavior) {
16500 	/*
16501 	 * This first block of behaviors all set a persistent state on the specified
16502 	 * memory range.  All we have to do here is to record the desired behavior
16503 	 * in the vm_map_entry_t's.
16504 	 */
16505 
16506 	case VM_BEHAVIOR_DEFAULT:
16507 	case VM_BEHAVIOR_RANDOM:
16508 	case VM_BEHAVIOR_SEQUENTIAL:
16509 	case VM_BEHAVIOR_RSEQNTL:
16510 	case VM_BEHAVIOR_ZERO_WIRED_PAGES:
16511 		vm_map_lock(map);
16512 
16513 		/*
16514 		 *	The entire address range must be valid for the map.
16515 		 *      Note that vm_map_range_check() does a
16516 		 *	vm_map_lookup_entry() internally and returns the
16517 		 *	entry containing the start of the address range if
16518 		 *	the entire range is valid.
16519 		 */
16520 		if (vm_map_range_check(map, start, end, &temp_entry)) {
16521 			entry = temp_entry;
16522 			vm_map_clip_start(map, entry, start);
16523 		} else {
16524 			vm_map_unlock(map);
16525 			return KERN_INVALID_ADDRESS;
16526 		}
16527 
16528 		if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16529 			/* zeroing requires write access */
16530 			temp_entry = entry;
16531 			for (;
16532 			    entry != vm_map_to_entry(map) && (entry->vme_start < end);
16533 			    entry = entry->vme_next) {
16534 				if (!(entry->protection & VM_PROT_WRITE) ||
16535 #if __arm64e__
16536 				    entry->used_for_tpro ||
16537 #endif /* __arm64e__ */
16538 				    entry->used_for_jit) {
16539 					vm_map_unlock(map);
16540 					return KERN_PROTECTION_FAILURE;
16541 				}
16542 			}
16543 			entry = temp_entry;
16544 		}
16545 
16546 		while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
16547 			vm_map_clip_end(map, entry, end);
16548 			if (entry->is_sub_map) {
16549 				assert(!entry->use_pmap);
16550 			}
16551 
16552 			if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16553 				assert(entry->protection & VM_PROT_WRITE);
16554 #if __arm64e__
16555 				assert(!entry->used_for_tpro);
16556 #endif /* __arm64e__ */
16557 				assert(!entry->used_for_jit);
16558 				entry->zero_wired_pages = TRUE;
16559 			} else {
16560 				entry->behavior = new_behavior;
16561 			}
16562 			entry = entry->vme_next;
16563 		}
16564 
16565 		vm_map_unlock(map);
16566 		break;
16567 
16568 	/*
16569 	 * The rest of these are different from the above in that they cause
16570 	 * an immediate action to take place as opposed to setting a behavior that
16571 	 * affects future actions.
16572 	 */
16573 
16574 	case VM_BEHAVIOR_WILLNEED:
16575 		return vm_map_willneed(map, start, end);
16576 
16577 	case VM_BEHAVIOR_DONTNEED:
16578 		return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16579 
16580 	case VM_BEHAVIOR_FREE:
16581 		return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16582 
16583 	case VM_BEHAVIOR_REUSABLE:
16584 		return vm_map_reusable_pages(map, start, end);
16585 
16586 	case VM_BEHAVIOR_REUSE:
16587 		return vm_map_reuse_pages(map, start, end);
16588 
16589 	case VM_BEHAVIOR_CAN_REUSE:
16590 		return vm_map_can_reuse(map, start, end);
16591 
16592 #if MACH_ASSERT
16593 	case VM_BEHAVIOR_PAGEOUT:
16594 		return vm_map_pageout(map, start, end);
16595 #endif /* MACH_ASSERT */
16596 
16597 	case VM_BEHAVIOR_ZERO:
16598 		return vm_map_zero(map, start, end);
16599 
16600 	default:
16601 		return KERN_INVALID_ARGUMENT;
16602 	}
16603 
16604 	return KERN_SUCCESS;
16605 }
16606 
16607 
16608 /*
16609  * Internals for madvise(MADV_WILLNEED) system call.
16610  *
16611  * The implementation is to do:-
16612  * a) read-ahead if the mapping corresponds to a mapped regular file
16613  * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping
16614  */
16615 static kern_return_t
vm_map_willneed(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16616 vm_map_willneed(
16617 	vm_map_t        map,
16618 	vm_map_offset_t start,
16619 	vm_map_offset_t end
16620 	)
16621 {
16622 	vm_map_entry_t entry;
16623 	kern_return_t kr;
16624 	vm_object_size_t len;
16625 	vm_size_t region_size;
16626 
16627 	KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_START,
16628 	    start, end);
16629 	struct vm_object_fault_info fault_info = {
16630 		.interruptible = THREAD_UNINT,
16631 		.behavior = VM_BEHAVIOR_SEQUENTIAL,
16632 		/* Do not activate pages after faulting */
16633 		.stealth = true,
16634 		/* Don't wait for busy pages */
16635 		.fi_no_sleep = true,
16636 	};
16637 
16638 	/*
16639 	 * The MADV_WILLNEED operation doesn't require any changes to the
16640 	 * vm_map_entry_t's, so the read lock is sufficient.
16641 	 */
16642 
16643 	vm_map_lock_read(map);
16644 
16645 	/*
16646 	 * The madvise semantics require that the address range be fully
16647 	 * allocated with no holes.  Otherwise, we're required to return
16648 	 * an error.
16649 	 */
16650 
16651 	if (!vm_map_range_check(map, start, end, &entry)) {
16652 		vm_map_unlock_read(map);
16653 		kr = KERN_INVALID_ADDRESS;
16654 		goto done;
16655 	}
16656 
16657 	/*
16658 	 * Examine each vm_map_entry_t in the range.
16659 	 */
16660 	while (start < end) {
16661 		/*
16662 		 * Set the length so we don't go beyond the end of the
16663 		 * map_entry or beyond the end of the range we were given.
16664 		 * This range could span also multiple map entries all of which
16665 		 * map different files, so make sure we only do the right amount
16666 		 * of I/O for each object.  Note that it's possible for there
16667 		 * to be multiple map entries all referring to the same object
16668 		 * but with different page permissions, but it's not worth
16669 		 * trying to optimize that case.
16670 		 */
16671 		len = MIN(entry->vme_end - start, end - start);
16672 
16673 		vm_map_offset_t addr = start;
16674 
16675 		vm_size_t effective_page_mask = MIN(vm_map_page_mask(map), PAGE_MASK);
16676 		vm_map_offset_t effective_page_size = effective_page_mask + 1;
16677 
16678 		/*
16679 		 * Write-fault if the entry supports it to preclude subsequent soft-faults
16680 		 */
16681 		vm_prot_t fault_prot = entry->protection & VM_PROT_WRITE ?
16682 		    VM_PROT_WRITE : VM_PROT_READ;
16683 
16684 		vm_map_unlock_read(map);
16685 
16686 		region_size = len;
16687 		while (region_size) {
16688 			/*
16689 			 * Provide a hint for how much clustering we would like. Note that
16690 			 * each individual fault will limit the size of each request to
16691 			 * MAX_UPL_TRANSFER_BYTES.
16692 			 */
16693 			fault_info.cluster_size = region_size;
16694 			kr = vm_pre_fault_with_info(
16695 				map,
16696 				vm_map_trunc_page(addr, effective_page_mask),
16697 				fault_prot,
16698 				&fault_info);
16699 			if (kr == KERN_ALREADY_WAITING) {
16700 				/*
16701 				 * The page is busy being faulted/paged by another thread.
16702 				 */
16703 				KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_NONE,
16704 				    task_pid(current_task()), addr, kr);
16705 				kr = KERN_SUCCESS;
16706 			} else if (kr != KERN_SUCCESS) {
16707 				goto done;
16708 			}
16709 			region_size -= effective_page_size;
16710 			addr += effective_page_size;
16711 		}
16712 
16713 		start += len;
16714 		if (start >= end) {
16715 			kr = KERN_SUCCESS;
16716 			goto done;
16717 		}
16718 
16719 		if (thread_should_abort(current_thread())) {
16720 			kr = KERN_ABORTED;
16721 			goto done;
16722 		}
16723 
16724 		/* look up next entry */
16725 		vm_map_lock_read(map);
16726 		if (!vm_map_lookup_entry(map, start, &entry)) {
16727 			/*
16728 			 * There's a new hole in the address range.
16729 			 */
16730 			vm_map_unlock_read(map);
16731 			kr = KERN_INVALID_ADDRESS;
16732 			goto done;
16733 		}
16734 	}
16735 
16736 	vm_map_unlock_read(map);
16737 done:
16738 	KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16739 	    start, kr);
16740 	return kr;
16741 }
16742 
16743 static boolean_t
vm_map_entry_is_reusable(vm_map_entry_t entry)16744 vm_map_entry_is_reusable(
16745 	vm_map_entry_t entry)
16746 {
16747 	/* Only user map entries */
16748 
16749 	vm_object_t object;
16750 
16751 	if (entry->is_sub_map) {
16752 		return FALSE;
16753 	}
16754 
16755 	switch (VME_ALIAS(entry)) {
16756 	case VM_MEMORY_MALLOC:
16757 	case VM_MEMORY_MALLOC_SMALL:
16758 	case VM_MEMORY_MALLOC_LARGE:
16759 	case VM_MEMORY_REALLOC:
16760 	case VM_MEMORY_MALLOC_TINY:
16761 	case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16762 	case VM_MEMORY_MALLOC_LARGE_REUSED:
16763 		/*
16764 		 * This is a malloc() memory region: check if it's still
16765 		 * in its original state and can be re-used for more
16766 		 * malloc() allocations.
16767 		 */
16768 		break;
16769 	default:
16770 		/*
16771 		 * Not a malloc() memory region: let the caller decide if
16772 		 * it's re-usable.
16773 		 */
16774 		return TRUE;
16775 	}
16776 
16777 	if (/*entry->is_shared ||*/
16778 		entry->is_sub_map ||
16779 		entry->in_transition ||
16780 		entry->protection != VM_PROT_DEFAULT ||
16781 		entry->max_protection != VM_PROT_ALL ||
16782 		entry->inheritance != VM_INHERIT_DEFAULT ||
16783 		entry->no_cache ||
16784 		entry->vme_permanent ||
16785 		entry->superpage_size != FALSE ||
16786 		entry->zero_wired_pages ||
16787 		entry->wired_count != 0 ||
16788 		entry->user_wired_count != 0) {
16789 		return FALSE;
16790 	}
16791 
16792 	object = VME_OBJECT(entry);
16793 	if (object == VM_OBJECT_NULL) {
16794 		return TRUE;
16795 	}
16796 	if (
16797 #if 0
16798 		/*
16799 		 * Let's proceed even if the VM object is potentially
16800 		 * shared.
16801 		 * We check for this later when processing the actual
16802 		 * VM pages, so the contents will be safe if shared.
16803 		 *
16804 		 * But we can still mark this memory region as "reusable" to
16805 		 * acknowledge that the caller did let us know that the memory
16806 		 * could be re-used and should not be penalized for holding
16807 		 * on to it.  This allows its "resident size" to not include
16808 		 * the reusable range.
16809 		 */
16810 		object->ref_count == 1 &&
16811 #endif
16812 		object->vo_copy == VM_OBJECT_NULL &&
16813 		object->shadow == VM_OBJECT_NULL &&
16814 		object->internal &&
16815 		object->purgable == VM_PURGABLE_DENY &&
16816 		HAS_DEFAULT_CACHEABILITY(object->wimg_bits & VM_WIMG_MASK) &&
16817 		!object->code_signed) {
16818 		return TRUE;
16819 	}
16820 	return FALSE;
16821 }
16822 
16823 static kern_return_t
vm_map_reuse_pages(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16824 vm_map_reuse_pages(
16825 	vm_map_t        map,
16826 	vm_map_offset_t start,
16827 	vm_map_offset_t end)
16828 {
16829 	vm_map_entry_t                  entry;
16830 	vm_object_t                     object;
16831 	vm_object_offset_t              start_offset, end_offset;
16832 
16833 	/*
16834 	 * The MADV_REUSE operation doesn't require any changes to the
16835 	 * vm_map_entry_t's, so the read lock is sufficient.
16836 	 */
16837 
16838 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16839 		/*
16840 		 * XXX TODO4K
16841 		 * need to figure out what reusable means for a
16842 		 * portion of a native page.
16843 		 */
16844 		return KERN_SUCCESS;
16845 	}
16846 
16847 	vm_map_lock_read(map);
16848 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16849 
16850 	/*
16851 	 * The madvise semantics require that the address range be fully
16852 	 * allocated with no holes.  Otherwise, we're required to return
16853 	 * an error.
16854 	 */
16855 
16856 	if (!vm_map_range_check(map, start, end, &entry)) {
16857 		vm_map_unlock_read(map);
16858 		vm_page_stats_reusable.reuse_pages_failure++;
16859 		return KERN_INVALID_ADDRESS;
16860 	}
16861 
16862 	/*
16863 	 * Examine each vm_map_entry_t in the range.
16864 	 */
16865 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16866 	    entry = entry->vme_next) {
16867 		/*
16868 		 * Sanity check on the VM map entry.
16869 		 */
16870 		if (!vm_map_entry_is_reusable(entry)) {
16871 			vm_map_unlock_read(map);
16872 			vm_page_stats_reusable.reuse_pages_failure++;
16873 			return KERN_INVALID_ADDRESS;
16874 		}
16875 
16876 		/*
16877 		 * The first time through, the start address could be anywhere
16878 		 * within the vm_map_entry we found.  So adjust the offset to
16879 		 * correspond.
16880 		 */
16881 		if (entry->vme_start < start) {
16882 			start_offset = start - entry->vme_start;
16883 		} else {
16884 			start_offset = 0;
16885 		}
16886 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16887 		start_offset += VME_OFFSET(entry);
16888 		end_offset += VME_OFFSET(entry);
16889 
16890 		object = VME_OBJECT(entry);
16891 		if (object != VM_OBJECT_NULL) {
16892 			vm_object_lock(object);
16893 			vm_object_reuse_pages(object, start_offset, end_offset,
16894 			    TRUE);
16895 			vm_object_unlock(object);
16896 		}
16897 
16898 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
16899 			/*
16900 			 * XXX
16901 			 * We do not hold the VM map exclusively here.
16902 			 * The "alias" field is not that critical, so it's
16903 			 * safe to update it here, as long as it is the only
16904 			 * one that can be modified while holding the VM map
16905 			 * "shared".
16906 			 */
16907 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
16908 		}
16909 	}
16910 
16911 	vm_map_unlock_read(map);
16912 	vm_page_stats_reusable.reuse_pages_success++;
16913 	return KERN_SUCCESS;
16914 }
16915 
16916 
16917 static kern_return_t
vm_map_reusable_pages(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16918 vm_map_reusable_pages(
16919 	vm_map_t        map,
16920 	vm_map_offset_t start,
16921 	vm_map_offset_t end)
16922 {
16923 	vm_map_entry_t                  entry;
16924 	vm_object_t                     object;
16925 	vm_object_offset_t              start_offset, end_offset;
16926 	vm_map_offset_t                 pmap_offset;
16927 
16928 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16929 		/*
16930 		 * XXX TODO4K
16931 		 * need to figure out what reusable means for a portion
16932 		 * of a native page.
16933 		 */
16934 		return KERN_SUCCESS;
16935 	}
16936 
16937 	/*
16938 	 * The MADV_REUSABLE operation doesn't require any changes to the
16939 	 * vm_map_entry_t's, so the read lock is sufficient.
16940 	 */
16941 
16942 	vm_map_lock_read(map);
16943 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16944 
16945 	/*
16946 	 * The madvise semantics require that the address range be fully
16947 	 * allocated with no holes.  Otherwise, we're required to return
16948 	 * an error.
16949 	 */
16950 
16951 	if (!vm_map_range_check(map, start, end, &entry)) {
16952 		vm_map_unlock_read(map);
16953 		vm_page_stats_reusable.reusable_pages_failure++;
16954 		return KERN_INVALID_ADDRESS;
16955 	}
16956 
16957 	/*
16958 	 * Examine each vm_map_entry_t in the range.
16959 	 */
16960 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16961 	    entry = entry->vme_next) {
16962 		int kill_pages = 0;
16963 		boolean_t kill_no_write = FALSE;
16964 
16965 		/*
16966 		 * Sanity check on the VM map entry.
16967 		 */
16968 		if (!vm_map_entry_is_reusable(entry)) {
16969 			vm_map_unlock_read(map);
16970 			vm_page_stats_reusable.reusable_pages_failure++;
16971 			return KERN_INVALID_ADDRESS;
16972 		}
16973 
16974 		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
16975 #if __arm64e__
16976 		    && !entry->used_for_tpro
16977 #endif
16978 		    ) {
16979 			/* not writable: can't discard contents */
16980 			vm_map_unlock_read(map);
16981 			vm_page_stats_reusable.reusable_nonwritable++;
16982 			vm_page_stats_reusable.reusable_pages_failure++;
16983 			return KERN_PROTECTION_FAILURE;
16984 		}
16985 
16986 		/*
16987 		 * The first time through, the start address could be anywhere
16988 		 * within the vm_map_entry we found.  So adjust the offset to
16989 		 * correspond.
16990 		 */
16991 		if (entry->vme_start < start) {
16992 			start_offset = start - entry->vme_start;
16993 			pmap_offset = start;
16994 		} else {
16995 			start_offset = 0;
16996 			pmap_offset = entry->vme_start;
16997 		}
16998 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16999 		start_offset += VME_OFFSET(entry);
17000 		end_offset += VME_OFFSET(entry);
17001 
17002 		object = VME_OBJECT(entry);
17003 		if (object == VM_OBJECT_NULL) {
17004 			continue;
17005 		}
17006 
17007 		if ((entry->protection & VM_PROT_EXECUTE) ||
17008 		    entry->vme_xnu_user_debug) {
17009 			/*
17010 			 * Executable or user debug pages might be write-protected by
17011 			 * hardware, so do not attempt to write to these pages.
17012 			 */
17013 			kill_no_write = TRUE;
17014 		}
17015 
17016 		vm_object_lock(object);
17017 		if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
17018 		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
17019 		    object->vo_copy == VM_OBJECT_NULL)) &&
17020 		    object->shadow == VM_OBJECT_NULL &&
17021 		    /*
17022 		     * "iokit_acct" entries are billed for their virtual size
17023 		     * (rather than for their resident pages only), so they
17024 		     * wouldn't benefit from making pages reusable, and it
17025 		     * would be hard to keep track of pages that are both
17026 		     * "iokit_acct" and "reusable" in the pmap stats and
17027 		     * ledgers.
17028 		     */
17029 		    !(entry->iokit_acct ||
17030 		    (!entry->is_sub_map && !entry->use_pmap))) {
17031 			if (os_ref_get_count_raw(&object->ref_count) != 1) {
17032 				vm_page_stats_reusable.reusable_shared++;
17033 			}
17034 			kill_pages = 1;
17035 		} else {
17036 			kill_pages = -1;
17037 		}
17038 		if (kill_pages != -1) {
17039 			vm_object_deactivate_pages(object,
17040 			    start_offset,
17041 			    end_offset - start_offset,
17042 			    kill_pages,
17043 			    TRUE /*reusable_pages*/,
17044 			    kill_no_write,
17045 			    map->pmap,
17046 			    pmap_offset);
17047 		} else {
17048 			vm_page_stats_reusable.reusable_pages_shared++;
17049 			DTRACE_VM4(vm_map_reusable_pages_shared,
17050 			    unsigned int, VME_ALIAS(entry),
17051 			    vm_map_t, map,
17052 			    vm_map_entry_t, entry,
17053 			    vm_object_t, object);
17054 		}
17055 		vm_object_unlock(object);
17056 
17057 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
17058 		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
17059 			/*
17060 			 * XXX
17061 			 * We do not hold the VM map exclusively here.
17062 			 * The "alias" field is not that critical, so it's
17063 			 * safe to update it here, as long as it is the only
17064 			 * one that can be modified while holding the VM map
17065 			 * "shared".
17066 			 */
17067 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
17068 		}
17069 	}
17070 
17071 	vm_map_unlock_read(map);
17072 	vm_page_stats_reusable.reusable_pages_success++;
17073 	return KERN_SUCCESS;
17074 }
17075 
17076 
17077 static kern_return_t
vm_map_can_reuse(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)17078 vm_map_can_reuse(
17079 	vm_map_t        map,
17080 	vm_map_offset_t start,
17081 	vm_map_offset_t end)
17082 {
17083 	vm_map_entry_t                  entry;
17084 
17085 	/*
17086 	 * The MADV_REUSABLE operation doesn't require any changes to the
17087 	 * vm_map_entry_t's, so the read lock is sufficient.
17088 	 */
17089 
17090 	vm_map_lock_read(map);
17091 	assert(map->pmap != kernel_pmap);       /* protect alias access */
17092 
17093 	/*
17094 	 * The madvise semantics require that the address range be fully
17095 	 * allocated with no holes.  Otherwise, we're required to return
17096 	 * an error.
17097 	 */
17098 
17099 	if (!vm_map_range_check(map, start, end, &entry)) {
17100 		vm_map_unlock_read(map);
17101 		vm_page_stats_reusable.can_reuse_failure++;
17102 		return KERN_INVALID_ADDRESS;
17103 	}
17104 
17105 	/*
17106 	 * Examine each vm_map_entry_t in the range.
17107 	 */
17108 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17109 	    entry = entry->vme_next) {
17110 		/*
17111 		 * Sanity check on the VM map entry.
17112 		 */
17113 		if (!vm_map_entry_is_reusable(entry)) {
17114 			vm_map_unlock_read(map);
17115 			vm_page_stats_reusable.can_reuse_failure++;
17116 			return KERN_INVALID_ADDRESS;
17117 		}
17118 	}
17119 
17120 	vm_map_unlock_read(map);
17121 	vm_page_stats_reusable.can_reuse_success++;
17122 	return KERN_SUCCESS;
17123 }
17124 
17125 
17126 #if MACH_ASSERT
17127 static kern_return_t
vm_map_pageout(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)17128 vm_map_pageout(
17129 	vm_map_t        map,
17130 	vm_map_offset_t start,
17131 	vm_map_offset_t end)
17132 {
17133 	vm_map_entry_t                  entry;
17134 
17135 	/*
17136 	 * The MADV_PAGEOUT operation doesn't require any changes to the
17137 	 * vm_map_entry_t's, so the read lock is sufficient.
17138 	 */
17139 
17140 	vm_map_lock_read(map);
17141 
17142 	/*
17143 	 * The madvise semantics require that the address range be fully
17144 	 * allocated with no holes.  Otherwise, we're required to return
17145 	 * an error.
17146 	 */
17147 
17148 	if (!vm_map_range_check(map, start, end, &entry)) {
17149 		vm_map_unlock_read(map);
17150 		return KERN_INVALID_ADDRESS;
17151 	}
17152 
17153 	/*
17154 	 * Examine each vm_map_entry_t in the range.
17155 	 */
17156 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17157 	    entry = entry->vme_next) {
17158 		vm_object_t     object;
17159 
17160 		/*
17161 		 * Sanity check on the VM map entry.
17162 		 */
17163 		if (entry->is_sub_map) {
17164 			vm_map_t submap;
17165 			vm_map_offset_t submap_start;
17166 			vm_map_offset_t submap_end;
17167 			vm_map_entry_t submap_entry;
17168 
17169 			submap = VME_SUBMAP(entry);
17170 			submap_start = VME_OFFSET(entry);
17171 			submap_end = submap_start + (entry->vme_end -
17172 			    entry->vme_start);
17173 
17174 			vm_map_lock_read(submap);
17175 
17176 			if (!vm_map_range_check(submap,
17177 			    submap_start,
17178 			    submap_end,
17179 			    &submap_entry)) {
17180 				vm_map_unlock_read(submap);
17181 				vm_map_unlock_read(map);
17182 				return KERN_INVALID_ADDRESS;
17183 			}
17184 
17185 			if (submap_entry->is_sub_map) {
17186 				vm_map_unlock_read(submap);
17187 				continue;
17188 			}
17189 
17190 			object = VME_OBJECT(submap_entry);
17191 			if (object == VM_OBJECT_NULL || !object->internal) {
17192 				vm_map_unlock_read(submap);
17193 				continue;
17194 			}
17195 
17196 			vm_object_pageout(object);
17197 
17198 			vm_map_unlock_read(submap);
17199 			submap = VM_MAP_NULL;
17200 			submap_entry = VM_MAP_ENTRY_NULL;
17201 			continue;
17202 		}
17203 
17204 		object = VME_OBJECT(entry);
17205 		if (object == VM_OBJECT_NULL || !object->internal) {
17206 			continue;
17207 		}
17208 
17209 		vm_object_pageout(object);
17210 	}
17211 
17212 	vm_map_unlock_read(map);
17213 	return KERN_SUCCESS;
17214 }
17215 #endif /* MACH_ASSERT */
17216 
17217 /*
17218  * This function determines if the zero operation can be run on the
17219  * respective entry. Additional checks on the object are in
17220  * vm_object_zero_preflight.
17221  */
17222 static kern_return_t
vm_map_zero_entry_preflight(vm_map_entry_t entry)17223 vm_map_zero_entry_preflight(vm_map_entry_t entry)
17224 {
17225 	/*
17226 	 * Zeroing is restricted to writable non-executable entries and non-JIT
17227 	 * regions.
17228 	 */
17229 	if (!(entry->protection & VM_PROT_WRITE) ||
17230 	    (entry->protection & VM_PROT_EXECUTE) ||
17231 	    entry->used_for_jit ||
17232 	    entry->vme_xnu_user_debug) {
17233 		return KERN_PROTECTION_FAILURE;
17234 	}
17235 
17236 	/*
17237 	 * Zeroing for copy on write isn't yet supported. Zeroing is also not
17238 	 * allowed for submaps.
17239 	 */
17240 	if (entry->needs_copy || entry->is_sub_map) {
17241 		return KERN_NO_ACCESS;
17242 	}
17243 
17244 	return KERN_SUCCESS;
17245 }
17246 
17247 /*
17248  * This function translates entry's start and end to offsets in the object
17249  */
17250 static void
vm_map_get_bounds_in_object(vm_map_entry_t entry,vm_map_offset_t start,vm_map_offset_t end,vm_map_offset_t * start_offset,vm_map_offset_t * end_offset)17251 vm_map_get_bounds_in_object(
17252 	vm_map_entry_t      entry,
17253 	vm_map_offset_t     start,
17254 	vm_map_offset_t     end,
17255 	vm_map_offset_t    *start_offset,
17256 	vm_map_offset_t    *end_offset)
17257 {
17258 	if (entry->vme_start < start) {
17259 		*start_offset = start - entry->vme_start;
17260 	} else {
17261 		*start_offset = 0;
17262 	}
17263 	*end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17264 	*start_offset += VME_OFFSET(entry);
17265 	*end_offset += VME_OFFSET(entry);
17266 }
17267 
17268 /*
17269  * This function iterates through the entries in the requested range
17270  * and zeroes any resident pages in the corresponding objects. Compressed
17271  * pages are dropped instead of being faulted in and zeroed.
17272  */
17273 static kern_return_t
vm_map_zero(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)17274 vm_map_zero(
17275 	vm_map_t        map,
17276 	vm_map_offset_t start,
17277 	vm_map_offset_t end)
17278 {
17279 	vm_map_entry_t                  entry;
17280 	vm_map_offset_t                 cur = start;
17281 	kern_return_t                   ret;
17282 
17283 	/*
17284 	 * This operation isn't supported where the map page size is less than
17285 	 * the hardware page size. Caller will need to handle error and
17286 	 * explicitly zero memory if needed.
17287 	 */
17288 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17289 		return KERN_NO_ACCESS;
17290 	}
17291 
17292 	/*
17293 	 * The MADV_ZERO operation doesn't require any changes to the
17294 	 * vm_map_entry_t's, so the read lock is sufficient.
17295 	 */
17296 	vm_map_lock_read(map);
17297 	assert(map->pmap != kernel_pmap);       /* protect alias access */
17298 
17299 	/*
17300 	 * The madvise semantics require that the address range be fully
17301 	 * allocated with no holes. Otherwise, we're required to return
17302 	 * an error. This check needs to be redone if the map has changed.
17303 	 */
17304 	if (!vm_map_range_check(map, cur, end, &entry)) {
17305 		vm_map_unlock_read(map);
17306 		return KERN_INVALID_ADDRESS;
17307 	}
17308 
17309 	/*
17310 	 * Examine each vm_map_entry_t in the range.
17311 	 */
17312 	while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
17313 		vm_map_offset_t cur_offset;
17314 		vm_map_offset_t end_offset;
17315 		unsigned int last_timestamp = map->timestamp;
17316 		vm_object_t object = VME_OBJECT(entry);
17317 
17318 		ret = vm_map_zero_entry_preflight(entry);
17319 		if (ret != KERN_SUCCESS) {
17320 			vm_map_unlock_read(map);
17321 			return ret;
17322 		}
17323 
17324 		if (object == VM_OBJECT_NULL) {
17325 			entry = entry->vme_next;
17326 			continue;
17327 		}
17328 
17329 		vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset);
17330 		vm_object_lock(object);
17331 		/*
17332 		 * Take a reference on the object as vm_object_zero will drop the object
17333 		 * lock when it encounters a busy page.
17334 		 */
17335 		vm_object_reference_locked(object);
17336 		vm_map_unlock_read(map);
17337 
17338 		ret = vm_object_zero(object, cur_offset, end_offset);
17339 		vm_object_unlock(object);
17340 		vm_object_deallocate(object);
17341 		if (ret != KERN_SUCCESS) {
17342 			return ret;
17343 		}
17344 		/*
17345 		 * Update cur as vm_object_zero has succeeded.
17346 		 */
17347 		cur += (end_offset - cur_offset);
17348 		if (cur == end) {
17349 			return KERN_SUCCESS;
17350 		}
17351 
17352 		/*
17353 		 * If the map timestamp has changed, restart by relooking up cur in the
17354 		 * map
17355 		 */
17356 		vm_map_lock_read(map);
17357 		if (last_timestamp != map->timestamp) {
17358 			/*
17359 			 * Relookup cur in the map
17360 			 */
17361 			if (!vm_map_range_check(map, cur, end, &entry)) {
17362 				vm_map_unlock_read(map);
17363 				return KERN_INVALID_ADDRESS;
17364 			}
17365 			continue;
17366 		}
17367 		/*
17368 		 * If the map hasn't changed proceed with the next entry
17369 		 */
17370 		entry = entry->vme_next;
17371 	}
17372 
17373 	vm_map_unlock_read(map);
17374 	return KERN_SUCCESS;
17375 }
17376 
17377 
17378 /*
17379  *	Routine:	vm_map_entry_insert
17380  *
17381  *	Description:	This routine inserts a new vm_entry in a locked map.
17382  */
17383 static vm_map_entry_t
vm_map_entry_insert(vm_map_t map,vm_map_entry_t insp_entry,vm_map_offset_t start,vm_map_offset_t end,vm_object_t object,vm_object_offset_t offset,vm_map_kernel_flags_t vmk_flags,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,boolean_t clear_map_aligned)17384 vm_map_entry_insert(
17385 	vm_map_t                map,
17386 	vm_map_entry_t          insp_entry,
17387 	vm_map_offset_t         start,
17388 	vm_map_offset_t         end,
17389 	vm_object_t             object,
17390 	vm_object_offset_t      offset,
17391 	vm_map_kernel_flags_t   vmk_flags,
17392 	boolean_t               needs_copy,
17393 	vm_prot_t               cur_protection,
17394 	vm_prot_t               max_protection,
17395 	vm_inherit_t            inheritance,
17396 	boolean_t               clear_map_aligned)
17397 {
17398 	vm_map_entry_t  new_entry;
17399 	boolean_t map_aligned = FALSE;
17400 
17401 	assert(insp_entry != (vm_map_entry_t)0);
17402 	vm_map_lock_assert_exclusive(map);
17403 
17404 	__assert_only vm_object_offset_t      end_offset = 0;
17405 	assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
17406 
17407 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
17408 		map_aligned = TRUE;
17409 	}
17410 	if (clear_map_aligned &&
17411 	    (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
17412 	    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
17413 		map_aligned = FALSE;
17414 	}
17415 	if (map_aligned) {
17416 		assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
17417 		assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
17418 	} else {
17419 		assert(page_aligned(start));
17420 		assert(page_aligned(end));
17421 	}
17422 	assert(start < end);
17423 
17424 	new_entry = vm_map_entry_create(map);
17425 
17426 	new_entry->vme_start = start;
17427 	new_entry->vme_end = end;
17428 
17429 	if (vmk_flags.vmkf_submap) {
17430 		new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
17431 		VME_SUBMAP_SET(new_entry, (vm_map_t)object);
17432 	} else {
17433 		VME_OBJECT_SET(new_entry, object, false, 0);
17434 	}
17435 	VME_OFFSET_SET(new_entry, offset);
17436 	VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
17437 
17438 	new_entry->map_aligned = map_aligned;
17439 	new_entry->needs_copy = needs_copy;
17440 	new_entry->inheritance = inheritance;
17441 	new_entry->protection = cur_protection;
17442 	new_entry->max_protection = max_protection;
17443 	/*
17444 	 * submap: "use_pmap" means "nested".
17445 	 * default: false.
17446 	 *
17447 	 * object: "use_pmap" means "use pmap accounting" for footprint.
17448 	 * default: true.
17449 	 */
17450 	new_entry->use_pmap = !vmk_flags.vmkf_submap;
17451 	new_entry->no_cache = vmk_flags.vmf_no_cache;
17452 	new_entry->vme_permanent = vmk_flags.vmf_permanent;
17453 	new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
17454 	new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
17455 	new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
17456 
17457 	if (vmk_flags.vmkf_map_jit) {
17458 		if (!(map->jit_entry_exists) ||
17459 		    VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
17460 			new_entry->used_for_jit = TRUE;
17461 			map->jit_entry_exists = TRUE;
17462 		}
17463 	}
17464 
17465 	/*
17466 	 *	Insert the new entry into the list.
17467 	 */
17468 
17469 	vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
17470 	map->size += end - start;
17471 
17472 	/*
17473 	 *	Update the free space hint and the lookup hint.
17474 	 */
17475 
17476 	SAVE_HINT_MAP_WRITE(map, new_entry);
17477 	return new_entry;
17478 }
17479 
17480 /*
17481  *	Routine:	vm_map_remap_extract
17482  *
17483  *	Description:	This routine returns a vm_entry list from a map.
17484  */
17485 static kern_return_t
vm_map_remap_extract(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size,boolean_t copy,vm_map_copy_t map_copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)17486 vm_map_remap_extract(
17487 	vm_map_t                map,
17488 	vm_map_offset_t         addr,
17489 	vm_map_size_t           size,
17490 	boolean_t               copy,
17491 	vm_map_copy_t           map_copy,
17492 	vm_prot_t               *cur_protection,   /* IN/OUT */
17493 	vm_prot_t               *max_protection,   /* IN/OUT */
17494 	/* What, no behavior? */
17495 	vm_inherit_t            inheritance,
17496 	vm_map_kernel_flags_t   vmk_flags)
17497 {
17498 	struct vm_map_header   *map_header = &map_copy->cpy_hdr;
17499 	kern_return_t           result;
17500 	vm_map_size_t           mapped_size;
17501 	vm_map_size_t           tmp_size;
17502 	vm_map_entry_t          src_entry;     /* result of last map lookup */
17503 	vm_map_entry_t          new_entry;
17504 	vm_object_offset_t      offset;
17505 	vm_map_offset_t         map_address;
17506 	vm_map_offset_t         src_start;     /* start of entry to map */
17507 	vm_map_offset_t         src_end;       /* end of region to be mapped */
17508 	vm_object_t             object;
17509 	vm_map_version_t        version;
17510 	boolean_t               src_needs_copy;
17511 	boolean_t               new_entry_needs_copy;
17512 	vm_map_entry_t          saved_src_entry;
17513 	boolean_t               src_entry_was_wired;
17514 	vm_prot_t               max_prot_for_prot_copy;
17515 	vm_map_offset_t         effective_page_mask;
17516 	bool                    pageable, same_map;
17517 	boolean_t               vm_remap_legacy;
17518 	vm_prot_t               required_cur_prot, required_max_prot;
17519 	vm_object_t             new_copy_object;     /* vm_object_copy_* result */
17520 	boolean_t               saved_used_for_jit;  /* Saved used_for_jit. */
17521 
17522 	pageable = vmk_flags.vmkf_copy_pageable;
17523 	same_map = vmk_flags.vmkf_copy_same_map;
17524 
17525 	effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17526 
17527 	assert(map != VM_MAP_NULL);
17528 	assert(size != 0);
17529 	assert(size == vm_map_round_page(size, effective_page_mask));
17530 	assert(inheritance == VM_INHERIT_NONE ||
17531 	    inheritance == VM_INHERIT_COPY ||
17532 	    inheritance == VM_INHERIT_SHARE);
17533 	assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17534 	assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17535 	assert((*cur_protection & *max_protection) == *cur_protection);
17536 
17537 	/*
17538 	 *	Compute start and end of region.
17539 	 */
17540 	src_start = vm_map_trunc_page(addr, effective_page_mask);
17541 	src_end = vm_map_round_page(src_start + size, effective_page_mask);
17542 
17543 	/*
17544 	 *	Initialize map_header.
17545 	 */
17546 	map_header->nentries = 0;
17547 	map_header->entries_pageable = pageable;
17548 //	map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17549 	map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17550 	map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17551 	vm_map_store_init(map_header);
17552 
17553 	if (copy && vmk_flags.vmkf_remap_prot_copy) {
17554 		/*
17555 		 * Special case for vm_map_protect(VM_PROT_COPY):
17556 		 * we want to set the new mappings' max protection to the
17557 		 * specified *max_protection...
17558 		 */
17559 		max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17560 		/* ... but we want to use the vm_remap() legacy mode */
17561 		vmk_flags.vmkf_remap_legacy_mode = true;
17562 		*max_protection = VM_PROT_NONE;
17563 		*cur_protection = VM_PROT_NONE;
17564 	} else {
17565 		max_prot_for_prot_copy = VM_PROT_NONE;
17566 	}
17567 
17568 	if (vmk_flags.vmkf_remap_legacy_mode) {
17569 		/*
17570 		 * vm_remap() legacy mode:
17571 		 * Extract all memory regions in the specified range and
17572 		 * collect the strictest set of protections allowed on the
17573 		 * entire range, so the caller knows what they can do with
17574 		 * the remapped range.
17575 		 * We start with VM_PROT_ALL and we'll remove the protections
17576 		 * missing from each memory region.
17577 		 */
17578 		vm_remap_legacy = TRUE;
17579 		*cur_protection = VM_PROT_ALL;
17580 		*max_protection = VM_PROT_ALL;
17581 		required_cur_prot = VM_PROT_NONE;
17582 		required_max_prot = VM_PROT_NONE;
17583 	} else {
17584 		/*
17585 		 * vm_remap_new() mode:
17586 		 * Extract all memory regions in the specified range and
17587 		 * ensure that they have at least the protections specified
17588 		 * by the caller via *cur_protection and *max_protection.
17589 		 * The resulting mapping should have these protections.
17590 		 */
17591 		vm_remap_legacy = FALSE;
17592 		if (copy) {
17593 			required_cur_prot = VM_PROT_NONE;
17594 			required_max_prot = VM_PROT_READ;
17595 		} else {
17596 			required_cur_prot = *cur_protection;
17597 			required_max_prot = *max_protection;
17598 		}
17599 	}
17600 
17601 	map_address = 0;
17602 	mapped_size = 0;
17603 	result = KERN_SUCCESS;
17604 
17605 	/*
17606 	 *	The specified source virtual space might correspond to
17607 	 *	multiple map entries, need to loop on them.
17608 	 */
17609 	vm_map_lock(map);
17610 
17611 	if (map->pmap == kernel_pmap) {
17612 		map_copy->is_kernel_range = true;
17613 		map_copy->orig_range = kmem_addr_get_range(addr, size);
17614 #if CONFIG_MAP_RANGES
17615 	} else if (map->uses_user_ranges) {
17616 		map_copy->is_user_range = true;
17617 		map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17618 #endif /* CONFIG_MAP_RANGES */
17619 	}
17620 
17621 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17622 		/*
17623 		 * This address space uses sub-pages so the range might
17624 		 * not be re-mappable in an address space with larger
17625 		 * pages. Re-assemble any broken-up VM map entries to
17626 		 * improve our chances of making it work.
17627 		 */
17628 		vm_map_simplify_range(map, src_start, src_end);
17629 	}
17630 	while (mapped_size != size) {
17631 		vm_map_size_t   entry_size;
17632 
17633 		/*
17634 		 *	Find the beginning of the region.
17635 		 */
17636 		if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17637 			result = KERN_INVALID_ADDRESS;
17638 			break;
17639 		}
17640 
17641 		if (src_start < src_entry->vme_start ||
17642 		    (mapped_size && src_start != src_entry->vme_start)) {
17643 			result = KERN_INVALID_ADDRESS;
17644 			break;
17645 		}
17646 
17647 		tmp_size = size - mapped_size;
17648 		if (src_end > src_entry->vme_end) {
17649 			tmp_size -= (src_end - src_entry->vme_end);
17650 		}
17651 
17652 		entry_size = (vm_map_size_t)(src_entry->vme_end -
17653 		    src_entry->vme_start);
17654 
17655 		if (src_entry->is_sub_map &&
17656 		    vmk_flags.vmkf_copy_single_object) {
17657 			vm_map_t submap;
17658 			vm_map_offset_t submap_start;
17659 			vm_map_size_t submap_size;
17660 			boolean_t submap_needs_copy;
17661 
17662 			/*
17663 			 * No check for "required protection" on "src_entry"
17664 			 * because the protections that matter are the ones
17665 			 * on the submap's VM map entry, which will be checked
17666 			 * during the call to vm_map_remap_extract() below.
17667 			 */
17668 			object = VM_OBJECT_NULL;
17669 
17670 			submap_size = src_entry->vme_end - src_start;
17671 			if (submap_size > size) {
17672 				submap_size = size;
17673 			}
17674 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17675 			submap = VME_SUBMAP(src_entry);
17676 			if (copy) {
17677 				/*
17678 				 * The caller wants a copy-on-write re-mapping,
17679 				 * so let's extract from the submap accordingly.
17680 				 */
17681 				submap_needs_copy = TRUE;
17682 			} else if (src_entry->needs_copy) {
17683 				/*
17684 				 * The caller wants a shared re-mapping but the
17685 				 * submap is mapped with "needs_copy", so its
17686 				 * contents can't be shared as is. Extract the
17687 				 * contents of the submap as "copy-on-write".
17688 				 * The re-mapping won't be shared with the
17689 				 * original mapping but this is equivalent to
17690 				 * what happened with the original "remap from
17691 				 * submap" code.
17692 				 * The shared region is mapped "needs_copy", for
17693 				 * example.
17694 				 */
17695 				submap_needs_copy = TRUE;
17696 			} else {
17697 				/*
17698 				 * The caller wants a shared re-mapping and
17699 				 * this mapping can be shared (no "needs_copy"),
17700 				 * so let's extract from the submap accordingly.
17701 				 * Kernel submaps are mapped without
17702 				 * "needs_copy", for example.
17703 				 */
17704 				submap_needs_copy = FALSE;
17705 			}
17706 			vm_map_reference(submap);
17707 			vm_map_unlock(map);
17708 			src_entry = NULL;
17709 			if (vm_remap_legacy) {
17710 				*cur_protection = VM_PROT_NONE;
17711 				*max_protection = VM_PROT_NONE;
17712 			}
17713 
17714 			DTRACE_VM7(remap_submap_recurse,
17715 			    vm_map_t, map,
17716 			    vm_map_offset_t, addr,
17717 			    vm_map_size_t, size,
17718 			    boolean_t, copy,
17719 			    vm_map_offset_t, submap_start,
17720 			    vm_map_size_t, submap_size,
17721 			    boolean_t, submap_needs_copy);
17722 
17723 			result = vm_map_remap_extract(submap,
17724 			    submap_start,
17725 			    submap_size,
17726 			    submap_needs_copy,
17727 			    map_copy,
17728 			    cur_protection,
17729 			    max_protection,
17730 			    inheritance,
17731 			    vmk_flags);
17732 			vm_map_deallocate(submap);
17733 
17734 			if (result == KERN_SUCCESS &&
17735 			    submap_needs_copy &&
17736 			    !copy) {
17737 				/*
17738 				 * We were asked for a "shared"
17739 				 * re-mapping but had to ask for a
17740 				 * "copy-on-write" remapping of the
17741 				 * submap's mapping to honor the
17742 				 * submap's "needs_copy".
17743 				 * We now need to resolve that
17744 				 * pending "copy-on-write" to
17745 				 * get something we can share.
17746 				 */
17747 				vm_map_entry_t copy_entry;
17748 				vm_object_offset_t copy_offset;
17749 				vm_map_size_t copy_size;
17750 				vm_object_t copy_object;
17751 				copy_entry = vm_map_copy_first_entry(map_copy);
17752 				copy_size = copy_entry->vme_end - copy_entry->vme_start;
17753 				copy_object = VME_OBJECT(copy_entry);
17754 				copy_offset = VME_OFFSET(copy_entry);
17755 				if (copy_object == VM_OBJECT_NULL) {
17756 					assert(copy_offset == 0);
17757 					assert(!copy_entry->needs_copy);
17758 					if (copy_entry->max_protection == VM_PROT_NONE) {
17759 						assert(copy_entry->protection == VM_PROT_NONE);
17760 						/* nothing to share */
17761 					} else {
17762 						assert(copy_offset == 0);
17763 						copy_object = vm_object_allocate(copy_size);
17764 						VME_OFFSET_SET(copy_entry, 0);
17765 						VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17766 						assert(copy_entry->use_pmap);
17767 					}
17768 				} else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17769 					/* already shareable */
17770 					assert(!copy_entry->needs_copy);
17771 				} else if (copy_entry->needs_copy ||
17772 				    copy_object->shadowed ||
17773 				    (copy_object->internal &&
17774 				    !copy_object->true_share &&
17775 				    !copy_entry->is_shared &&
17776 				    copy_object->vo_size > copy_size)) {
17777 					VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17778 					assert(copy_entry->use_pmap);
17779 					if (copy_entry->needs_copy) {
17780 						/* already write-protected */
17781 					} else {
17782 						vm_prot_t prot;
17783 						prot = copy_entry->protection & ~VM_PROT_WRITE;
17784 						vm_object_pmap_protect(copy_object,
17785 						    copy_offset,
17786 						    copy_size,
17787 						    PMAP_NULL,
17788 						    PAGE_SIZE,
17789 						    0,
17790 						    prot);
17791 					}
17792 					copy_entry->needs_copy = FALSE;
17793 				}
17794 				copy_object = VME_OBJECT(copy_entry);
17795 				copy_offset = VME_OFFSET(copy_entry);
17796 				if (copy_object &&
17797 				    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17798 					copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17799 					copy_object->true_share = TRUE;
17800 				}
17801 			}
17802 
17803 			return result;
17804 		}
17805 
17806 		if (src_entry->is_sub_map) {
17807 			/* protections for submap mapping are irrelevant here */
17808 		} else if (((src_entry->protection & required_cur_prot) !=
17809 		    required_cur_prot) ||
17810 		    ((src_entry->max_protection & required_max_prot) !=
17811 		    required_max_prot)) {
17812 			if (vmk_flags.vmkf_copy_single_object &&
17813 			    mapped_size != 0) {
17814 				/*
17815 				 * Single object extraction.
17816 				 * We can't extract more with the required
17817 				 * protection but we've extracted some, so
17818 				 * stop there and declare success.
17819 				 * The caller should check the size of
17820 				 * the copy entry we've extracted.
17821 				 */
17822 				result = KERN_SUCCESS;
17823 			} else {
17824 				/*
17825 				 * VM range extraction.
17826 				 * Required proctection is not available
17827 				 * for this part of the range: fail.
17828 				 */
17829 				result = KERN_PROTECTION_FAILURE;
17830 			}
17831 			break;
17832 		}
17833 
17834 		if (src_entry->is_sub_map) {
17835 			vm_map_t submap;
17836 			vm_map_offset_t submap_start;
17837 			vm_map_size_t submap_size;
17838 			vm_map_copy_t submap_copy;
17839 			vm_prot_t submap_curprot, submap_maxprot;
17840 			boolean_t submap_needs_copy;
17841 
17842 			/*
17843 			 * No check for "required protection" on "src_entry"
17844 			 * because the protections that matter are the ones
17845 			 * on the submap's VM map entry, which will be checked
17846 			 * during the call to vm_map_copy_extract() below.
17847 			 */
17848 			object = VM_OBJECT_NULL;
17849 			submap_copy = VM_MAP_COPY_NULL;
17850 
17851 			/* find equivalent range in the submap */
17852 			submap = VME_SUBMAP(src_entry);
17853 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17854 			submap_size = tmp_size;
17855 			if (copy) {
17856 				/*
17857 				 * The caller wants a copy-on-write re-mapping,
17858 				 * so let's extract from the submap accordingly.
17859 				 */
17860 				submap_needs_copy = TRUE;
17861 			} else if (src_entry->needs_copy) {
17862 				/*
17863 				 * The caller wants a shared re-mapping but the
17864 				 * submap is mapped with "needs_copy", so its
17865 				 * contents can't be shared as is. Extract the
17866 				 * contents of the submap as "copy-on-write".
17867 				 * The re-mapping won't be shared with the
17868 				 * original mapping but this is equivalent to
17869 				 * what happened with the original "remap from
17870 				 * submap" code.
17871 				 * The shared region is mapped "needs_copy", for
17872 				 * example.
17873 				 */
17874 				submap_needs_copy = TRUE;
17875 			} else {
17876 				/*
17877 				 * The caller wants a shared re-mapping and
17878 				 * this mapping can be shared (no "needs_copy"),
17879 				 * so let's extract from the submap accordingly.
17880 				 * Kernel submaps are mapped without
17881 				 * "needs_copy", for example.
17882 				 */
17883 				submap_needs_copy = FALSE;
17884 			}
17885 			/* extra ref to keep submap alive */
17886 			vm_map_reference(submap);
17887 
17888 			DTRACE_VM7(remap_submap_recurse,
17889 			    vm_map_t, map,
17890 			    vm_map_offset_t, addr,
17891 			    vm_map_size_t, size,
17892 			    boolean_t, copy,
17893 			    vm_map_offset_t, submap_start,
17894 			    vm_map_size_t, submap_size,
17895 			    boolean_t, submap_needs_copy);
17896 
17897 			/*
17898 			 * The map can be safely unlocked since we
17899 			 * already hold a reference on the submap.
17900 			 *
17901 			 * No timestamp since we don't care if the map
17902 			 * gets modified while we're down in the submap.
17903 			 * We'll resume the extraction at src_start + tmp_size
17904 			 * anyway.
17905 			 */
17906 			vm_map_unlock(map);
17907 			src_entry = NULL; /* not valid once map is unlocked */
17908 
17909 			if (vm_remap_legacy) {
17910 				submap_curprot = VM_PROT_NONE;
17911 				submap_maxprot = VM_PROT_NONE;
17912 				if (max_prot_for_prot_copy) {
17913 					submap_maxprot = max_prot_for_prot_copy;
17914 				}
17915 			} else {
17916 				assert(!max_prot_for_prot_copy);
17917 				submap_curprot = *cur_protection;
17918 				submap_maxprot = *max_protection;
17919 			}
17920 			result = vm_map_copy_extract(submap,
17921 			    submap_start,
17922 			    submap_size,
17923 			    submap_needs_copy,
17924 			    &submap_copy,
17925 			    &submap_curprot,
17926 			    &submap_maxprot,
17927 			    inheritance,
17928 			    vmk_flags);
17929 
17930 			/* release extra ref on submap */
17931 			vm_map_deallocate(submap);
17932 			submap = VM_MAP_NULL;
17933 
17934 			if (result != KERN_SUCCESS) {
17935 				vm_map_lock(map);
17936 				break;
17937 			}
17938 
17939 			/* transfer submap_copy entries to map_header */
17940 			while (vm_map_copy_first_entry(submap_copy) !=
17941 			    vm_map_copy_to_entry(submap_copy)) {
17942 				vm_map_entry_t copy_entry;
17943 				vm_map_size_t copy_entry_size;
17944 
17945 				copy_entry = vm_map_copy_first_entry(submap_copy);
17946 
17947 				/*
17948 				 * Prevent kernel_object from being exposed to
17949 				 * user space.
17950 				 */
17951 				if (__improbable(copy_entry->vme_kernel_object)) {
17952 					printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17953 					    proc_selfpid(),
17954 					    (get_bsdtask_info(current_task())
17955 					    ? proc_name_address(get_bsdtask_info(current_task()))
17956 					    : "?"));
17957 					DTRACE_VM(extract_kernel_only);
17958 					result = KERN_INVALID_RIGHT;
17959 					vm_map_copy_discard(submap_copy);
17960 					submap_copy = VM_MAP_COPY_NULL;
17961 					vm_map_lock(map);
17962 					break;
17963 				}
17964 
17965 				vm_map_copy_entry_unlink(submap_copy, copy_entry);
17966 				copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17967 				copy_entry->vme_start = map_address;
17968 				copy_entry->vme_end = map_address + copy_entry_size;
17969 				map_address += copy_entry_size;
17970 				mapped_size += copy_entry_size;
17971 				src_start += copy_entry_size;
17972 				assert(src_start <= src_end);
17973 				_vm_map_store_entry_link(map_header,
17974 				    map_header->links.prev,
17975 				    copy_entry);
17976 			}
17977 			/* done with submap_copy */
17978 			vm_map_copy_discard(submap_copy);
17979 
17980 			if (vm_remap_legacy) {
17981 				*cur_protection &= submap_curprot;
17982 				*max_protection &= submap_maxprot;
17983 			}
17984 
17985 			/* re-acquire the map lock and continue to next entry */
17986 			vm_map_lock(map);
17987 			continue;
17988 		} else {
17989 			object = VME_OBJECT(src_entry);
17990 
17991 			/*
17992 			 * Prevent kernel_object from being exposed to
17993 			 * user space.
17994 			 */
17995 			if (__improbable(is_kernel_object(object))) {
17996 				printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17997 				    proc_selfpid(),
17998 				    (get_bsdtask_info(current_task())
17999 				    ? proc_name_address(get_bsdtask_info(current_task()))
18000 				    : "?"));
18001 				DTRACE_VM(extract_kernel_only);
18002 				result = KERN_INVALID_RIGHT;
18003 				break;
18004 			}
18005 
18006 			if (src_entry->iokit_acct) {
18007 				/*
18008 				 * This entry uses "IOKit accounting".
18009 				 */
18010 			} else if (object != VM_OBJECT_NULL &&
18011 			    object->internal &&
18012 			    (object->purgable != VM_PURGABLE_DENY ||
18013 			    object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
18014 				/*
18015 				 * Purgeable objects have their own accounting:
18016 				 * no pmap accounting for them.
18017 				 */
18018 				assertf(!src_entry->use_pmap,
18019 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
18020 				    map,
18021 				    src_entry,
18022 				    (uint64_t)src_entry->vme_start,
18023 				    (uint64_t)src_entry->vme_end,
18024 				    src_entry->protection,
18025 				    src_entry->max_protection,
18026 				    VME_ALIAS(src_entry));
18027 			} else {
18028 				/*
18029 				 * Not IOKit or purgeable:
18030 				 * must be accounted by pmap stats.
18031 				 */
18032 				assertf(src_entry->use_pmap,
18033 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
18034 				    map,
18035 				    src_entry,
18036 				    (uint64_t)src_entry->vme_start,
18037 				    (uint64_t)src_entry->vme_end,
18038 				    src_entry->protection,
18039 				    src_entry->max_protection,
18040 				    VME_ALIAS(src_entry));
18041 			}
18042 
18043 			if (object == VM_OBJECT_NULL) {
18044 				assert(!src_entry->needs_copy);
18045 				if (src_entry->max_protection == VM_PROT_NONE) {
18046 					assert(src_entry->protection == VM_PROT_NONE);
18047 					/*
18048 					 * No VM object and no permissions:
18049 					 * this must be a reserved range with
18050 					 * nothing to share or copy.
18051 					 * There could also be all sorts of
18052 					 * pmap shenanigans within that reserved
18053 					 * range, so let's just copy the map
18054 					 * entry as is to remap a similar
18055 					 * reserved range.
18056 					 */
18057 					offset = 0; /* no object => no offset */
18058 					goto copy_src_entry;
18059 				}
18060 				object = vm_object_allocate(entry_size);
18061 				VME_OFFSET_SET(src_entry, 0);
18062 				VME_OBJECT_SET(src_entry, object, false, 0);
18063 				assert(src_entry->use_pmap);
18064 				assert(!map->mapped_in_other_pmaps);
18065 			} else if (src_entry->wired_count ||
18066 			    object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
18067 				/*
18068 				 * A wired memory region should not have
18069 				 * any pending copy-on-write and needs to
18070 				 * keep pointing at the VM object that
18071 				 * contains the wired pages.
18072 				 * If we're sharing this memory (copy=false),
18073 				 * we'll share this VM object.
18074 				 * If we're copying this memory (copy=true),
18075 				 * we'll call vm_object_copy_slowly() below
18076 				 * and use the new VM object for the remapping.
18077 				 *
18078 				 * Or, we are already using an asymmetric
18079 				 * copy, and therefore we already have
18080 				 * the right object.
18081 				 */
18082 				assert(!src_entry->needs_copy);
18083 			} else if (src_entry->needs_copy || object->shadowed ||
18084 			    (object->internal && !object->true_share &&
18085 			    !src_entry->is_shared &&
18086 			    object->vo_size > entry_size)) {
18087 				bool is_writable;
18088 
18089 				VME_OBJECT_SHADOW(src_entry, entry_size,
18090 				    vm_map_always_shadow(map));
18091 				assert(src_entry->use_pmap);
18092 
18093 				is_writable = false;
18094 				if (src_entry->protection & VM_PROT_WRITE) {
18095 					is_writable = true;
18096 #if __arm64e__
18097 				} else if (src_entry->used_for_tpro) {
18098 					is_writable = true;
18099 #endif /* __arm64e__ */
18100 				}
18101 				if (!src_entry->needs_copy && is_writable) {
18102 					vm_prot_t prot;
18103 
18104 					if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18105 						panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18106 						    __FUNCTION__,
18107 						    map, map->pmap,
18108 						    src_entry,
18109 						    (uint64_t)src_entry->vme_start,
18110 						    (uint64_t)src_entry->vme_end,
18111 						    src_entry->protection);
18112 					}
18113 
18114 					prot = src_entry->protection & ~VM_PROT_WRITE;
18115 
18116 					if (override_nx(map,
18117 					    VME_ALIAS(src_entry))
18118 					    && prot) {
18119 						prot |= VM_PROT_EXECUTE;
18120 					}
18121 
18122 					if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18123 						panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18124 						    __FUNCTION__,
18125 						    map, map->pmap,
18126 						    src_entry,
18127 						    (uint64_t)src_entry->vme_start,
18128 						    (uint64_t)src_entry->vme_end,
18129 						    prot);
18130 					}
18131 
18132 					if (map->mapped_in_other_pmaps) {
18133 						vm_object_pmap_protect(
18134 							VME_OBJECT(src_entry),
18135 							VME_OFFSET(src_entry),
18136 							entry_size,
18137 							PMAP_NULL,
18138 							PAGE_SIZE,
18139 							src_entry->vme_start,
18140 							prot);
18141 #if MACH_ASSERT
18142 					} else if (__improbable(map->pmap == PMAP_NULL)) {
18143 						/*
18144 						 * Some VM tests (in vm_tests.c)
18145 						 * sometimes want to use a VM
18146 						 * map without a pmap.
18147 						 * Otherwise, this should never
18148 						 * happen.
18149 						 */
18150 						if (!thread_get_test_option(test_option_vm_map_allow_null_pmap)) {
18151 							panic("null pmap");
18152 						}
18153 #endif /* MACH_ASSERT */
18154 					} else {
18155 						pmap_protect(vm_map_pmap(map),
18156 						    src_entry->vme_start,
18157 						    src_entry->vme_end,
18158 						    prot);
18159 					}
18160 				}
18161 
18162 				object = VME_OBJECT(src_entry);
18163 				src_entry->needs_copy = FALSE;
18164 			}
18165 
18166 
18167 			vm_object_lock(object);
18168 			vm_object_reference_locked(object); /* object ref. for new entry */
18169 			assert(!src_entry->needs_copy);
18170 			if (object->copy_strategy ==
18171 			    MEMORY_OBJECT_COPY_SYMMETRIC) {
18172 				/*
18173 				 * If we want to share this object (copy==0),
18174 				 * it needs to be COPY_DELAY.
18175 				 * If we want to copy this object (copy==1),
18176 				 * we can't just set "needs_copy" on our side
18177 				 * and expect the other side to do the same
18178 				 * (symmetrically), so we can't let the object
18179 				 * stay COPY_SYMMETRIC.
18180 				 * So we always switch from COPY_SYMMETRIC to
18181 				 * COPY_DELAY.
18182 				 */
18183 				object->copy_strategy =
18184 				    MEMORY_OBJECT_COPY_DELAY;
18185 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
18186 			}
18187 			vm_object_unlock(object);
18188 		}
18189 
18190 		offset = (VME_OFFSET(src_entry) +
18191 		    (src_start - src_entry->vme_start));
18192 
18193 copy_src_entry:
18194 
18195 
18196 		new_entry = _vm_map_entry_create(map_header);
18197 		vm_map_entry_copy(map, new_entry, src_entry);
18198 		if (new_entry->is_sub_map) {
18199 			/* clr address space specifics */
18200 			new_entry->use_pmap = FALSE;
18201 		} else if (copy) {
18202 			/*
18203 			 * We're dealing with a copy-on-write operation,
18204 			 * so the resulting mapping should not inherit the
18205 			 * original mapping's accounting settings.
18206 			 * "use_pmap" should be reset to its default (TRUE)
18207 			 * so that the new mapping gets accounted for in
18208 			 * the task's memory footprint.
18209 			 */
18210 			new_entry->use_pmap = TRUE;
18211 		}
18212 		/* "iokit_acct" was cleared in vm_map_entry_copy() */
18213 		assert(!new_entry->iokit_acct);
18214 
18215 		new_entry->map_aligned = FALSE;
18216 
18217 		new_entry->vme_start = map_address;
18218 		new_entry->vme_end = map_address + tmp_size;
18219 		assert(new_entry->vme_start < new_entry->vme_end);
18220 		if (copy && vmk_flags.vmkf_remap_prot_copy) {
18221 			/* security: keep "permanent" and "csm_associated" */
18222 			new_entry->vme_permanent = src_entry->vme_permanent;
18223 			new_entry->csm_associated = src_entry->csm_associated;
18224 			/*
18225 			 * Remapping for vm_map_protect(VM_PROT_COPY)
18226 			 * to convert a read-only mapping into a
18227 			 * copy-on-write version of itself but
18228 			 * with write access:
18229 			 * keep the original inheritance but let's not
18230 			 * add VM_PROT_WRITE to the max protection yet
18231 			 * since we want to do more security checks against
18232 			 * the target map.
18233 			 */
18234 			new_entry->inheritance = src_entry->inheritance;
18235 			new_entry->protection &= max_prot_for_prot_copy;
18236 
18237 #ifdef __arm64e__
18238 			/*
18239 			 * Remapping for vm_map_protect(VM_PROT_COPY) to remap a TPRO
18240 			 * region to be explicitly writable without TPRO is only permitted
18241 			 * if TPRO enforcement has been overridden.
18242 			 *
18243 			 * In this case we ensure any entries reset the TPRO state
18244 			 * and we permit the region to be downgraded from permanent.
18245 			 */
18246 			if (new_entry->used_for_tpro) {
18247 				if (vmk_flags.vmkf_tpro_enforcement_override) {
18248 					new_entry->used_for_tpro = FALSE;
18249 					new_entry->vme_permanent = FALSE;
18250 				} else {
18251 					result = KERN_PROTECTION_FAILURE;
18252 					vm_object_deallocate(object);
18253 					vm_map_entry_dispose(new_entry);
18254 					new_entry = VM_MAP_ENTRY_NULL;
18255 					break;
18256 				}
18257 			}
18258 #endif
18259 		} else {
18260 			new_entry->inheritance = inheritance;
18261 			if (!vm_remap_legacy) {
18262 				new_entry->protection = *cur_protection;
18263 				new_entry->max_protection = *max_protection;
18264 			}
18265 		}
18266 
18267 		VME_OFFSET_SET(new_entry, offset);
18268 
18269 		/*
18270 		 * The new region has to be copied now if required.
18271 		 */
18272 RestartCopy:
18273 		if (!copy) {
18274 			if (src_entry->used_for_jit == TRUE) {
18275 				if (same_map) {
18276 				} else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
18277 					/*
18278 					 * Cannot allow an entry describing a JIT
18279 					 * region to be shared across address spaces.
18280 					 */
18281 					result = KERN_INVALID_ARGUMENT;
18282 					vm_object_deallocate(object);
18283 					vm_map_entry_dispose(new_entry);
18284 					new_entry = VM_MAP_ENTRY_NULL;
18285 					break;
18286 				}
18287 			}
18288 
18289 			if (!src_entry->is_sub_map &&
18290 			    VME_OBJECT(src_entry) == VM_OBJECT_NULL) {
18291 				/* no accessible memory; nothing to share */
18292 				assert(src_entry->protection == VM_PROT_NONE);
18293 				assert(src_entry->max_protection == VM_PROT_NONE);
18294 				src_entry->is_shared = FALSE;
18295 			} else {
18296 				src_entry->is_shared = TRUE;
18297 			}
18298 			if (!new_entry->is_sub_map &&
18299 			    VME_OBJECT(new_entry) == VM_OBJECT_NULL) {
18300 				/* no accessible memory; nothing to share */
18301 				assert(new_entry->protection == VM_PROT_NONE);
18302 				assert(new_entry->max_protection == VM_PROT_NONE);
18303 				new_entry->is_shared = FALSE;
18304 			} else {
18305 				new_entry->is_shared = TRUE;
18306 			}
18307 			if (!(new_entry->is_sub_map)) {
18308 				new_entry->needs_copy = FALSE;
18309 			}
18310 		} else if (src_entry->is_sub_map) {
18311 			/* make this a COW sub_map if not already */
18312 			assert(new_entry->wired_count == 0);
18313 			new_entry->needs_copy = TRUE;
18314 			object = VM_OBJECT_NULL;
18315 		} else if (src_entry->wired_count == 0 &&
18316 		    !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
18317 		    vm_object_copy_quickly(VME_OBJECT(new_entry),
18318 		    VME_OFFSET(new_entry),
18319 		    (new_entry->vme_end -
18320 		    new_entry->vme_start),
18321 		    &src_needs_copy,
18322 		    &new_entry_needs_copy)) {
18323 			new_entry->needs_copy = new_entry_needs_copy;
18324 			new_entry->is_shared = FALSE;
18325 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18326 
18327 			/*
18328 			 * Handle copy_on_write semantics.
18329 			 */
18330 			if (src_needs_copy && !src_entry->needs_copy) {
18331 				vm_prot_t prot;
18332 
18333 				if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18334 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18335 					    __FUNCTION__,
18336 					    map, map->pmap, src_entry,
18337 					    (uint64_t)src_entry->vme_start,
18338 					    (uint64_t)src_entry->vme_end,
18339 					    src_entry->protection);
18340 				}
18341 
18342 				prot = src_entry->protection & ~VM_PROT_WRITE;
18343 
18344 				if (override_nx(map,
18345 				    VME_ALIAS(src_entry))
18346 				    && prot) {
18347 					prot |= VM_PROT_EXECUTE;
18348 				}
18349 
18350 				if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18351 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18352 					    __FUNCTION__,
18353 					    map, map->pmap, src_entry,
18354 					    (uint64_t)src_entry->vme_start,
18355 					    (uint64_t)src_entry->vme_end,
18356 					    prot);
18357 				}
18358 
18359 				vm_object_pmap_protect(object,
18360 				    offset,
18361 				    entry_size,
18362 				    ((src_entry->is_shared
18363 				    || map->mapped_in_other_pmaps) ?
18364 				    PMAP_NULL : map->pmap),
18365 				    VM_MAP_PAGE_SIZE(map),
18366 				    src_entry->vme_start,
18367 				    prot);
18368 
18369 				assert(src_entry->wired_count == 0);
18370 				src_entry->needs_copy = TRUE;
18371 			}
18372 			/*
18373 			 * Throw away the old object reference of the new entry.
18374 			 */
18375 			vm_object_deallocate(object);
18376 		} else {
18377 			new_entry->is_shared = FALSE;
18378 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18379 
18380 			src_entry_was_wired = (src_entry->wired_count > 0);
18381 			saved_src_entry = src_entry;
18382 			src_entry = VM_MAP_ENTRY_NULL;
18383 
18384 			/*
18385 			 * The map can be safely unlocked since we
18386 			 * already hold a reference on the object.
18387 			 *
18388 			 * Record the timestamp of the map for later
18389 			 * verification, and unlock the map.
18390 			 */
18391 			version.main_timestamp = map->timestamp;
18392 			vm_map_unlock(map);     /* Increments timestamp once! */
18393 
18394 			/*
18395 			 * Perform the copy.
18396 			 */
18397 			if (src_entry_was_wired > 0 ||
18398 			    (debug4k_no_cow_copyin &&
18399 			    VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
18400 				vm_object_lock(object);
18401 				result = vm_object_copy_slowly(
18402 					object,
18403 					offset,
18404 					(new_entry->vme_end -
18405 					new_entry->vme_start),
18406 					THREAD_UNINT,
18407 					&new_copy_object);
18408 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18409 				saved_used_for_jit = new_entry->used_for_jit;
18410 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18411 				new_entry->used_for_jit = saved_used_for_jit;
18412 				VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
18413 				new_entry->needs_copy = FALSE;
18414 			} else {
18415 				vm_object_offset_t new_offset;
18416 
18417 				new_offset = VME_OFFSET(new_entry);
18418 				result = vm_object_copy_strategically(
18419 					object,
18420 					offset,
18421 					(new_entry->vme_end -
18422 					new_entry->vme_start),
18423 					false, /* forking */
18424 					&new_copy_object,
18425 					&new_offset,
18426 					&new_entry_needs_copy);
18427 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18428 				saved_used_for_jit = new_entry->used_for_jit;
18429 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18430 				new_entry->used_for_jit = saved_used_for_jit;
18431 				if (new_offset != VME_OFFSET(new_entry)) {
18432 					VME_OFFSET_SET(new_entry, new_offset);
18433 				}
18434 
18435 				new_entry->needs_copy = new_entry_needs_copy;
18436 			}
18437 
18438 			/*
18439 			 * Throw away the old object reference of the new entry.
18440 			 */
18441 			vm_object_deallocate(object);
18442 
18443 			if (result != KERN_SUCCESS &&
18444 			    result != KERN_MEMORY_RESTART_COPY) {
18445 				vm_map_entry_dispose(new_entry);
18446 				vm_map_lock(map);
18447 				break;
18448 			}
18449 
18450 			/*
18451 			 * Verify that the map has not substantially
18452 			 * changed while the copy was being made.
18453 			 */
18454 
18455 			vm_map_lock(map);
18456 			if (version.main_timestamp + 1 != map->timestamp) {
18457 				/*
18458 				 * Simple version comparison failed.
18459 				 *
18460 				 * Retry the lookup and verify that the
18461 				 * same object/offset are still present.
18462 				 */
18463 				saved_src_entry = VM_MAP_ENTRY_NULL;
18464 				vm_object_deallocate(VME_OBJECT(new_entry));
18465 				vm_map_entry_dispose(new_entry);
18466 				if (result == KERN_MEMORY_RESTART_COPY) {
18467 					result = KERN_SUCCESS;
18468 				}
18469 				continue;
18470 			}
18471 			/* map hasn't changed: src_entry is still valid */
18472 			src_entry = saved_src_entry;
18473 			saved_src_entry = VM_MAP_ENTRY_NULL;
18474 
18475 			if (result == KERN_MEMORY_RESTART_COPY) {
18476 				vm_object_reference(object);
18477 				goto RestartCopy;
18478 			}
18479 		}
18480 
18481 		_vm_map_store_entry_link(map_header,
18482 		    map_header->links.prev, new_entry);
18483 
18484 		/* protections for submap mapping are irrelevant here */
18485 		if (vm_remap_legacy && !src_entry->is_sub_map) {
18486 			*cur_protection &= src_entry->protection;
18487 			*max_protection &= src_entry->max_protection;
18488 		}
18489 
18490 		map_address += tmp_size;
18491 		mapped_size += tmp_size;
18492 		src_start += tmp_size;
18493 
18494 		if (vmk_flags.vmkf_copy_single_object) {
18495 			if (mapped_size != size) {
18496 				DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
18497 				    map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
18498 				if (src_entry->vme_next != vm_map_to_entry(map) &&
18499 				    src_entry->vme_next->vme_object_value ==
18500 				    src_entry->vme_object_value) {
18501 					/* XXX TODO4K */
18502 					DEBUG4K_ERROR("could have extended copy to next entry...\n");
18503 				}
18504 			}
18505 			break;
18506 		}
18507 	} /* end while */
18508 
18509 	vm_map_unlock(map);
18510 	if (result != KERN_SUCCESS) {
18511 		/*
18512 		 * Free all allocated elements.
18513 		 */
18514 		for (src_entry = map_header->links.next;
18515 		    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18516 		    src_entry = new_entry) {
18517 			new_entry = src_entry->vme_next;
18518 			_vm_map_store_entry_unlink(map_header, src_entry, false);
18519 			if (src_entry->is_sub_map) {
18520 				vm_map_deallocate(VME_SUBMAP(src_entry));
18521 			} else {
18522 				vm_object_deallocate(VME_OBJECT(src_entry));
18523 			}
18524 			vm_map_entry_dispose(src_entry);
18525 		}
18526 	}
18527 	return result;
18528 }
18529 
18530 bool
vm_map_is_exotic(vm_map_t map)18531 vm_map_is_exotic(
18532 	vm_map_t map)
18533 {
18534 	return VM_MAP_IS_EXOTIC(map);
18535 }
18536 
18537 bool
vm_map_is_alien(vm_map_t map)18538 vm_map_is_alien(
18539 	vm_map_t map)
18540 {
18541 	return VM_MAP_IS_ALIEN(map);
18542 }
18543 
18544 #if XNU_TARGET_OS_OSX
18545 void
vm_map_mark_alien(vm_map_t map)18546 vm_map_mark_alien(
18547 	vm_map_t map)
18548 {
18549 	vm_map_lock(map);
18550 	map->is_alien = true;
18551 	vm_map_unlock(map);
18552 }
18553 
18554 void
vm_map_single_jit(vm_map_t map)18555 vm_map_single_jit(
18556 	vm_map_t map)
18557 {
18558 	vm_map_lock(map);
18559 	map->single_jit = true;
18560 	vm_map_unlock(map);
18561 }
18562 #endif /* XNU_TARGET_OS_OSX */
18563 
18564 
18565 /*
18566  * Callers of this function must call vm_map_copy_require on
18567  * previously created vm_map_copy_t or pass a newly created
18568  * one to ensure that it hasn't been forged.
18569  */
18570 static kern_return_t
vm_map_copy_to_physcopy(vm_map_copy_t copy_map,vm_map_t target_map)18571 vm_map_copy_to_physcopy(
18572 	vm_map_copy_t   copy_map,
18573 	vm_map_t        target_map)
18574 {
18575 	vm_map_size_t           size;
18576 	vm_map_entry_t          entry;
18577 	vm_map_entry_t          new_entry;
18578 	vm_object_t             new_object;
18579 	unsigned int            pmap_flags;
18580 	pmap_t                  new_pmap;
18581 	vm_map_t                new_map;
18582 	vm_map_address_t        src_start, src_end, src_cur;
18583 	vm_map_address_t        dst_start, dst_end, dst_cur;
18584 	kern_return_t           kr;
18585 	void                    *kbuf;
18586 
18587 	/*
18588 	 * Perform the equivalent of vm_allocate() and memcpy().
18589 	 * Replace the mappings in "copy_map" with the newly allocated mapping.
18590 	 */
18591 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18592 
18593 	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
18594 
18595 	/* create a new pmap to map "copy_map" */
18596 	pmap_flags = 0;
18597 	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18598 #if PMAP_CREATE_FORCE_4K_PAGES
18599 	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18600 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
18601 	pmap_flags |= PMAP_CREATE_64BIT;
18602 	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18603 	if (new_pmap == NULL) {
18604 		return KERN_RESOURCE_SHORTAGE;
18605 	}
18606 
18607 	/* allocate new VM object */
18608 	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18609 	new_object = vm_object_allocate(size);
18610 	assert(new_object);
18611 
18612 	/* allocate new VM map entry */
18613 	new_entry = vm_map_copy_entry_create(copy_map);
18614 	assert(new_entry);
18615 
18616 	/* finish initializing new VM map entry */
18617 	new_entry->protection = VM_PROT_DEFAULT;
18618 	new_entry->max_protection = VM_PROT_DEFAULT;
18619 	new_entry->use_pmap = TRUE;
18620 
18621 	/* make new VM map entry point to new VM object */
18622 	new_entry->vme_start = 0;
18623 	new_entry->vme_end = size;
18624 	VME_OBJECT_SET(new_entry, new_object, false, 0);
18625 	VME_OFFSET_SET(new_entry, 0);
18626 
18627 	/* create a new pageable VM map to map "copy_map" */
18628 	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18629 	    VM_MAP_CREATE_PAGEABLE);
18630 	assert(new_map);
18631 	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18632 
18633 	/* map "copy_map" in the new VM map */
18634 	src_start = 0;
18635 	kr = vm_map_copyout_internal(
18636 		new_map,
18637 		&src_start,
18638 		copy_map,
18639 		copy_map->size,
18640 		FALSE, /* consume_on_success */
18641 		VM_PROT_DEFAULT,
18642 		VM_PROT_DEFAULT,
18643 		VM_INHERIT_DEFAULT);
18644 	assert(kr == KERN_SUCCESS);
18645 	src_end = src_start + copy_map->size;
18646 
18647 	/* map "new_object" in the new VM map */
18648 	vm_object_reference(new_object);
18649 	dst_start = 0;
18650 	kr = vm_map_enter(new_map,
18651 	    &dst_start,
18652 	    size,
18653 	    0,               /* mask */
18654 	    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18655 	    new_object,
18656 	    0,               /* offset */
18657 	    FALSE,               /* needs copy */
18658 	    VM_PROT_DEFAULT,
18659 	    VM_PROT_DEFAULT,
18660 	    VM_INHERIT_DEFAULT);
18661 	assert(kr == KERN_SUCCESS);
18662 	dst_end = dst_start + size;
18663 
18664 	/* get a kernel buffer */
18665 	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18666 
18667 	/* physically copy "copy_map" mappings to new VM object */
18668 	for (src_cur = src_start, dst_cur = dst_start;
18669 	    src_cur < src_end;
18670 	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18671 		vm_size_t bytes;
18672 
18673 		bytes = PAGE_SIZE;
18674 		if (src_cur + PAGE_SIZE > src_end) {
18675 			/* partial copy for last page */
18676 			bytes = src_end - src_cur;
18677 			assert(bytes > 0 && bytes < PAGE_SIZE);
18678 			/* rest of dst page should be zero-filled */
18679 		}
18680 		/* get bytes from src mapping */
18681 		kr = copyinmap(new_map, src_cur, kbuf, bytes);
18682 		if (kr != KERN_SUCCESS) {
18683 			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18684 		}
18685 		/* put bytes in dst mapping */
18686 		assert(dst_cur < dst_end);
18687 		assert(dst_cur + bytes <= dst_end);
18688 		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18689 		if (kr != KERN_SUCCESS) {
18690 			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18691 		}
18692 	}
18693 
18694 	/* free kernel buffer */
18695 	kfree_data(kbuf, PAGE_SIZE);
18696 
18697 	/* destroy new map */
18698 	vm_map_destroy(new_map);
18699 	new_map = VM_MAP_NULL;
18700 
18701 	/* dispose of the old map entries in "copy_map" */
18702 	while (vm_map_copy_first_entry(copy_map) !=
18703 	    vm_map_copy_to_entry(copy_map)) {
18704 		entry = vm_map_copy_first_entry(copy_map);
18705 		vm_map_copy_entry_unlink(copy_map, entry);
18706 		if (entry->is_sub_map) {
18707 			vm_map_deallocate(VME_SUBMAP(entry));
18708 		} else {
18709 			vm_object_deallocate(VME_OBJECT(entry));
18710 		}
18711 		vm_map_copy_entry_dispose(entry);
18712 	}
18713 
18714 	/* change "copy_map"'s page_size to match "target_map" */
18715 	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18716 	copy_map->offset = 0;
18717 	copy_map->size = size;
18718 
18719 	/* insert new map entry in "copy_map" */
18720 	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18721 	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18722 
18723 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18724 	return KERN_SUCCESS;
18725 }
18726 
18727 void
18728 vm_map_copy_adjust_get_target_copy_map(
18729 	vm_map_copy_t   copy_map,
18730 	vm_map_copy_t   *target_copy_map_p);
18731 void
vm_map_copy_adjust_get_target_copy_map(vm_map_copy_t copy_map,vm_map_copy_t * target_copy_map_p)18732 vm_map_copy_adjust_get_target_copy_map(
18733 	vm_map_copy_t   copy_map,
18734 	vm_map_copy_t   *target_copy_map_p)
18735 {
18736 	vm_map_copy_t   target_copy_map;
18737 	vm_map_entry_t  entry, target_entry;
18738 
18739 	if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18740 		/* the caller already has a "target_copy_map": use it */
18741 		return;
18742 	}
18743 
18744 	/* the caller wants us to create a new copy of "copy_map" */
18745 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18746 	target_copy_map = vm_map_copy_allocate(copy_map->type);
18747 	target_copy_map->offset = copy_map->offset;
18748 	target_copy_map->size = copy_map->size;
18749 	target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18750 	for (entry = vm_map_copy_first_entry(copy_map);
18751 	    entry != vm_map_copy_to_entry(copy_map);
18752 	    entry = entry->vme_next) {
18753 		target_entry = vm_map_copy_entry_create(target_copy_map);
18754 		vm_map_entry_copy_full(target_entry, entry);
18755 		if (target_entry->is_sub_map) {
18756 			vm_map_reference(VME_SUBMAP(target_entry));
18757 		} else {
18758 			vm_object_reference(VME_OBJECT(target_entry));
18759 		}
18760 		vm_map_copy_entry_link(
18761 			target_copy_map,
18762 			vm_map_copy_last_entry(target_copy_map),
18763 			target_entry);
18764 	}
18765 	entry = VM_MAP_ENTRY_NULL;
18766 	*target_copy_map_p = target_copy_map;
18767 }
18768 
18769 /*
18770  * Callers of this function must call vm_map_copy_require on
18771  * previously created vm_map_copy_t or pass a newly created
18772  * one to ensure that it hasn't been forged.
18773  */
18774 static void
vm_map_copy_trim(vm_map_copy_t copy_map,uint16_t new_page_shift,vm_map_offset_t trim_start,vm_map_offset_t trim_end)18775 vm_map_copy_trim(
18776 	vm_map_copy_t   copy_map,
18777 	uint16_t        new_page_shift,
18778 	vm_map_offset_t trim_start,
18779 	vm_map_offset_t trim_end)
18780 {
18781 	uint16_t        copy_page_shift;
18782 	vm_map_entry_t  entry, next_entry;
18783 
18784 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18785 	assert(copy_map->cpy_hdr.nentries > 0);
18786 
18787 	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
18788 	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
18789 
18790 	/* use the new page_shift to do the clipping */
18791 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18792 	copy_map->cpy_hdr.page_shift = new_page_shift;
18793 
18794 	for (entry = vm_map_copy_first_entry(copy_map);
18795 	    entry != vm_map_copy_to_entry(copy_map);
18796 	    entry = next_entry) {
18797 		next_entry = entry->vme_next;
18798 		if (entry->vme_end <= trim_start) {
18799 			/* entry fully before trim range: skip */
18800 			continue;
18801 		}
18802 		if (entry->vme_start >= trim_end) {
18803 			/* entry fully after trim range: done */
18804 			break;
18805 		}
18806 		/* clip entry if needed */
18807 		vm_map_copy_clip_start(copy_map, entry, trim_start);
18808 		vm_map_copy_clip_end(copy_map, entry, trim_end);
18809 		/* dispose of entry */
18810 		copy_map->size -= entry->vme_end - entry->vme_start;
18811 		vm_map_copy_entry_unlink(copy_map, entry);
18812 		if (entry->is_sub_map) {
18813 			vm_map_deallocate(VME_SUBMAP(entry));
18814 		} else {
18815 			vm_object_deallocate(VME_OBJECT(entry));
18816 		}
18817 		vm_map_copy_entry_dispose(entry);
18818 		entry = VM_MAP_ENTRY_NULL;
18819 	}
18820 
18821 	/* restore copy_map's original page_shift */
18822 	copy_map->cpy_hdr.page_shift = copy_page_shift;
18823 }
18824 
18825 /*
18826  * Make any necessary adjustments to "copy_map" to allow it to be
18827  * mapped into "target_map".
18828  * If no changes were necessary, "target_copy_map" points to the
18829  * untouched "copy_map".
18830  * If changes are necessary, changes will be made to "target_copy_map".
18831  * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18832  * copy the original "copy_map" to it before applying the changes.
18833  * The caller should discard "target_copy_map" if it's not the same as
18834  * the original "copy_map".
18835  */
18836 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
18837 kern_return_t
vm_map_copy_adjust_to_target(vm_map_copy_t src_copy_map,vm_map_offset_ut offset_u,vm_map_size_ut size_u,vm_map_t target_map,boolean_t copy,vm_map_copy_t * target_copy_map_p,vm_map_offset_t * overmap_start_p,vm_map_offset_t * overmap_end_p,vm_map_offset_t * trimmed_start_p)18838 vm_map_copy_adjust_to_target(
18839 	vm_map_copy_t           src_copy_map,
18840 	vm_map_offset_ut        offset_u,
18841 	vm_map_size_ut          size_u,
18842 	vm_map_t                target_map,
18843 	boolean_t               copy,
18844 	vm_map_copy_t           *target_copy_map_p,
18845 	vm_map_offset_t         *overmap_start_p,
18846 	vm_map_offset_t         *overmap_end_p,
18847 	vm_map_offset_t         *trimmed_start_p)
18848 {
18849 	vm_map_copy_t           copy_map, target_copy_map;
18850 	vm_map_size_t           target_size;
18851 	vm_map_size_t           src_copy_map_size;
18852 	vm_map_size_t           overmap_start, overmap_end;
18853 	int                     misalignments;
18854 	vm_map_entry_t          entry, target_entry;
18855 	vm_map_offset_t         addr_adjustment;
18856 	vm_map_offset_t         new_start, new_end;
18857 	int                     copy_page_mask, target_page_mask;
18858 	uint16_t                copy_page_shift, target_page_shift;
18859 	vm_map_offset_t         trimmed_end;
18860 	vm_map_size_t           map_size;
18861 	kern_return_t           kr;
18862 
18863 	/*
18864 	 * Sanitize any input parameters that are addr/size/prot/inherit
18865 	 */
18866 	kr = vm_map_copy_addr_size_sanitize(
18867 		target_map,
18868 		offset_u,
18869 		size_u,
18870 		VM_SANITIZE_CALLER_MACH_MEMORY_ENTRY_MAP_SIZE,
18871 		&new_start,
18872 		&new_end,
18873 		&map_size);
18874 	if (__improbable(kr != KERN_SUCCESS)) {
18875 		return vm_sanitize_get_kr(kr);
18876 	}
18877 
18878 	/*
18879 	 * Assert that the vm_map_copy is coming from the right
18880 	 * zone and hasn't been forged
18881 	 */
18882 	vm_map_copy_require(src_copy_map);
18883 	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18884 
18885 	/*
18886 	 * Start working with "src_copy_map" but we'll switch
18887 	 * to "target_copy_map" as soon as we start making adjustments.
18888 	 */
18889 	copy_map = src_copy_map;
18890 	src_copy_map_size = src_copy_map->size;
18891 
18892 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18893 	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
18894 	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18895 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
18896 
18897 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), *target_copy_map_p);
18898 
18899 	target_copy_map = *target_copy_map_p;
18900 	if (target_copy_map != VM_MAP_COPY_NULL) {
18901 		vm_map_copy_require(target_copy_map);
18902 	}
18903 
18904 	if (new_end > copy_map->size) {
18905 		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u));
18906 		return KERN_INVALID_ARGUMENT;
18907 	}
18908 
18909 	/* trim the end */
18910 	trimmed_end = 0;
18911 	new_end = VM_MAP_ROUND_PAGE(new_end, target_page_mask);
18912 	if (new_end < copy_map->size) {
18913 		trimmed_end = src_copy_map_size - new_end;
18914 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
18915 		/* get "target_copy_map" if needed and adjust it */
18916 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18917 		    &target_copy_map);
18918 		copy_map = target_copy_map;
18919 		vm_map_copy_trim(target_copy_map, target_page_shift,
18920 		    new_end, copy_map->size);
18921 	}
18922 
18923 	/* trim the start */
18924 	new_start = VM_MAP_TRUNC_PAGE(new_start, target_page_mask);
18925 	if (new_start != 0) {
18926 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)0, (uint64_t)new_start);
18927 		/* get "target_copy_map" if needed and adjust it */
18928 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18929 		    &target_copy_map);
18930 		copy_map = target_copy_map;
18931 		vm_map_copy_trim(target_copy_map, target_page_shift,
18932 		    0, new_start);
18933 	}
18934 	*trimmed_start_p = new_start;
18935 
18936 	/* target_size starts with what's left after trimming */
18937 	target_size = copy_map->size;
18938 	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
18939 	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
18940 	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
18941 	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
18942 
18943 	/* check for misalignments but don't adjust yet */
18944 	misalignments = 0;
18945 	overmap_start = 0;
18946 	overmap_end = 0;
18947 	if (copy_page_shift < target_page_shift) {
18948 		/*
18949 		 * Remapping from 4K to 16K: check the VM object alignments
18950 		 * throughout the range.
18951 		 * If the start and end of the range are mis-aligned, we can
18952 		 * over-map to re-align, and adjust the "overmap" start/end
18953 		 * and "target_size" of the range accordingly.
18954 		 * If there is any mis-alignment within the range:
18955 		 *     if "copy":
18956 		 *         we can do immediate-copy instead of copy-on-write,
18957 		 *     else:
18958 		 *         no way to remap and share; fail.
18959 		 */
18960 		for (entry = vm_map_copy_first_entry(copy_map);
18961 		    entry != vm_map_copy_to_entry(copy_map);
18962 		    entry = entry->vme_next) {
18963 			vm_object_offset_t object_offset_start, object_offset_end;
18964 
18965 			object_offset_start = VME_OFFSET(entry);
18966 			object_offset_end = object_offset_start;
18967 			object_offset_end += entry->vme_end - entry->vme_start;
18968 			if (object_offset_start & target_page_mask) {
18969 				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
18970 					overmap_start++;
18971 				} else {
18972 					misalignments++;
18973 				}
18974 			}
18975 			if (object_offset_end & target_page_mask) {
18976 				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
18977 					overmap_end++;
18978 				} else {
18979 					misalignments++;
18980 				}
18981 			}
18982 		}
18983 	}
18984 	entry = VM_MAP_ENTRY_NULL;
18985 
18986 	/* decide how to deal with misalignments */
18987 	assert(overmap_start <= 1);
18988 	assert(overmap_end <= 1);
18989 	if (!overmap_start && !overmap_end && !misalignments) {
18990 		/* copy_map is properly aligned for target_map ... */
18991 		if (*trimmed_start_p) {
18992 			/* ... but we trimmed it, so still need to adjust */
18993 		} else {
18994 			/* ... and we didn't trim anything: we're done */
18995 			if (target_copy_map == VM_MAP_COPY_NULL) {
18996 				target_copy_map = copy_map;
18997 			}
18998 			*target_copy_map_p = target_copy_map;
18999 			*overmap_start_p = 0;
19000 			*overmap_end_p = 0;
19001 			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19002 			return KERN_SUCCESS;
19003 		}
19004 	} else if (misalignments && !copy) {
19005 		/* can't "share" if misaligned */
19006 		DEBUG4K_ADJUST("unsupported sharing\n");
19007 #if MACH_ASSERT
19008 		if (debug4k_panic_on_misaligned_sharing) {
19009 			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
19010 		}
19011 #endif /* MACH_ASSERT */
19012 		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
19013 		return KERN_NOT_SUPPORTED;
19014 	} else {
19015 		/* can't virtual-copy if misaligned (but can physical-copy) */
19016 		DEBUG4K_ADJUST("mis-aligned copying\n");
19017 	}
19018 
19019 	/* get a "target_copy_map" if needed and switch to it */
19020 	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
19021 	copy_map = target_copy_map;
19022 
19023 	if (misalignments && copy) {
19024 		vm_map_size_t target_copy_map_size;
19025 
19026 		/*
19027 		 * Can't do copy-on-write with misaligned mappings.
19028 		 * Replace the mappings with a physical copy of the original
19029 		 * mappings' contents.
19030 		 */
19031 		target_copy_map_size = target_copy_map->size;
19032 		kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
19033 		if (kr != KERN_SUCCESS) {
19034 			return kr;
19035 		}
19036 		*target_copy_map_p = target_copy_map;
19037 		*overmap_start_p = 0;
19038 		*overmap_end_p = target_copy_map->size - target_copy_map_size;
19039 		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19040 		return KERN_SUCCESS;
19041 	}
19042 
19043 	/* apply the adjustments */
19044 	misalignments = 0;
19045 	overmap_start = 0;
19046 	overmap_end = 0;
19047 	/* remove copy_map->offset, so that everything starts at offset 0 */
19048 	addr_adjustment = copy_map->offset;
19049 	/* also remove whatever we trimmed from the start */
19050 	addr_adjustment += *trimmed_start_p;
19051 	for (target_entry = vm_map_copy_first_entry(target_copy_map);
19052 	    target_entry != vm_map_copy_to_entry(target_copy_map);
19053 	    target_entry = target_entry->vme_next) {
19054 		vm_object_offset_t object_offset_start, object_offset_end;
19055 
19056 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19057 		object_offset_start = VME_OFFSET(target_entry);
19058 		if (object_offset_start & target_page_mask) {
19059 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19060 			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
19061 				/*
19062 				 * start of 1st entry is mis-aligned:
19063 				 * re-adjust by over-mapping.
19064 				 */
19065 				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
19066 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
19067 				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
19068 			} else {
19069 				misalignments++;
19070 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
19071 				assert(copy);
19072 			}
19073 		}
19074 
19075 		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
19076 			target_size += overmap_start;
19077 		} else {
19078 			target_entry->vme_start += overmap_start;
19079 		}
19080 		target_entry->vme_end += overmap_start;
19081 
19082 		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
19083 		if (object_offset_end & target_page_mask) {
19084 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19085 			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
19086 				/*
19087 				 * end of last entry is mis-aligned: re-adjust by over-mapping.
19088 				 */
19089 				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
19090 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
19091 				target_entry->vme_end += overmap_end;
19092 				target_size += overmap_end;
19093 			} else {
19094 				misalignments++;
19095 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
19096 				assert(copy);
19097 			}
19098 		}
19099 		target_entry->vme_start -= addr_adjustment;
19100 		target_entry->vme_end -= addr_adjustment;
19101 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19102 	}
19103 
19104 	target_copy_map->size = target_size;
19105 	target_copy_map->offset += overmap_start;
19106 	target_copy_map->offset -= addr_adjustment;
19107 	target_copy_map->cpy_hdr.page_shift = target_page_shift;
19108 
19109 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
19110 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
19111 	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
19112 	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
19113 
19114 	*target_copy_map_p = target_copy_map;
19115 	*overmap_start_p = overmap_start;
19116 	*overmap_end_p = overmap_end;
19117 
19118 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19119 	return KERN_SUCCESS;
19120 }
19121 
19122 kern_return_t
vm_map_range_physical_size(vm_map_t map,vm_map_address_t start,mach_vm_size_t size,mach_vm_size_t * phys_size)19123 vm_map_range_physical_size(
19124 	vm_map_t         map,
19125 	vm_map_address_t start,
19126 	mach_vm_size_t   size,
19127 	mach_vm_size_t * phys_size)
19128 {
19129 	kern_return_t   kr;
19130 	vm_map_copy_t   copy_map, target_copy_map;
19131 	vm_map_offset_t adjusted_start, adjusted_end;
19132 	vm_map_size_t   adjusted_size;
19133 	vm_prot_t       cur_prot, max_prot;
19134 	vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
19135 	vm_map_kernel_flags_t vmk_flags;
19136 
19137 	if (size == 0) {
19138 		DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
19139 		*phys_size = 0;
19140 		return KERN_SUCCESS;
19141 	}
19142 
19143 	adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
19144 	adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
19145 	if (__improbable(os_add_overflow(start, size, &end) ||
19146 	    adjusted_end <= adjusted_start)) {
19147 		/* wraparound */
19148 		printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
19149 		*phys_size = 0;
19150 		return KERN_INVALID_ARGUMENT;
19151 	}
19152 	if (__improbable(vm_map_range_overflows(map, start, size))) {
19153 		*phys_size = 0;
19154 		return KERN_INVALID_ADDRESS;
19155 	}
19156 	assert(adjusted_end > adjusted_start);
19157 	adjusted_size = adjusted_end - adjusted_start;
19158 	*phys_size = adjusted_size;
19159 	if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
19160 		return KERN_SUCCESS;
19161 	}
19162 	if (start == 0) {
19163 		adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
19164 		adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
19165 		if (__improbable(adjusted_end <= adjusted_start)) {
19166 			/* wraparound */
19167 			printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
19168 			*phys_size = 0;
19169 			return KERN_INVALID_ARGUMENT;
19170 		}
19171 		assert(adjusted_end > adjusted_start);
19172 		adjusted_size = adjusted_end - adjusted_start;
19173 		*phys_size = adjusted_size;
19174 		return KERN_SUCCESS;
19175 	}
19176 
19177 	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
19178 	vmk_flags.vmkf_copy_pageable = TRUE;
19179 	vmk_flags.vmkf_copy_same_map = TRUE;
19180 	assert(adjusted_size != 0);
19181 	cur_prot = VM_PROT_NONE; /* legacy mode */
19182 	max_prot = VM_PROT_NONE; /* legacy mode */
19183 	vmk_flags.vmkf_remap_legacy_mode = true;
19184 	kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
19185 	    FALSE /* copy */,
19186 	    &copy_map,
19187 	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
19188 	    vmk_flags);
19189 	if (kr != KERN_SUCCESS) {
19190 		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19191 		//assert(0);
19192 		*phys_size = 0;
19193 		return kr;
19194 	}
19195 	assert(copy_map != VM_MAP_COPY_NULL);
19196 	target_copy_map = copy_map;
19197 	DEBUG4K_ADJUST("adjusting...\n");
19198 	kr = vm_map_copy_adjust_to_target(
19199 		copy_map,
19200 		start - adjusted_start, /* offset */
19201 		size, /* size */
19202 		kernel_map,
19203 		FALSE,                          /* copy */
19204 		&target_copy_map,
19205 		&overmap_start,
19206 		&overmap_end,
19207 		&trimmed_start);
19208 	if (kr == KERN_SUCCESS) {
19209 		if (target_copy_map->size != *phys_size) {
19210 			DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
19211 		}
19212 		*phys_size = target_copy_map->size;
19213 	} else {
19214 		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19215 		//assert(0);
19216 		*phys_size = 0;
19217 	}
19218 	vm_map_copy_discard(copy_map);
19219 	copy_map = VM_MAP_COPY_NULL;
19220 
19221 	return kr;
19222 }
19223 
19224 static __attribute__((always_inline, warn_unused_result))
19225 kern_return_t
vm_map_remap_sanitize(vm_map_t src_map,vm_map_t target_map,vm_map_address_ut address_u,vm_map_size_ut size_u,vm_map_offset_ut mask_u,vm_map_offset_ut memory_address_u,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,vm_map_kernel_flags_t vmk_flags,vm_map_address_t * target_addr,vm_map_address_t * mask,vm_map_offset_t * memory_address,vm_map_offset_t * memory_end,vm_map_size_t * memory_size,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t * inheritance)19226 vm_map_remap_sanitize(
19227 	vm_map_t                src_map,
19228 	vm_map_t                target_map,
19229 	vm_map_address_ut       address_u,
19230 	vm_map_size_ut          size_u,
19231 	vm_map_offset_ut        mask_u,
19232 	vm_map_offset_ut        memory_address_u,
19233 	vm_prot_ut              cur_protection_u,
19234 	vm_prot_ut              max_protection_u,
19235 	vm_inherit_ut           inheritance_u,
19236 	vm_map_kernel_flags_t   vmk_flags,
19237 	vm_map_address_t       *target_addr,
19238 	vm_map_address_t       *mask,
19239 	vm_map_offset_t        *memory_address,
19240 	vm_map_offset_t        *memory_end,
19241 	vm_map_size_t          *memory_size,
19242 	vm_prot_t              *cur_protection,
19243 	vm_prot_t              *max_protection,
19244 	vm_inherit_t           *inheritance)
19245 {
19246 	kern_return_t           result;
19247 	vm_sanitize_flags_t     vm_sanitize_flags;
19248 
19249 	result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_VM_MAP_REMAP,
19250 	    inheritance);
19251 	if (__improbable(result != KERN_SUCCESS)) {
19252 		return result;
19253 	}
19254 
19255 	result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
19256 	    VM_SANITIZE_CALLER_VM_MAP_REMAP, target_map,
19257 	    cur_protection, max_protection);
19258 	if (__improbable(result != KERN_SUCCESS)) {
19259 		return result;
19260 	}
19261 
19262 	result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_VM_MAP_REMAP, mask);
19263 	if (__improbable(result != KERN_SUCCESS)) {
19264 		return result;
19265 	}
19266 
19267 	/*
19268 	 * If the user is requesting that we return the address of the
19269 	 * first byte of the data (rather than the base of the page),
19270 	 * then we use different rounding semantics: specifically,
19271 	 * we assume that (memory_address, size) describes a region
19272 	 * all of whose pages we must cover, rather than a base to be truncated
19273 	 * down and a size to be added to that base.  So we figure out
19274 	 * the highest page that the requested region includes and make
19275 	 * sure that the size will cover it.
19276 	 *
19277 	 * The key example we're worried about it is of the form:
19278 	 *
19279 	 *              memory_address = 0x1ff0, size = 0x20
19280 	 *
19281 	 * With the old semantics, we round down the memory_address to 0x1000
19282 	 * and round up the size to 0x1000, resulting in our covering *only*
19283 	 * page 0x1000.  With the new semantics, we'd realize that the region covers
19284 	 * 0x1ff0-0x2010, and compute a size of 0x2000.  Thus, we cover both page
19285 	 * 0x1000 and page 0x2000 in the region we remap.
19286 	 *
19287 	 * VM_SANITIZE_FLAGS_REALIGN_START asks for the old (broken) semantics.
19288 	 */
19289 	vm_sanitize_flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS;
19290 	if (!vmk_flags.vmf_return_data_addr) {
19291 		vm_sanitize_flags |= VM_SANITIZE_FLAGS_REALIGN_START;
19292 	}
19293 
19294 	result = vm_sanitize_addr_size(memory_address_u, size_u,
19295 	    VM_SANITIZE_CALLER_VM_MAP_REMAP, src_map,
19296 	    vm_sanitize_flags, memory_address, memory_end,
19297 	    memory_size);
19298 	if (__improbable(result != KERN_SUCCESS)) {
19299 		return result;
19300 	}
19301 
19302 	*target_addr = vm_sanitize_addr(target_map, address_u);
19303 	return KERN_SUCCESS;
19304 }
19305 
19306 /*
19307  *	Routine:	vm_remap
19308  *
19309  *			Map portion of a task's address space.
19310  *			Mapped region must not overlap more than
19311  *			one vm memory object. Protections and
19312  *			inheritance attributes remain the same
19313  *			as in the original task and are	out parameters.
19314  *			Source and Target task can be identical
19315  *			Other attributes are identical as for vm_map()
19316  */
19317 kern_return_t
vm_map_remap(vm_map_t target_map,vm_map_address_ut * address_u,vm_map_size_ut size_u,vm_map_offset_ut mask_u,vm_map_kernel_flags_t vmk_flags,vm_map_t src_map,vm_map_offset_ut memory_address_u,boolean_t copy,vm_prot_ut * cur_protection_u,vm_prot_ut * max_protection_u,vm_inherit_ut inheritance_u)19318 vm_map_remap(
19319 	vm_map_t                target_map,
19320 	vm_map_address_ut      *address_u,
19321 	vm_map_size_ut          size_u,
19322 	vm_map_offset_ut        mask_u,
19323 	vm_map_kernel_flags_t   vmk_flags,
19324 	vm_map_t                src_map,
19325 	vm_map_offset_ut        memory_address_u,
19326 	boolean_t               copy,
19327 	vm_prot_ut             *cur_protection_u, /* IN/OUT */
19328 	vm_prot_ut             *max_protection_u, /* IN/OUT */
19329 	vm_inherit_ut           inheritance_u)
19330 {
19331 	vm_map_address_t        target_addr, mask;
19332 	vm_map_size_t           target_size;
19333 	vm_map_offset_t         memory_address, memory_end;
19334 	vm_map_size_t           memory_size;
19335 	vm_prot_t               cur_protection, max_protection;
19336 	vm_inherit_t            inheritance;
19337 	kern_return_t           result;
19338 	vm_map_entry_t          insp_entry = VM_MAP_ENTRY_NULL;
19339 	vm_map_copy_t           copy_map;
19340 	vm_map_offset_t         offset_in_mapping;
19341 	vm_map_size_t           src_page_mask, target_page_mask;
19342 	vm_map_size_t           initial_size;
19343 	VM_MAP_ZAP_DECLARE(zap_list);
19344 
19345 	if (target_map == VM_MAP_NULL || src_map == VM_MAP_NULL) {
19346 		return KERN_INVALID_ARGUMENT;
19347 	}
19348 	src_page_mask    = VM_MAP_PAGE_MASK(src_map);
19349 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
19350 
19351 	if (src_page_mask != target_page_mask) {
19352 		if (copy) {
19353 			DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19354 		} else {
19355 			DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19356 		}
19357 	}
19358 
19359 	/*
19360 	 * Sanitize any input parameters that are addr/size/prot/inherit
19361 	 */
19362 	result = vm_map_remap_sanitize(src_map,
19363 	    target_map,
19364 	    *address_u,
19365 	    size_u,
19366 	    mask_u,
19367 	    memory_address_u,
19368 	    *cur_protection_u,
19369 	    *max_protection_u,
19370 	    inheritance_u,
19371 	    vmk_flags,
19372 	    &target_addr,
19373 	    &mask,
19374 	    &memory_address,
19375 	    &memory_end,
19376 	    &memory_size,
19377 	    &cur_protection,
19378 	    &max_protection,
19379 	    &inheritance);
19380 	if (__improbable(result != KERN_SUCCESS)) {
19381 		return vm_sanitize_get_kr(result);
19382 	}
19383 
19384 	if (vmk_flags.vmf_return_data_addr) {
19385 		/*
19386 		 * This is safe to unwrap now that the quantities
19387 		 * have been validated and rounded up normally.
19388 		 */
19389 		offset_in_mapping = vm_sanitize_offset_in_page(src_map,
19390 		    memory_address_u);
19391 		initial_size = VM_SANITIZE_UNSAFE_UNWRAP(size_u);
19392 	} else {
19393 		/*
19394 		 * IMPORTANT:
19395 		 * This legacy code path is broken: for the range mentioned
19396 		 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
19397 		 * two 4k pages, it yields [ memory_address = 0x1000,
19398 		 * size = 0x1000 ], which covers only the first 4k page.
19399 		 * BUT some code unfortunately depends on this bug, so we
19400 		 * can't fix it without breaking something.
19401 		 * New code should get automatically opted in the new
19402 		 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
19403 		 */
19404 		offset_in_mapping = 0;
19405 		initial_size = memory_size;
19406 	}
19407 
19408 	if (vmk_flags.vmf_resilient_media) {
19409 		/* must be copy-on-write to be "media resilient" */
19410 		if (!copy) {
19411 			return KERN_INVALID_ARGUMENT;
19412 		}
19413 	}
19414 
19415 	vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
19416 	vmk_flags.vmkf_copy_same_map = (src_map == target_map);
19417 
19418 	assert(memory_size != 0);
19419 	result = vm_map_copy_extract(src_map,
19420 	    memory_address,
19421 	    memory_size,
19422 	    copy, &copy_map,
19423 	    &cur_protection, /* IN/OUT */
19424 	    &max_protection, /* IN/OUT */
19425 	    inheritance,
19426 	    vmk_flags);
19427 	if (result != KERN_SUCCESS) {
19428 		return result;
19429 	}
19430 	assert(copy_map != VM_MAP_COPY_NULL);
19431 
19432 	/*
19433 	 * Handle the policy for vm map ranges
19434 	 *
19435 	 * If the maps differ, the target_map policy applies like for vm_map()
19436 	 * For same mapping remaps, we preserve the range.
19437 	 */
19438 	if (vmk_flags.vmkf_copy_same_map) {
19439 		vmk_flags.vmkf_range_id = copy_map->orig_range;
19440 	} else {
19441 		vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, memory_size);
19442 	}
19443 
19444 	target_size = memory_size;
19445 	if (src_page_mask != target_page_mask) {
19446 		vm_map_copy_t   target_copy_map;
19447 		vm_map_offset_t overmap_start = 0;
19448 		vm_map_offset_t overmap_end   = 0;
19449 		vm_map_offset_t trimmed_start = 0;
19450 
19451 		target_copy_map = copy_map; /* can modify "copy_map" itself */
19452 		DEBUG4K_ADJUST("adjusting...\n");
19453 		result = vm_map_copy_adjust_to_target(
19454 			copy_map,
19455 			offset_in_mapping, /* offset */
19456 			initial_size,
19457 			target_map,
19458 			copy,
19459 			&target_copy_map,
19460 			&overmap_start,
19461 			&overmap_end,
19462 			&trimmed_start);
19463 		if (result != KERN_SUCCESS) {
19464 			DEBUG4K_COPY("failed to adjust 0x%x\n", result);
19465 			vm_map_copy_discard(copy_map);
19466 			return result;
19467 		}
19468 		if (trimmed_start == 0) {
19469 			/* nothing trimmed: no adjustment needed */
19470 		} else if (trimmed_start >= offset_in_mapping) {
19471 			/* trimmed more than offset_in_mapping: nothing left */
19472 			assert(overmap_start == 0);
19473 			assert(overmap_end == 0);
19474 			offset_in_mapping = 0;
19475 		} else {
19476 			/* trimmed some of offset_in_mapping: adjust */
19477 			assert(overmap_start == 0);
19478 			assert(overmap_end == 0);
19479 			offset_in_mapping -= trimmed_start;
19480 		}
19481 		offset_in_mapping += overmap_start;
19482 		target_size = target_copy_map->size;
19483 	}
19484 
19485 	/*
19486 	 * Allocate/check a range of free virtual address
19487 	 * space for the target
19488 	 */
19489 	target_size = vm_map_round_page(target_size, target_page_mask);
19490 
19491 	if (target_size == 0) {
19492 		vm_map_copy_discard(copy_map);
19493 		return KERN_INVALID_ARGUMENT;
19494 	}
19495 
19496 	if (__improbable(!vm_map_is_map_size_valid(
19497 		    target_map, target_size, vmk_flags.vmkf_no_soft_limit))) {
19498 		vm_map_copy_discard(copy_map);
19499 		return KERN_NO_SPACE;
19500 	}
19501 
19502 	vm_map_lock(target_map);
19503 
19504 	if (!vmk_flags.vmf_fixed) {
19505 		result = vm_map_locate_space_anywhere(target_map, target_size,
19506 		    mask, vmk_flags, &target_addr, &insp_entry);
19507 	} else {
19508 		/*
19509 		 * vm_map_locate_space_fixed will reject overflowing
19510 		 * target_addr + target_size values
19511 		 */
19512 		result = vm_map_locate_space_fixed(target_map, target_addr,
19513 		    target_size, mask, vmk_flags, &insp_entry, &zap_list);
19514 
19515 		if (result == KERN_MEMORY_PRESENT) {
19516 			assert(!vmk_flags.vmkf_already);
19517 			insp_entry = VM_MAP_ENTRY_NULL;
19518 			result = KERN_NO_SPACE;
19519 		}
19520 	}
19521 
19522 	if (result == KERN_SUCCESS) {
19523 		while (vm_map_copy_first_entry(copy_map) !=
19524 		    vm_map_copy_to_entry(copy_map)) {
19525 			vm_map_entry_t entry = vm_map_copy_first_entry(copy_map);
19526 
19527 			vm_map_copy_entry_unlink(copy_map, entry);
19528 
19529 			if (vmk_flags.vmkf_remap_prot_copy) {
19530 				/*
19531 				 * This vm_map_remap() is for a
19532 				 * vm_protect(VM_PROT_COPY), so the caller
19533 				 * expects to be allowed to add write access
19534 				 * to this new mapping.  This is done by
19535 				 * adding VM_PROT_WRITE to each entry's
19536 				 * max_protection... unless some security
19537 				 * settings disallow it.
19538 				 */
19539 				bool allow_write = false;
19540 				if (entry->vme_permanent) {
19541 					/* immutable mapping... */
19542 					if ((entry->max_protection & VM_PROT_EXECUTE) &&
19543 					    developer_mode_state()) {
19544 						/*
19545 						 * ... but executable and
19546 						 * possibly being debugged,
19547 						 * so let's allow it to become
19548 						 * writable, for breakpoints
19549 						 * and dtrace probes, for
19550 						 * example.
19551 						 */
19552 						allow_write = true;
19553 					} else {
19554 						printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
19555 						    proc_selfpid(),
19556 						    (get_bsdtask_info(current_task())
19557 						    ? proc_name_address(get_bsdtask_info(current_task()))
19558 						    : "?"),
19559 						    (uint64_t)memory_address,
19560 						    (uint64_t)memory_size,
19561 						    entry->protection,
19562 						    entry->max_protection,
19563 						    developer_mode_state());
19564 						DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
19565 						    vm_map_entry_t, entry,
19566 						    vm_map_offset_t, entry->vme_start,
19567 						    vm_map_offset_t, entry->vme_end,
19568 						    vm_prot_t, entry->protection,
19569 						    vm_prot_t, entry->max_protection,
19570 						    int, VME_ALIAS(entry));
19571 					}
19572 				} else {
19573 					allow_write = true;
19574 				}
19575 
19576 				/*
19577 				 * VM_PROT_COPY: allow this mapping to become
19578 				 * writable, unless it was "permanent".
19579 				 */
19580 				if (allow_write) {
19581 					entry->max_protection |= VM_PROT_WRITE;
19582 				}
19583 			}
19584 			if (vmk_flags.vmf_resilient_codesign) {
19585 				/* no codesigning -> read-only access */
19586 				entry->max_protection = VM_PROT_READ;
19587 				entry->protection = VM_PROT_READ;
19588 				entry->vme_resilient_codesign = TRUE;
19589 			}
19590 			entry->vme_start += target_addr;
19591 			entry->vme_end += target_addr;
19592 			assert(!entry->map_aligned);
19593 			if (vmk_flags.vmf_resilient_media &&
19594 			    !entry->is_sub_map &&
19595 			    (VME_OBJECT(entry) == VM_OBJECT_NULL ||
19596 			    VME_OBJECT(entry)->internal)) {
19597 				entry->vme_resilient_media = TRUE;
19598 			}
19599 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
19600 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
19601 			assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
19602 			vm_map_store_entry_link(target_map, insp_entry, entry,
19603 			    vmk_flags);
19604 			insp_entry = entry;
19605 		}
19606 	}
19607 
19608 	if (vmk_flags.vmf_resilient_codesign) {
19609 		cur_protection = VM_PROT_READ;
19610 		max_protection = VM_PROT_READ;
19611 	}
19612 
19613 	if (result == KERN_SUCCESS) {
19614 		target_map->size += target_size;
19615 		SAVE_HINT_MAP_WRITE(target_map, insp_entry);
19616 	}
19617 	vm_map_unlock(target_map);
19618 
19619 	vm_map_zap_dispose(&zap_list);
19620 
19621 	if (result == KERN_SUCCESS && target_map->wiring_required) {
19622 		result = vm_map_wire_nested(target_map, target_addr,
19623 		    target_addr + target_size, cur_protection, VM_KERN_MEMORY_MLOCK,
19624 		    TRUE, PMAP_NULL, 0, NULL);
19625 	}
19626 
19627 	if (result == KERN_SUCCESS) {
19628 #if KASAN
19629 		if (target_map->pmap == kernel_pmap) {
19630 			kasan_notify_address(target_addr, target_size);
19631 		}
19632 #endif
19633 		/*
19634 		 * If requested, return the address of the data pointed to by the
19635 		 * request, rather than the base of the resulting page.
19636 		 */
19637 		if (vmk_flags.vmf_return_data_addr) {
19638 			target_addr += offset_in_mapping;
19639 		}
19640 
19641 		/*
19642 		 * Update OUT parameters.
19643 		 */
19644 		*address_u = vm_sanitize_wrap_addr(target_addr);
19645 
19646 		*cur_protection_u = vm_sanitize_wrap_prot(cur_protection);
19647 		*max_protection_u = vm_sanitize_wrap_prot(max_protection);
19648 	}
19649 
19650 	if (src_page_mask != target_page_mask) {
19651 		DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx  result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)target_size, copy, target_map, (uint64_t)target_addr, (uint64_t)offset_in_mapping, result);
19652 	}
19653 	vm_map_copy_discard(copy_map);
19654 	copy_map = VM_MAP_COPY_NULL;
19655 
19656 	return result;
19657 }
19658 
19659 /*
19660  *	vm_map_switch_to:
19661  *
19662  *	Set the address map for the current thread to the specified map.
19663  *  Returns a struct containing info about the previous map, which should be
19664  *  restored with `vm_map_switch_back`
19665  */
19666 
19667 vm_map_switch_context_t
vm_map_switch_to(vm_map_t map)19668 vm_map_switch_to(vm_map_t map)
19669 {
19670 	thread_t thread = current_thread();
19671 	vm_map_t oldmap = thread->map;
19672 
19673 	/*
19674 	 * Deactivate the current map and activate the requested map
19675 	 */
19676 	mp_disable_preemption();
19677 	PMAP_SWITCH_USER(thread, map, cpu_number());
19678 	mp_enable_preemption();
19679 
19680 	vm_map_lock(map);
19681 	task_t task = map->owning_task;
19682 	if (task) {
19683 		task_reference(task);
19684 	}
19685 	vm_map_unlock(map);
19686 
19687 	return (vm_map_switch_context_t) { oldmap, task };
19688 }
19689 
19690 void
vm_map_switch_back(vm_map_switch_context_t ctx)19691 vm_map_switch_back(vm_map_switch_context_t ctx)
19692 {
19693 	thread_t thread = current_thread();
19694 	task_t task = ctx.task;
19695 	vm_map_t map = ctx.map;
19696 
19697 	if (task) {
19698 		task_deallocate(task);
19699 	} else {
19700 		/*
19701 		 * We want to make sure that vm_map_setup was not called while the
19702 		 * map was switched. This allows us to guarantee the property that
19703 		 * we always have a reference on current_map()->owning_task if it is
19704 		 * not NULL.
19705 		 */
19706 		assert(!thread->map->owning_task);
19707 	}
19708 
19709 	/*
19710 	 * Restore the original map from prior to vm_map_switch_to
19711 	 */
19712 	mp_disable_preemption();
19713 	PMAP_SWITCH_USER(thread, map, cpu_number());
19714 	mp_enable_preemption();
19715 }
19716 
19717 static __attribute__((always_inline, warn_unused_result))
19718 kern_return_t
vm_map_rw_user_sanitize(vm_map_t map,vm_map_address_ut addr_u,vm_size_ut size_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_address_t * addr,vm_map_address_t * end,vm_map_size_t * size)19719 vm_map_rw_user_sanitize(
19720 	vm_map_t                map,
19721 	vm_map_address_ut       addr_u,
19722 	vm_size_ut              size_u,
19723 	vm_sanitize_caller_t    vm_sanitize_caller,
19724 	vm_map_address_t       *addr,
19725 	vm_map_address_t       *end,
19726 	vm_map_size_t          *size)
19727 {
19728 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
19729 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
19730 	    VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
19731 
19732 	return vm_sanitize_addr_size(addr_u, size_u,
19733 	           vm_sanitize_caller, map,
19734 	           flags,
19735 	           addr, end, size);
19736 }
19737 
19738 /*
19739  *	Routine:	vm_map_write_user
19740  *
19741  *	Description:
19742  *		Copy out data from a kernel space into space in the
19743  *		destination map. The space must already exist in the
19744  *		destination map.
19745  *		NOTE:  This routine should only be called by threads
19746  *		which can block on a page fault. i.e. kernel mode user
19747  *		threads.
19748  *
19749  */
19750 kern_return_t
vm_map_write_user(vm_map_t map,void * src_p,vm_map_address_ut dst_addr_u,vm_size_ut size_u)19751 vm_map_write_user(
19752 	vm_map_t                map,
19753 	void                   *src_p,
19754 	vm_map_address_ut       dst_addr_u,
19755 	vm_size_ut              size_u)
19756 {
19757 	kern_return_t    kr;
19758 	vm_map_address_t dst_addr, dst_end;
19759 	vm_map_size_t    size;
19760 
19761 	/*
19762 	 * src_p isn't validated: [src_p, src_p + size_u)
19763 	 * is trusted kernel input.
19764 	 *
19765 	 * dst_addr_u and size_u are untrusted and need to be sanitized.
19766 	 */
19767 	kr = vm_map_rw_user_sanitize(map,
19768 	    dst_addr_u,
19769 	    size_u,
19770 	    VM_SANITIZE_CALLER_VM_MAP_WRITE_USER,
19771 	    &dst_addr,
19772 	    &dst_end,
19773 	    &size);
19774 	if (__improbable(kr != KERN_SUCCESS)) {
19775 		return vm_sanitize_get_kr(kr);
19776 	}
19777 
19778 	if (current_map() == map) {
19779 		if (copyout(src_p, dst_addr, size)) {
19780 			kr = KERN_INVALID_ADDRESS;
19781 		}
19782 	} else {
19783 		vm_map_switch_context_t switch_ctx;
19784 
19785 		/* take on the identity of the target map while doing */
19786 		/* the transfer */
19787 
19788 		vm_map_reference(map);
19789 		switch_ctx = vm_map_switch_to(map);
19790 		if (copyout(src_p, dst_addr, size)) {
19791 			kr = KERN_INVALID_ADDRESS;
19792 		}
19793 		vm_map_switch_back(switch_ctx);
19794 		vm_map_deallocate(map);
19795 	}
19796 	return kr;
19797 }
19798 
19799 /*
19800  *	Routine:	vm_map_read_user
19801  *
19802  *	Description:
19803  *		Copy in data from a user space source map into the
19804  *		kernel map. The space must already exist in the
19805  *		kernel map.
19806  *		NOTE:  This routine should only be called by threads
19807  *		which can block on a page fault. i.e. kernel mode user
19808  *		threads.
19809  *
19810  */
19811 kern_return_t
vm_map_read_user(vm_map_t map,vm_map_address_ut src_addr_u,void * dst_p,vm_size_ut size_u)19812 vm_map_read_user(
19813 	vm_map_t                map,
19814 	vm_map_address_ut       src_addr_u,
19815 	void                   *dst_p,
19816 	vm_size_ut              size_u)
19817 {
19818 	kern_return_t    kr;
19819 	vm_map_address_t src_addr, src_end;
19820 	vm_map_size_t    size;
19821 
19822 	/*
19823 	 * dst_p isn't validated: [dst_p, dst_p + size_u)
19824 	 * is trusted kernel input.
19825 	 *
19826 	 * src_addr_u and size_u are untrusted and need to be sanitized.
19827 	 */
19828 	kr = vm_map_rw_user_sanitize(map,
19829 	    src_addr_u,
19830 	    size_u,
19831 	    VM_SANITIZE_CALLER_VM_MAP_READ_USER,
19832 	    &src_addr,
19833 	    &src_end,
19834 	    &size);
19835 	if (__improbable(kr != KERN_SUCCESS)) {
19836 		return vm_sanitize_get_kr(kr);
19837 	}
19838 
19839 	if (current_map() == map) {
19840 		if (copyin(src_addr, dst_p, size)) {
19841 			kr = KERN_INVALID_ADDRESS;
19842 		}
19843 	} else {
19844 		vm_map_switch_context_t switch_ctx;
19845 
19846 		/* take on the identity of the target map while doing */
19847 		/* the transfer */
19848 
19849 		vm_map_reference(map);
19850 		switch_ctx = vm_map_switch_to(map);
19851 		if (copyin(src_addr, dst_p, size)) {
19852 			kr = KERN_INVALID_ADDRESS;
19853 		}
19854 		vm_map_switch_back(switch_ctx);
19855 		vm_map_deallocate(map);
19856 	}
19857 	return kr;
19858 }
19859 
19860 
19861 static __attribute__((always_inline, warn_unused_result))
19862 kern_return_t
vm_map_check_protection_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut protection_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * start,vm_map_offset_t * end,vm_prot_t * protection)19863 vm_map_check_protection_sanitize(
19864 	vm_map_t                map,
19865 	vm_map_offset_ut        start_u,
19866 	vm_map_offset_ut        end_u,
19867 	vm_prot_ut              protection_u,
19868 	vm_sanitize_caller_t    vm_sanitize_caller,
19869 	vm_map_offset_t        *start,
19870 	vm_map_offset_t        *end,
19871 	vm_prot_t              *protection)
19872 {
19873 	kern_return_t           kr;
19874 	vm_map_size_t           size;
19875 
19876 	kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
19877 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start, end,
19878 	    &size);
19879 	if (__improbable(kr != KERN_SUCCESS)) {
19880 		return kr;
19881 	}
19882 
19883 	/*
19884 	 * Given that the protection is used only for comparisons below
19885 	 * no sanitization is being applied on it.
19886 	 */
19887 	*protection = VM_SANITIZE_UNSAFE_UNWRAP(protection_u);
19888 
19889 	return KERN_SUCCESS;
19890 }
19891 
19892 /*
19893  *	vm_map_check_protection:
19894  *
19895  *	Assert that the target map allows the specified
19896  *	privilege on the entire address region given.
19897  *	The entire region must be allocated.
19898  */
19899 boolean_t
vm_map_check_protection(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut protection_u,vm_sanitize_caller_t vm_sanitize_caller)19900 vm_map_check_protection(
19901 	vm_map_t                map,
19902 	vm_map_offset_ut        start_u,
19903 	vm_map_offset_ut        end_u,
19904 	vm_prot_ut              protection_u,
19905 	vm_sanitize_caller_t    vm_sanitize_caller)
19906 {
19907 	vm_map_entry_t entry;
19908 	vm_map_entry_t tmp_entry;
19909 	vm_map_offset_t start;
19910 	vm_map_offset_t end;
19911 	vm_prot_t protection;
19912 	kern_return_t kr;
19913 
19914 	kr = vm_map_check_protection_sanitize(map,
19915 	    start_u,
19916 	    end_u,
19917 	    protection_u,
19918 	    vm_sanitize_caller,
19919 	    &start,
19920 	    &end,
19921 	    &protection);
19922 	if (__improbable(kr != KERN_SUCCESS)) {
19923 		kr = vm_sanitize_get_kr(kr);
19924 		if (kr == KERN_SUCCESS) {
19925 			return true;
19926 		}
19927 		return false;
19928 	}
19929 
19930 	vm_map_lock(map);
19931 
19932 	if (start < vm_map_min(map) || end > vm_map_max(map)) {
19933 		vm_map_unlock(map);
19934 		return false;
19935 	}
19936 
19937 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19938 		vm_map_unlock(map);
19939 		return false;
19940 	}
19941 
19942 	entry = tmp_entry;
19943 
19944 	while (start < end) {
19945 		if (entry == vm_map_to_entry(map)) {
19946 			vm_map_unlock(map);
19947 			return false;
19948 		}
19949 
19950 		/*
19951 		 *	No holes allowed!
19952 		 */
19953 
19954 		if (start < entry->vme_start) {
19955 			vm_map_unlock(map);
19956 			return false;
19957 		}
19958 
19959 		/*
19960 		 * Check protection associated with entry.
19961 		 */
19962 
19963 		if ((entry->protection & protection) != protection) {
19964 			vm_map_unlock(map);
19965 			return false;
19966 		}
19967 
19968 		/* go to next entry */
19969 
19970 		start = entry->vme_end;
19971 		entry = entry->vme_next;
19972 	}
19973 	vm_map_unlock(map);
19974 	return true;
19975 }
19976 
19977 kern_return_t
vm_map_purgable_control(vm_map_t map,vm_map_offset_ut address_u,vm_purgable_t control,int * state)19978 vm_map_purgable_control(
19979 	vm_map_t                map,
19980 	vm_map_offset_ut        address_u,
19981 	vm_purgable_t           control,
19982 	int                    *state)
19983 {
19984 	vm_map_offset_t         address;
19985 	vm_map_entry_t          entry;
19986 	vm_object_t             object;
19987 	kern_return_t           kr;
19988 	boolean_t               was_nonvolatile;
19989 
19990 	/*
19991 	 * Vet all the input parameters and current type and state of the
19992 	 * underlaying object.  Return with an error if anything is amiss.
19993 	 */
19994 	if (map == VM_MAP_NULL) {
19995 		return KERN_INVALID_ARGUMENT;
19996 	}
19997 
19998 	if (control != VM_PURGABLE_SET_STATE &&
19999 	    control != VM_PURGABLE_GET_STATE &&
20000 	    control != VM_PURGABLE_PURGE_ALL &&
20001 	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
20002 		return KERN_INVALID_ARGUMENT;
20003 	}
20004 
20005 	if (control == VM_PURGABLE_PURGE_ALL) {
20006 		vm_purgeable_object_purge_all();
20007 		return KERN_SUCCESS;
20008 	}
20009 
20010 	if ((control == VM_PURGABLE_SET_STATE ||
20011 	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
20012 	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
20013 	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
20014 		return KERN_INVALID_ARGUMENT;
20015 	}
20016 
20017 	address = vm_sanitize_addr(map, address_u);
20018 
20019 	vm_map_lock_read(map);
20020 
20021 	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
20022 		/*
20023 		 * Must pass a valid non-submap address.
20024 		 */
20025 		vm_map_unlock_read(map);
20026 		return KERN_INVALID_ADDRESS;
20027 	}
20028 
20029 	if ((entry->protection & VM_PROT_WRITE) == 0 &&
20030 	    control != VM_PURGABLE_GET_STATE) {
20031 		/*
20032 		 * Can't apply purgable controls to something you can't write.
20033 		 */
20034 		vm_map_unlock_read(map);
20035 		return KERN_PROTECTION_FAILURE;
20036 	}
20037 
20038 	object = VME_OBJECT(entry);
20039 	if (object == VM_OBJECT_NULL ||
20040 	    object->purgable == VM_PURGABLE_DENY) {
20041 		/*
20042 		 * Object must already be present and be purgeable.
20043 		 */
20044 		vm_map_unlock_read(map);
20045 		return KERN_INVALID_ARGUMENT;
20046 	}
20047 
20048 	vm_object_lock(object);
20049 
20050 #if 00
20051 	if (VME_OFFSET(entry) != 0 ||
20052 	    entry->vme_end - entry->vme_start != object->vo_size) {
20053 		/*
20054 		 * Can only apply purgable controls to the whole (existing)
20055 		 * object at once.
20056 		 */
20057 		vm_map_unlock_read(map);
20058 		vm_object_unlock(object);
20059 		return KERN_INVALID_ARGUMENT;
20060 	}
20061 #endif
20062 
20063 	assert(!entry->is_sub_map);
20064 	assert(!entry->use_pmap); /* purgeable has its own accounting */
20065 
20066 	vm_map_unlock_read(map);
20067 
20068 	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
20069 
20070 	kr = vm_object_purgable_control(object, control, state);
20071 
20072 	if (was_nonvolatile &&
20073 	    object->purgable != VM_PURGABLE_NONVOLATILE &&
20074 	    map->pmap == kernel_pmap) {
20075 #if DEBUG
20076 		object->vo_purgeable_volatilizer = kernel_task;
20077 #endif /* DEBUG */
20078 	}
20079 
20080 	vm_object_unlock(object);
20081 
20082 	return kr;
20083 }
20084 
20085 void
vm_map_footprint_query_page_info(vm_map_t map,vm_map_entry_t map_entry,vm_map_offset_t curr_s_offset,int * disposition_p)20086 vm_map_footprint_query_page_info(
20087 	vm_map_t        map,
20088 	vm_map_entry_t  map_entry,
20089 	vm_map_offset_t curr_s_offset,
20090 	int             *disposition_p)
20091 {
20092 	int             pmap_disp;
20093 	vm_object_t     object = VM_OBJECT_NULL;
20094 	int             disposition;
20095 	int             effective_page_size;
20096 
20097 	vm_map_lock_assert_held(map);
20098 	assert(!map->has_corpse_footprint);
20099 	assert(curr_s_offset >= map_entry->vme_start);
20100 	assert(curr_s_offset < map_entry->vme_end);
20101 
20102 	if (map_entry->is_sub_map) {
20103 		if (!map_entry->use_pmap) {
20104 			/* nested pmap: no footprint */
20105 			*disposition_p = 0;
20106 			return;
20107 		}
20108 	} else {
20109 		object = VME_OBJECT(map_entry);
20110 		if (object == VM_OBJECT_NULL) {
20111 			/* nothing mapped here: no need to ask */
20112 			*disposition_p = 0;
20113 			return;
20114 		}
20115 	}
20116 
20117 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
20118 
20119 	pmap_disp = 0;
20120 
20121 	/*
20122 	 * Query the pmap.
20123 	 */
20124 	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
20125 
20126 	/*
20127 	 * Compute this page's disposition.
20128 	 */
20129 	disposition = 0;
20130 
20131 	/* deal with "alternate accounting" first */
20132 	if (!map_entry->is_sub_map &&
20133 	    object->vo_no_footprint) {
20134 		/* does not count in footprint */
20135 //		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20136 	} else if (!map_entry->is_sub_map &&
20137 	    !object->internal &&
20138 	    object->vo_ledger_tag &&
20139 	    VM_OBJECT_OWNER(object) != NULL &&
20140 	    VM_OBJECT_OWNER(object)->map == map) {
20141 		/* owned external object: wired pages count in footprint */
20142 		assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20143 		if ((((curr_s_offset
20144 		    - map_entry->vme_start
20145 		    + VME_OFFSET(map_entry))
20146 		    / effective_page_size) <
20147 		    object->wired_page_count)) {
20148 			/*
20149 			 * External object owned by this task: report the first
20150 			 * "#wired" pages as "resident" (to show that they
20151 			 * contribute to the footprint) but not "dirty"
20152 			 * (to avoid double-counting with the fake "owned"
20153 			 * region we'll report at the end of the address space
20154 			 * to account for all (mapped or not) owned memory
20155 			 * owned by this task.
20156 			 */
20157 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20158 		}
20159 	} else if (!map_entry->is_sub_map &&
20160 	    object->internal &&
20161 	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
20162 	    (object->purgable == VM_PURGABLE_DENY &&
20163 	    object->vo_ledger_tag)) &&
20164 	    VM_OBJECT_OWNER(object) != NULL &&
20165 	    VM_OBJECT_OWNER(object)->map == map) {
20166 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20167 		if ((((curr_s_offset
20168 		    - map_entry->vme_start
20169 		    + VME_OFFSET(map_entry))
20170 		    / effective_page_size) <
20171 		    (object->resident_page_count +
20172 		    vm_compressor_pager_get_count(object->pager)))) {
20173 			/*
20174 			 * Non-volatile purgeable object owned
20175 			 * by this task: report the first
20176 			 * "#resident + #compressed" pages as
20177 			 * "resident" (to show that they
20178 			 * contribute to the footprint) but not
20179 			 * "dirty" (to avoid double-counting
20180 			 * with the fake "non-volatile" region
20181 			 * we'll report at the end of the
20182 			 * address space to account for all
20183 			 * (mapped or not) non-volatile memory
20184 			 * owned by this task.
20185 			 */
20186 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20187 		}
20188 	} else if (!map_entry->is_sub_map &&
20189 	    object->internal &&
20190 	    (object->purgable == VM_PURGABLE_VOLATILE ||
20191 	    object->purgable == VM_PURGABLE_EMPTY) &&
20192 	    VM_OBJECT_OWNER(object) != NULL &&
20193 	    VM_OBJECT_OWNER(object)->map == map) {
20194 		if (object->internal) {
20195 			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20196 		}
20197 		if ((((curr_s_offset
20198 		    - map_entry->vme_start
20199 		    + VME_OFFSET(map_entry))
20200 		    / effective_page_size) <
20201 		    object->wired_page_count)) {
20202 			/*
20203 			 * Volatile|empty purgeable object owned
20204 			 * by this task: report the first
20205 			 * "#wired" pages as "resident" (to
20206 			 * show that they contribute to the
20207 			 * footprint) but not "dirty" (to avoid
20208 			 * double-counting with the fake
20209 			 * "non-volatile" region we'll report
20210 			 * at the end of the address space to
20211 			 * account for all (mapped or not)
20212 			 * non-volatile memory owned by this
20213 			 * task.
20214 			 */
20215 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20216 		}
20217 	} else if (!map_entry->is_sub_map &&
20218 	    map_entry->iokit_acct &&
20219 	    object->internal &&
20220 	    object->purgable == VM_PURGABLE_DENY) {
20221 		/*
20222 		 * Non-purgeable IOKit memory: phys_footprint
20223 		 * includes the entire virtual mapping.
20224 		 */
20225 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20226 		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20227 		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20228 	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
20229 	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
20230 		/* alternate accounting */
20231 #if __arm64__ && (DEVELOPMENT || DEBUG)
20232 		if (map->pmap->footprint_was_suspended) {
20233 			/*
20234 			 * The assertion below can fail if dyld
20235 			 * suspended footprint accounting
20236 			 * while doing some adjustments to
20237 			 * this page;  the mapping would say
20238 			 * "use pmap accounting" but the page
20239 			 * would be marked "alternate
20240 			 * accounting".
20241 			 */
20242 		} else
20243 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
20244 		{
20245 			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20246 		}
20247 		disposition = 0;
20248 	} else {
20249 		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
20250 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20251 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20252 			disposition |= VM_PAGE_QUERY_PAGE_REF;
20253 			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
20254 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20255 			} else {
20256 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20257 			}
20258 			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
20259 				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20260 			}
20261 		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
20262 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20263 			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20264 		}
20265 	}
20266 
20267 	*disposition_p = disposition;
20268 }
20269 
20270 kern_return_t
vm_map_page_info(vm_map_t map,vm_map_offset_ut offset_u,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)20271 vm_map_page_info(
20272 	vm_map_t                map,
20273 	vm_map_offset_ut        offset_u,
20274 	vm_page_info_flavor_t   flavor,
20275 	vm_page_info_t          info,
20276 	mach_msg_type_number_t  *count)
20277 {
20278 	return vm_map_page_range_info_internal(map,
20279 	           offset_u, /* start of range */
20280 	           vm_sanitize_compute_ut_end(offset_u, 1), /* this will get rounded in the call to the page boundary */
20281 	           (int)-1, /* effective_page_shift: unspecified */
20282 	           flavor,
20283 	           info,
20284 	           count);
20285 }
20286 
20287 static __attribute__((always_inline, warn_unused_result))
20288 kern_return_t
vm_map_page_range_info_sanitize(vm_map_t map,vm_map_offset_ut start_offset_u,vm_map_offset_ut end_offset_u,vm_map_offset_t effective_page_mask,vm_map_offset_t * start,vm_map_offset_t * end,vm_map_offset_t * offset_in_page)20289 vm_map_page_range_info_sanitize(
20290 	vm_map_t                map,
20291 	vm_map_offset_ut        start_offset_u,
20292 	vm_map_offset_ut        end_offset_u,
20293 	vm_map_offset_t         effective_page_mask,
20294 	vm_map_offset_t        *start,
20295 	vm_map_offset_t        *end,
20296 	vm_map_offset_t        *offset_in_page)
20297 {
20298 	kern_return_t           retval;
20299 	vm_map_size_t           size;
20300 
20301 	/*
20302 	 * Perform validation against map's mask but don't align start/end,
20303 	 * as we need for those to be aligned wrt effective_page_mask
20304 	 */
20305 	retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20306 	    VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, map,
20307 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
20308 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES, start,
20309 	    end, &size);
20310 	if (retval != KERN_SUCCESS) {
20311 		return retval;
20312 	}
20313 
20314 	retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20315 	    VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, effective_page_mask,
20316 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start,
20317 	    end, &size);
20318 	if (retval != KERN_SUCCESS) {
20319 		return retval;
20320 	}
20321 
20322 	*offset_in_page = vm_sanitize_offset_in_page(effective_page_mask,
20323 	    start_offset_u);
20324 
20325 	return KERN_SUCCESS;
20326 }
20327 
20328 kern_return_t
vm_map_page_range_info_internal(vm_map_t map,vm_map_offset_ut start_offset_u,vm_map_offset_ut end_offset_u,int effective_page_shift,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)20329 vm_map_page_range_info_internal(
20330 	vm_map_t                map,
20331 	vm_map_offset_ut        start_offset_u,
20332 	vm_map_offset_ut        end_offset_u,
20333 	int                     effective_page_shift,
20334 	vm_page_info_flavor_t   flavor,
20335 	vm_page_info_t          info,
20336 	mach_msg_type_number_t  *count)
20337 {
20338 	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
20339 	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
20340 	vm_page_t               m = VM_PAGE_NULL;
20341 	kern_return_t           retval = KERN_SUCCESS;
20342 	int                     disposition = 0;
20343 	int                     ref_count = 0;
20344 	int                     depth = 0, info_idx = 0;
20345 	vm_page_info_basic_t    basic_info = 0;
20346 	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
20347 	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
20348 	boolean_t               do_region_footprint;
20349 	ledger_amount_t         ledger_resident, ledger_compressed;
20350 	int                     effective_page_size;
20351 	vm_map_offset_t         effective_page_mask;
20352 
20353 	switch (flavor) {
20354 	case VM_PAGE_INFO_BASIC:
20355 		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
20356 			/*
20357 			 * The "vm_page_info_basic_data" structure was not
20358 			 * properly padded, so allow the size to be off by
20359 			 * one to maintain backwards binary compatibility...
20360 			 */
20361 			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
20362 				return KERN_INVALID_ARGUMENT;
20363 			}
20364 		}
20365 		break;
20366 	default:
20367 		return KERN_INVALID_ARGUMENT;
20368 	}
20369 
20370 	if (effective_page_shift == -1) {
20371 		effective_page_shift = vm_self_region_page_shift_safely(map);
20372 		if (effective_page_shift == -1) {
20373 			return KERN_INVALID_ARGUMENT;
20374 		}
20375 	}
20376 	effective_page_size = (1 << effective_page_shift);
20377 	effective_page_mask = effective_page_size - 1;
20378 
20379 
20380 	retval = vm_map_page_range_info_sanitize(map,
20381 	    start_offset_u,
20382 	    end_offset_u,
20383 	    effective_page_mask,
20384 	    &start,
20385 	    &end,
20386 	    &offset_in_page);
20387 	if (retval != KERN_SUCCESS) {
20388 		return vm_sanitize_get_kr(retval);
20389 	}
20390 
20391 	assert((end - start) <= MAX_PAGE_RANGE_QUERY);
20392 
20393 	do_region_footprint = task_self_region_footprint();
20394 	disposition = 0;
20395 	ref_count = 0;
20396 	depth = 0;
20397 	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
20398 
20399 	vm_map_lock_read(map);
20400 
20401 
20402 	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
20403 
20404 	for (curr_s_offset = start; curr_s_offset < end;) {
20405 		/*
20406 		 * New lookup needs reset of these variables.
20407 		 */
20408 		curr_object = object = VM_OBJECT_NULL;
20409 		offset_in_object = 0;
20410 		ref_count = 0;
20411 		depth = 0;
20412 
20413 		if (do_region_footprint &&
20414 		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
20415 			/*
20416 			 * Request for "footprint" info about a page beyond
20417 			 * the end of address space: this must be for
20418 			 * the fake region vm_map_region_recurse_64()
20419 			 * reported to account for non-volatile purgeable
20420 			 * memory owned by this task.
20421 			 */
20422 			disposition = 0;
20423 
20424 			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
20425 			    (unsigned) ledger_compressed) {
20426 				/*
20427 				 * We haven't reported all the "non-volatile
20428 				 * compressed" pages yet, so report this fake
20429 				 * page as "compressed".
20430 				 */
20431 				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20432 			} else {
20433 				/*
20434 				 * We've reported all the non-volatile
20435 				 * compressed page but not all the non-volatile
20436 				 * pages , so report this fake page as
20437 				 * "resident dirty".
20438 				 */
20439 				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20440 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20441 				disposition |= VM_PAGE_QUERY_PAGE_REF;
20442 			}
20443 			switch (flavor) {
20444 			case VM_PAGE_INFO_BASIC:
20445 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20446 				basic_info->disposition = disposition;
20447 				basic_info->ref_count = 1;
20448 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20449 				basic_info->offset = 0;
20450 				basic_info->depth = 0;
20451 
20452 				info_idx++;
20453 				break;
20454 			}
20455 			curr_s_offset += effective_page_size;
20456 			continue;
20457 		}
20458 
20459 		/*
20460 		 * First, find the map entry covering "curr_s_offset", going down
20461 		 * submaps if necessary.
20462 		 */
20463 		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
20464 			/* no entry -> no object -> no page */
20465 
20466 			if (curr_s_offset < vm_map_min(map)) {
20467 				/*
20468 				 * Illegal address that falls below map min.
20469 				 */
20470 				curr_e_offset = MIN(end, vm_map_min(map));
20471 			} else if (curr_s_offset >= vm_map_max(map)) {
20472 				/*
20473 				 * Illegal address that falls on/after map max.
20474 				 */
20475 				curr_e_offset = end;
20476 			} else if (map_entry == vm_map_to_entry(map)) {
20477 				/*
20478 				 * Hit a hole.
20479 				 */
20480 				if (map_entry->vme_next == vm_map_to_entry(map)) {
20481 					/*
20482 					 * Empty map.
20483 					 */
20484 					curr_e_offset = MIN(map->max_offset, end);
20485 				} else {
20486 					/*
20487 					 * Hole at start of the map.
20488 					 */
20489 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20490 				}
20491 			} else {
20492 				if (map_entry->vme_next == vm_map_to_entry(map)) {
20493 					/*
20494 					 * Hole at the end of the map.
20495 					 */
20496 					curr_e_offset = MIN(map->max_offset, end);
20497 				} else {
20498 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20499 				}
20500 			}
20501 
20502 			assert(curr_e_offset >= curr_s_offset);
20503 
20504 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20505 
20506 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20507 
20508 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20509 
20510 			curr_s_offset = curr_e_offset;
20511 
20512 			info_idx += num_pages;
20513 
20514 			continue;
20515 		}
20516 
20517 		/* compute offset from this map entry's start */
20518 		offset_in_object = curr_s_offset - map_entry->vme_start;
20519 
20520 		/* compute offset into this map entry's object (or submap) */
20521 		offset_in_object += VME_OFFSET(map_entry);
20522 
20523 		if (map_entry->is_sub_map) {
20524 			vm_map_t sub_map = VM_MAP_NULL;
20525 			vm_page_info_t submap_info = 0;
20526 			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
20527 
20528 			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
20529 
20530 			submap_s_offset = offset_in_object;
20531 			submap_e_offset = submap_s_offset + range_len;
20532 
20533 			sub_map = VME_SUBMAP(map_entry);
20534 
20535 			vm_map_reference(sub_map);
20536 			vm_map_unlock_read(map);
20537 
20538 			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20539 
20540 			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
20541 			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
20542 
20543 			retval = vm_map_page_range_info_internal(sub_map,
20544 			    submap_s_offset,
20545 			    submap_e_offset,
20546 			    effective_page_shift,
20547 			    VM_PAGE_INFO_BASIC,
20548 			    (vm_page_info_t) submap_info,
20549 			    count);
20550 
20551 			assert(retval == KERN_SUCCESS);
20552 
20553 			vm_map_deallocate(sub_map);
20554 			sub_map = VM_MAP_NULL;
20555 			vm_map_lock_read(map);
20556 
20557 			/* Move the "info" index by the number of pages we inspected.*/
20558 			info_idx += range_len >> effective_page_shift;
20559 
20560 			/* Move our current offset by the size of the range we inspected.*/
20561 			curr_s_offset += range_len;
20562 
20563 			continue;
20564 		}
20565 
20566 		object = VME_OBJECT(map_entry);
20567 
20568 		if (object == VM_OBJECT_NULL) {
20569 			/*
20570 			 * We don't have an object here and, hence,
20571 			 * no pages to inspect. We'll fill up the
20572 			 * info structure appropriately.
20573 			 */
20574 
20575 			curr_e_offset = MIN(map_entry->vme_end, end);
20576 
20577 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20578 
20579 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20580 
20581 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20582 
20583 			curr_s_offset = curr_e_offset;
20584 
20585 			info_idx += num_pages;
20586 
20587 			continue;
20588 		}
20589 
20590 		if (do_region_footprint) {
20591 			disposition = 0;
20592 			if (map->has_corpse_footprint) {
20593 				/*
20594 				 * Query the page info data we saved
20595 				 * while forking the corpse.
20596 				 */
20597 				vm_map_corpse_footprint_query_page_info(
20598 					map,
20599 					curr_s_offset,
20600 					&disposition);
20601 			} else {
20602 				/*
20603 				 * Query the live pmap for footprint info
20604 				 * about this page.
20605 				 */
20606 				vm_map_footprint_query_page_info(
20607 					map,
20608 					map_entry,
20609 					curr_s_offset,
20610 					&disposition);
20611 			}
20612 			switch (flavor) {
20613 			case VM_PAGE_INFO_BASIC:
20614 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20615 				basic_info->disposition = disposition;
20616 				basic_info->ref_count = 1;
20617 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20618 				basic_info->offset = 0;
20619 				basic_info->depth = 0;
20620 
20621 				info_idx++;
20622 				break;
20623 			}
20624 			curr_s_offset += effective_page_size;
20625 			continue;
20626 		}
20627 
20628 		vm_object_reference(object);
20629 		/*
20630 		 * Shared mode -- so we can allow other readers
20631 		 * to grab the lock too.
20632 		 */
20633 		vm_object_lock_shared(object);
20634 
20635 		curr_e_offset = MIN(map_entry->vme_end, end);
20636 
20637 		vm_map_unlock_read(map);
20638 
20639 		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
20640 
20641 		curr_object = object;
20642 
20643 		for (; curr_s_offset < curr_e_offset;) {
20644 			if (object == curr_object) {
20645 				/* account for our object reference above. */
20646 				ref_count = os_ref_get_count_raw(&curr_object->ref_count) - 1;
20647 			} else {
20648 				ref_count = os_ref_get_count_raw(&curr_object->ref_count);
20649 			}
20650 
20651 			curr_offset_in_object = offset_in_object;
20652 
20653 			for (;;) {
20654 				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
20655 
20656 				if (m != VM_PAGE_NULL) {
20657 					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20658 					break;
20659 				} else {
20660 					if (curr_object->internal &&
20661 					    curr_object->alive &&
20662 					    !curr_object->terminating &&
20663 					    curr_object->pager_ready) {
20664 						if (vm_object_compressor_pager_state_get(curr_object, vm_object_trunc_page(curr_offset_in_object))
20665 						    == VM_EXTERNAL_STATE_EXISTS) {
20666 							/* the pager has that page */
20667 							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20668 							break;
20669 						}
20670 					}
20671 
20672 					/*
20673 					 * Go down the VM object shadow chain until we find the page
20674 					 * we're looking for.
20675 					 */
20676 
20677 					if (curr_object->shadow != VM_OBJECT_NULL) {
20678 						vm_object_t shadow = VM_OBJECT_NULL;
20679 
20680 						curr_offset_in_object += curr_object->vo_shadow_offset;
20681 						shadow = curr_object->shadow;
20682 
20683 						vm_object_lock_shared(shadow);
20684 						vm_object_unlock(curr_object);
20685 
20686 						curr_object = shadow;
20687 						depth++;
20688 						continue;
20689 					} else {
20690 						break;
20691 					}
20692 				}
20693 			}
20694 
20695 			/* The ref_count is not strictly accurate, it measures the number   */
20696 			/* of entities holding a ref on the object, they may not be mapping */
20697 			/* the object or may not be mapping the section holding the         */
20698 			/* target page but its still a ball park number and though an over- */
20699 			/* count, it picks up the copy-on-write cases                       */
20700 
20701 			/* We could also get a picture of page sharing from pmap_attributes */
20702 			/* but this would under count as only faulted-in mappings would     */
20703 			/* show up.							    */
20704 
20705 			if ((curr_object == object) && curr_object->shadow) {
20706 				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
20707 			}
20708 
20709 			if (!curr_object->internal) {
20710 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20711 			}
20712 
20713 			if (m != VM_PAGE_NULL) {
20714 				if (vm_page_is_fictitious(m)) {
20715 					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20716 				} else {
20717 					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
20718 						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20719 					}
20720 
20721 					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
20722 						disposition |= VM_PAGE_QUERY_PAGE_REF;
20723 					}
20724 
20725 					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
20726 						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
20727 					}
20728 
20729 					/*
20730 					 * XXX TODO4K:
20731 					 * when this routine deals with 4k
20732 					 * pages, check the appropriate CS bit
20733 					 * here.
20734 					 */
20735 					if (m->vmp_cs_validated) {
20736 						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
20737 					}
20738 					if (m->vmp_cs_tainted) {
20739 						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
20740 					}
20741 					if (m->vmp_cs_nx) {
20742 						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
20743 					}
20744 					if (m->vmp_reusable || curr_object->all_reusable) {
20745 						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20746 					}
20747 				}
20748 			}
20749 
20750 			switch (flavor) {
20751 			case VM_PAGE_INFO_BASIC:
20752 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20753 				basic_info->disposition = disposition;
20754 				basic_info->ref_count = ref_count;
20755 				basic_info->object_id = (vm_object_id_t) (uintptr_t)
20756 				    VM_KERNEL_ADDRHASH(curr_object);
20757 				basic_info->offset =
20758 				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
20759 				basic_info->depth = depth;
20760 
20761 				info_idx++;
20762 				break;
20763 			}
20764 
20765 			disposition = 0;
20766 			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
20767 
20768 			/*
20769 			 * Move to next offset in the range and in our object.
20770 			 */
20771 			curr_s_offset += effective_page_size;
20772 			offset_in_object += effective_page_size;
20773 			curr_offset_in_object = offset_in_object;
20774 
20775 			if (curr_object != object) {
20776 				vm_object_unlock(curr_object);
20777 
20778 				curr_object = object;
20779 
20780 				vm_object_lock_shared(curr_object);
20781 			} else {
20782 				vm_object_lock_yield_shared(curr_object);
20783 			}
20784 		}
20785 
20786 		vm_object_unlock(curr_object);
20787 		vm_object_deallocate(curr_object);
20788 
20789 		vm_map_lock_read(map);
20790 	}
20791 
20792 	vm_map_unlock_read(map);
20793 	return retval;
20794 }
20795 
20796 static __attribute__((always_inline, warn_unused_result))
20797 kern_return_t
vm_map_msync_sanitize(vm_map_t map,vm_map_address_ut address_u,vm_map_size_ut size_u,vm_object_offset_t * address,vm_map_size_t * size)20798 vm_map_msync_sanitize(
20799 	vm_map_t                map,
20800 	vm_map_address_ut       address_u,
20801 	vm_map_size_ut          size_u,
20802 	vm_object_offset_t     *address,
20803 	vm_map_size_t          *size)
20804 {
20805 	vm_object_offset_t      end;
20806 
20807 	return vm_sanitize_addr_size(address_u, size_u,
20808 	           VM_SANITIZE_CALLER_VM_MAP_MSYNC,
20809 	           map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS,
20810 	           address, &end, size);
20811 }
20812 
20813 /*
20814  *	vm_map_msync
20815  *
20816  *	Synchronises the memory range specified with its backing store
20817  *	image by either flushing or cleaning the contents to the appropriate
20818  *	memory manager engaging in a memory object synchronize dialog with
20819  *	the manager.  The client doesn't return until the manager issues
20820  *	m_o_s_completed message.  MIG Magically converts user task parameter
20821  *	to the task's address map.
20822  *
20823  *	interpretation of sync_flags
20824  *	VM_SYNC_INVALIDATE	- discard pages, only return precious
20825  *				  pages to manager.
20826  *
20827  *	VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20828  *				- discard pages, write dirty or precious
20829  *				  pages back to memory manager.
20830  *
20831  *	VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20832  *				- write dirty or precious pages back to
20833  *				  the memory manager.
20834  *
20835  *	VM_SYNC_CONTIGUOUS	- does everything normally, but if there
20836  *				  is a hole in the region, and we would
20837  *				  have returned KERN_SUCCESS, return
20838  *				  KERN_INVALID_ADDRESS instead.
20839  *
20840  *	NOTE
20841  *	The memory object attributes have not yet been implemented, this
20842  *	function will have to deal with the invalidate attribute
20843  *
20844  *	RETURNS
20845  *	KERN_INVALID_TASK		Bad task parameter
20846  *	KERN_INVALID_ARGUMENT		both sync and async were specified.
20847  *	KERN_SUCCESS			The usual.
20848  *	KERN_INVALID_ADDRESS		There was a hole in the region.
20849  */
20850 
20851 kern_return_t
vm_map_msync(vm_map_t map,vm_map_address_ut address_u,vm_map_size_ut size_u,vm_sync_t sync_flags)20852 vm_map_msync(
20853 	vm_map_t                map,
20854 	vm_map_address_ut       address_u,
20855 	vm_map_size_ut          size_u,
20856 	vm_sync_t               sync_flags)
20857 {
20858 	vm_map_entry_t          entry;
20859 	vm_map_size_t           size, amount_left;
20860 	vm_object_offset_t      address, offset;
20861 	vm_object_offset_t      start_offset, end_offset;
20862 	boolean_t               do_sync_req;
20863 	boolean_t               had_hole = FALSE;
20864 	vm_map_offset_t         pmap_offset;
20865 	kern_return_t           kr;
20866 
20867 	if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20868 	    (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20869 		return KERN_INVALID_ARGUMENT;
20870 	}
20871 
20872 	if (map == VM_MAP_NULL) {
20873 		return KERN_INVALID_TASK;
20874 	}
20875 
20876 	kr = vm_map_msync_sanitize(map,
20877 	    address_u,
20878 	    size_u,
20879 	    &address,
20880 	    &size);
20881 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20882 		DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20883 	}
20884 	if (__improbable(kr != KERN_SUCCESS)) {
20885 		return vm_sanitize_get_kr(kr);
20886 	}
20887 
20888 	amount_left = size;
20889 
20890 	while (amount_left > 0) {
20891 		vm_object_size_t        flush_size;
20892 		vm_object_t             object;
20893 
20894 		vm_map_lock(map);
20895 		if (!vm_map_lookup_entry(map,
20896 		    address,
20897 		    &entry)) {
20898 			vm_map_size_t   skip;
20899 
20900 			/*
20901 			 * hole in the address map.
20902 			 */
20903 			had_hole = TRUE;
20904 
20905 			if (sync_flags & VM_SYNC_KILLPAGES) {
20906 				/*
20907 				 * For VM_SYNC_KILLPAGES, there should be
20908 				 * no holes in the range, since we couldn't
20909 				 * prevent someone else from allocating in
20910 				 * that hole and we wouldn't want to "kill"
20911 				 * their pages.
20912 				 */
20913 				vm_map_unlock(map);
20914 				break;
20915 			}
20916 
20917 			/*
20918 			 * Check for empty map.
20919 			 */
20920 			if (entry == vm_map_to_entry(map) &&
20921 			    entry->vme_next == entry) {
20922 				vm_map_unlock(map);
20923 				break;
20924 			}
20925 			/*
20926 			 * Check that we don't wrap and that
20927 			 * we have at least one real map entry.
20928 			 */
20929 			if ((map->hdr.nentries == 0) ||
20930 			    (entry->vme_next->vme_start < address)) {
20931 				vm_map_unlock(map);
20932 				break;
20933 			}
20934 			/*
20935 			 * Move up to the next entry if needed
20936 			 */
20937 			skip = (entry->vme_next->vme_start - address);
20938 			if (skip >= amount_left) {
20939 				amount_left = 0;
20940 			} else {
20941 				amount_left -= skip;
20942 			}
20943 			address = entry->vme_next->vme_start;
20944 			vm_map_unlock(map);
20945 			continue;
20946 		}
20947 
20948 		offset = address - entry->vme_start;
20949 		pmap_offset = address;
20950 
20951 		/*
20952 		 * do we have more to flush than is contained in this
20953 		 * entry ?
20954 		 */
20955 		if (amount_left + entry->vme_start + offset > entry->vme_end) {
20956 			flush_size = entry->vme_end -
20957 			    (entry->vme_start + offset);
20958 		} else {
20959 			flush_size = amount_left;
20960 		}
20961 		amount_left -= flush_size;
20962 		address += flush_size;
20963 
20964 		if (entry->is_sub_map == TRUE) {
20965 			vm_map_t        local_map;
20966 			vm_map_offset_t local_offset;
20967 
20968 			local_map = VME_SUBMAP(entry);
20969 			local_offset = VME_OFFSET(entry);
20970 			vm_map_reference(local_map);
20971 			vm_map_unlock(map);
20972 			if (vm_map_msync(
20973 				    local_map,
20974 				    local_offset,
20975 				    flush_size,
20976 				    sync_flags) == KERN_INVALID_ADDRESS) {
20977 				had_hole = TRUE;
20978 			}
20979 			vm_map_deallocate(local_map);
20980 			local_map = VM_MAP_NULL;
20981 			continue;
20982 		}
20983 		object = VME_OBJECT(entry);
20984 
20985 		/*
20986 		 * We can't sync this object if the object has not been
20987 		 * created yet
20988 		 */
20989 		if (object == VM_OBJECT_NULL) {
20990 			vm_map_unlock(map);
20991 			continue;
20992 		}
20993 		offset += VME_OFFSET(entry);
20994 
20995 		vm_object_lock(object);
20996 
20997 		if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
20998 			int kill_pages = 0;
20999 
21000 			if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
21001 				/*
21002 				 * This is a destructive operation and so we
21003 				 * err on the side of limiting the range of
21004 				 * the operation.
21005 				 */
21006 				start_offset = vm_object_round_page(offset);
21007 				end_offset = vm_object_trunc_page(offset + flush_size);
21008 
21009 				if (end_offset <= start_offset) {
21010 					vm_object_unlock(object);
21011 					vm_map_unlock(map);
21012 					continue;
21013 				}
21014 
21015 				pmap_offset += start_offset - offset;
21016 			} else {
21017 				start_offset = offset;
21018 				end_offset = offset + flush_size;
21019 			}
21020 
21021 			if (sync_flags & VM_SYNC_KILLPAGES) {
21022 				if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
21023 				    ((object->copy_strategy !=
21024 				    MEMORY_OBJECT_COPY_SYMMETRIC) &&
21025 				    (object->vo_copy == VM_OBJECT_NULL))) &&
21026 				    (object->shadow == VM_OBJECT_NULL)) {
21027 					if (os_ref_get_count_raw(&object->ref_count) != 1) {
21028 						vm_page_stats_reusable.free_shared++;
21029 					}
21030 					kill_pages = 1;
21031 				} else {
21032 					kill_pages = -1;
21033 				}
21034 			}
21035 			if (kill_pages != -1) {
21036 				boolean_t kill_no_write = FALSE;
21037 
21038 				if ((entry->protection & VM_PROT_EXECUTE) ||
21039 				    entry->vme_xnu_user_debug) {
21040 					/*
21041 					 * Executable or user debug pages might be write-protected by
21042 					 * hardware, so do not attempt to write to these pages.
21043 					 */
21044 					kill_no_write = TRUE;
21045 				}
21046 				vm_object_deactivate_pages(
21047 					object,
21048 					start_offset,
21049 					(vm_object_size_t) (end_offset - start_offset),
21050 					kill_pages,
21051 					FALSE, /* reusable_pages */
21052 					kill_no_write,
21053 					map->pmap,
21054 					pmap_offset);
21055 			}
21056 			vm_object_unlock(object);
21057 			vm_map_unlock(map);
21058 			continue;
21059 		}
21060 		/*
21061 		 * We can't sync this object if there isn't a pager.
21062 		 * Don't bother to sync internal objects, since there can't
21063 		 * be any "permanent" storage for these objects anyway.
21064 		 */
21065 		if ((object->pager == MEMORY_OBJECT_NULL) ||
21066 		    (object->internal) || (object->private)) {
21067 			vm_object_unlock(object);
21068 			vm_map_unlock(map);
21069 			continue;
21070 		}
21071 		/*
21072 		 * keep reference on the object until syncing is done
21073 		 */
21074 		vm_object_reference_locked(object);
21075 		vm_object_unlock(object);
21076 
21077 		vm_map_unlock(map);
21078 
21079 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
21080 			start_offset = vm_object_trunc_page(offset);
21081 			end_offset = vm_object_round_page(offset + flush_size);
21082 		} else {
21083 			start_offset = offset;
21084 			end_offset = offset + flush_size;
21085 		}
21086 
21087 		do_sync_req = vm_object_sync(object,
21088 		    start_offset,
21089 		    (end_offset - start_offset),
21090 		    sync_flags & VM_SYNC_INVALIDATE,
21091 		    ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
21092 		    (sync_flags & VM_SYNC_ASYNCHRONOUS)),
21093 		    sync_flags & VM_SYNC_SYNCHRONOUS);
21094 
21095 		if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
21096 			/*
21097 			 * clear out the clustering and read-ahead hints
21098 			 */
21099 			vm_object_lock(object);
21100 
21101 			object->pages_created = 0;
21102 			object->pages_used = 0;
21103 			object->sequential = 0;
21104 			object->last_alloc = 0;
21105 
21106 			vm_object_unlock(object);
21107 		}
21108 		vm_object_deallocate(object);
21109 	} /* while */
21110 
21111 	/* for proper msync() behaviour */
21112 	if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
21113 		return KERN_INVALID_ADDRESS;
21114 	}
21115 
21116 	return KERN_SUCCESS;
21117 }/* vm_msync */
21118 
21119 void
vm_named_entry_associate_vm_object(vm_named_entry_t named_entry,vm_object_t object,vm_object_offset_t offset,vm_object_size_t size,vm_prot_t prot)21120 vm_named_entry_associate_vm_object(
21121 	vm_named_entry_t        named_entry,
21122 	vm_object_t             object,
21123 	vm_object_offset_t      offset,
21124 	vm_object_size_t        size,
21125 	vm_prot_t               prot)
21126 {
21127 	vm_map_copy_t copy;
21128 	vm_map_entry_t copy_entry;
21129 
21130 	assert(!named_entry->is_sub_map);
21131 	assert(!named_entry->is_copy);
21132 	assert(!named_entry->is_object);
21133 	assert(!named_entry->internal);
21134 	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
21135 
21136 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
21137 	copy->offset = offset;
21138 	copy->size = size;
21139 	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
21140 
21141 	copy_entry = vm_map_copy_entry_create(copy);
21142 	copy_entry->protection = prot;
21143 	copy_entry->max_protection = prot;
21144 	copy_entry->use_pmap = TRUE;
21145 	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
21146 	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
21147 	VME_OBJECT_SET(copy_entry, object, false, 0);
21148 	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
21149 	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
21150 
21151 	named_entry->backing.copy = copy;
21152 	named_entry->is_object = TRUE;
21153 	if (object->internal) {
21154 		named_entry->internal = TRUE;
21155 	}
21156 
21157 	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
21158 	    named_entry, copy, object, offset, size, prot);
21159 }
21160 
21161 vm_object_t
vm_named_entry_to_vm_object(vm_named_entry_t named_entry)21162 vm_named_entry_to_vm_object(
21163 	vm_named_entry_t named_entry)
21164 {
21165 	vm_map_copy_t   copy;
21166 	vm_map_entry_t  copy_entry;
21167 	vm_object_t     object;
21168 
21169 	assert(!named_entry->is_sub_map);
21170 	assert(!named_entry->is_copy);
21171 	assert(named_entry->is_object);
21172 	copy = named_entry->backing.copy;
21173 	assert(copy != VM_MAP_COPY_NULL);
21174 	/*
21175 	 * Assert that the vm_map_copy is coming from the right
21176 	 * zone and hasn't been forged
21177 	 */
21178 	vm_map_copy_require(copy);
21179 	assert(copy->cpy_hdr.nentries == 1);
21180 	copy_entry = vm_map_copy_first_entry(copy);
21181 	object = VME_OBJECT(copy_entry);
21182 
21183 	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
21184 
21185 	return object;
21186 }
21187 
21188 /*
21189  *	Routine:	convert_port_entry_to_map
21190  *	Purpose:
21191  *		Convert from a port specifying an entry or a task
21192  *		to a map. Doesn't consume the port ref; produces a map ref,
21193  *		which may be null.  Unlike convert_port_to_map, the
21194  *		port may be task or a named entry backed.
21195  *	Conditions:
21196  *		Nothing locked.
21197  */
21198 
21199 vm_map_t
convert_port_entry_to_map(ipc_port_t port)21200 convert_port_entry_to_map(
21201 	ipc_port_t      port)
21202 {
21203 	vm_map_t map = VM_MAP_NULL;
21204 	vm_named_entry_t named_entry;
21205 
21206 	if (!IP_VALID(port)) {
21207 		return VM_MAP_NULL;
21208 	}
21209 
21210 	if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
21211 		return convert_port_to_map(port);
21212 	}
21213 
21214 	named_entry = mach_memory_entry_from_port(port);
21215 
21216 	if ((named_entry->is_sub_map) &&
21217 	    (named_entry->protection & VM_PROT_WRITE)) {
21218 		map = named_entry->backing.map;
21219 		if (map->pmap != PMAP_NULL) {
21220 			if (map->pmap == kernel_pmap) {
21221 				panic("userspace has access "
21222 				    "to a kernel map %p", map);
21223 			}
21224 			pmap_require(map->pmap);
21225 		}
21226 		vm_map_reference(map);
21227 	}
21228 
21229 	return map;
21230 }
21231 
21232 /*
21233  * Export routines to other components for the things we access locally through
21234  * macros.
21235  */
21236 #undef current_map
21237 vm_map_t
current_map(void)21238 current_map(void)
21239 {
21240 	return current_map_fast();
21241 }
21242 
21243 /*
21244  *	vm_map_reference:
21245  *
21246  *	Takes a reference on the specified map.
21247  */
21248 void
vm_map_reference(vm_map_t map)21249 vm_map_reference(
21250 	vm_map_t        map)
21251 {
21252 	if (__probable(map != VM_MAP_NULL)) {
21253 		vm_map_require(map);
21254 		os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
21255 	}
21256 }
21257 
21258 /*
21259  *	vm_map_deallocate:
21260  *
21261  *	Removes a reference from the specified map,
21262  *	destroying it if no references remain.
21263  *	The map should not be locked.
21264  */
21265 void
vm_map_deallocate(vm_map_t map)21266 vm_map_deallocate(
21267 	vm_map_t        map)
21268 {
21269 	if (__probable(map != VM_MAP_NULL)) {
21270 		vm_map_require(map);
21271 		if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
21272 			vm_map_destroy(map);
21273 		}
21274 	}
21275 }
21276 
21277 void
vm_map_inspect_deallocate(vm_map_inspect_t map)21278 vm_map_inspect_deallocate(
21279 	vm_map_inspect_t      map)
21280 {
21281 	vm_map_deallocate((vm_map_t)map);
21282 }
21283 
21284 void
vm_map_read_deallocate(vm_map_read_t map)21285 vm_map_read_deallocate(
21286 	vm_map_read_t      map)
21287 {
21288 	vm_map_deallocate((vm_map_t)map);
21289 }
21290 
21291 
21292 void
vm_map_disable_NX(vm_map_t map)21293 vm_map_disable_NX(vm_map_t map)
21294 {
21295 	if (map == NULL) {
21296 		return;
21297 	}
21298 	if (map->pmap == NULL) {
21299 		return;
21300 	}
21301 
21302 	pmap_disable_NX(map->pmap);
21303 }
21304 
21305 void
vm_map_disallow_data_exec(vm_map_t map)21306 vm_map_disallow_data_exec(vm_map_t map)
21307 {
21308 	if (map == NULL) {
21309 		return;
21310 	}
21311 
21312 	map->map_disallow_data_exec = TRUE;
21313 }
21314 
21315 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
21316  * more descriptive.
21317  */
21318 void
vm_map_set_32bit(vm_map_t map)21319 vm_map_set_32bit(vm_map_t map)
21320 {
21321 #if defined(__arm64__)
21322 	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
21323 #else
21324 	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
21325 #endif
21326 }
21327 
21328 
21329 void
vm_map_set_64bit(vm_map_t map)21330 vm_map_set_64bit(vm_map_t map)
21331 {
21332 #if defined(__arm64__)
21333 	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
21334 #else
21335 	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
21336 #endif
21337 }
21338 
21339 /*
21340  * Expand the maximum size of an existing map to 64GB.
21341  */
21342 void
vm_map_set_jumbo(vm_map_t map)21343 vm_map_set_jumbo(vm_map_t map)
21344 {
21345 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21346 	vm_map_set_max_addr(map, ~0, false);
21347 #else /* arm64 */
21348 	(void) map;
21349 #endif
21350 }
21351 
21352 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21353 /*
21354  * Expand the maximum size of an existing map to the maximum supported.
21355  */
21356 void
vm_map_set_extra_jumbo(vm_map_t map)21357 vm_map_set_extra_jumbo(vm_map_t map)
21358 {
21359 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21360 	vm_map_set_max_addr(map, ~0, true);
21361 #else /* arm64 */
21362 	(void) map;
21363 #endif
21364 }
21365 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21366 
21367 /*
21368  * This map has a JIT entitlement
21369  */
21370 void
vm_map_set_jit_entitled(vm_map_t map)21371 vm_map_set_jit_entitled(vm_map_t map)
21372 {
21373 #if defined (__arm64__)
21374 	pmap_set_jit_entitled(map->pmap);
21375 #else /* arm64 */
21376 	(void) map;
21377 #endif
21378 }
21379 
21380 /*
21381  * Get status of this maps TPRO flag
21382  */
21383 boolean_t
vm_map_tpro(vm_map_t map)21384 vm_map_tpro(vm_map_t map)
21385 {
21386 #if defined (__arm64e__)
21387 	return pmap_get_tpro(map->pmap);
21388 #else /* arm64e */
21389 	(void) map;
21390 	return FALSE;
21391 #endif
21392 }
21393 
21394 /*
21395  * This map has TPRO enabled
21396  */
21397 void
vm_map_set_tpro(vm_map_t map)21398 vm_map_set_tpro(vm_map_t map)
21399 {
21400 #if defined (__arm64e__)
21401 	pmap_set_tpro(map->pmap);
21402 #else /* arm64e */
21403 	(void) map;
21404 #endif
21405 }
21406 
21407 
21408 /*
21409  * Does this map have TPRO enforcement enabled
21410  */
21411 boolean_t
vm_map_tpro_enforcement(vm_map_t map)21412 vm_map_tpro_enforcement(vm_map_t map)
21413 {
21414 	return map->tpro_enforcement;
21415 }
21416 
21417 /*
21418  * Set TPRO enforcement for this map
21419  */
21420 void
vm_map_set_tpro_enforcement(vm_map_t map)21421 vm_map_set_tpro_enforcement(vm_map_t map)
21422 {
21423 	if (vm_map_tpro(map)) {
21424 		vm_map_lock(map);
21425 		map->tpro_enforcement = TRUE;
21426 		vm_map_unlock(map);
21427 	}
21428 }
21429 
21430 /*
21431  * Enable TPRO on the requested region
21432  *
21433  * Note:
21434  *     This routine is primarily intended to be called during/soon after map
21435  *     creation before the associated task has been released to run. It is only
21436  *     currently safe when we have no resident pages.
21437  */
21438 boolean_t
vm_map_set_tpro_range(__unused vm_map_t map,__unused vm_map_address_t start,__unused vm_map_address_t end)21439 vm_map_set_tpro_range(
21440 	__unused vm_map_t map,
21441 	__unused vm_map_address_t start,
21442 	__unused vm_map_address_t end)
21443 {
21444 	return TRUE;
21445 }
21446 
21447 /*
21448  * Expand the maximum size of an existing map.
21449  */
21450 void
vm_map_set_max_addr(vm_map_t map,vm_map_offset_t new_max_offset,__unused bool extra_jumbo)21451 vm_map_set_max_addr(
21452 	vm_map_t map,
21453 	vm_map_offset_t new_max_offset,
21454 	__unused bool extra_jumbo)
21455 {
21456 #if defined(__arm64__)
21457 	vm_map_offset_t max_supported_offset;
21458 	vm_map_offset_t old_max_offset;
21459 	unsigned int option = ARM_PMAP_MAX_OFFSET_JUMBO;
21460 
21461 	vm_map_lock(map);
21462 
21463 	old_max_offset = map->max_offset;
21464 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21465 	if (extra_jumbo) {
21466 		option = ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO;
21467 	}
21468 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21469 	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), option);
21470 
21471 	new_max_offset = trunc_page(new_max_offset);
21472 
21473 	/* The address space cannot be shrunk using this routine. */
21474 	if (old_max_offset >= new_max_offset) {
21475 		vm_map_unlock(map);
21476 		return;
21477 	}
21478 
21479 	if (max_supported_offset < new_max_offset) {
21480 		new_max_offset = max_supported_offset;
21481 	}
21482 
21483 	map->max_offset = new_max_offset;
21484 
21485 	/*
21486 	 * Disable the following chunk of code that extends the "holes" list
21487 	 * to accomodate a larger VM map.
21488 	 * In `vm_map_create_options()`, we now set the end of the "holes" list to
21489 	 * max(map->max_offset, MACH_VM_MAX_ADDRESS) for all platforms.
21490 	 * MACH_VM_MAX_ADDRESS is the largest virtual address a userspace process
21491 	 * can map, so any `new_max_offset` value will be <= MACH_VM_MAX_ADDRESS.
21492 	 * The "holes" list does not need to be adjusted.
21493 	 */
21494 #if 0
21495 	if (map->holelistenabled) {
21496 		if (map->holes_list->prev->vme_end == old_max_offset) {
21497 			/*
21498 			 * There is already a hole at the end of the map; simply make it bigger.
21499 			 */
21500 			map->holes_list->prev->vme_end = map->max_offset;
21501 		} else {
21502 			/*
21503 			 * There is no hole at the end, so we need to create a new hole
21504 			 * for the new empty space we're creating.
21505 			 */
21506 			struct vm_map_links *new_hole;
21507 
21508 			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
21509 			new_hole->start = old_max_offset;
21510 			new_hole->end = map->max_offset;
21511 			new_hole->prev = map->holes_list->prev;
21512 			new_hole->next = (struct vm_map_entry *)map->holes_list;
21513 			map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
21514 			map->holes_list->prev = (struct vm_map_entry *)new_hole;
21515 		}
21516 	}
21517 #endif
21518 
21519 	vm_map_unlock(map);
21520 #else
21521 	(void)map;
21522 	(void)new_max_offset;
21523 #endif
21524 }
21525 
21526 vm_map_offset_t
vm_compute_max_offset(boolean_t is64)21527 vm_compute_max_offset(boolean_t is64)
21528 {
21529 #if defined(__arm64__)
21530 	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
21531 #else
21532 	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
21533 #endif
21534 }
21535 
21536 void
vm_map_get_max_aslr_slide_section(vm_map_t map __unused,int64_t * max_sections,int64_t * section_size)21537 vm_map_get_max_aslr_slide_section(
21538 	vm_map_t                map __unused,
21539 	int64_t                 *max_sections,
21540 	int64_t                 *section_size)
21541 {
21542 #if defined(__arm64__)
21543 	*max_sections = 3;
21544 	*section_size = ARM_TT_TWIG_SIZE;
21545 #else
21546 	*max_sections = 1;
21547 	*section_size = 0;
21548 #endif
21549 }
21550 
21551 uint64_t
vm_map_get_max_aslr_slide_pages(vm_map_t map)21552 vm_map_get_max_aslr_slide_pages(vm_map_t map)
21553 {
21554 #if defined(__arm64__)
21555 	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
21556 	 * limited embedded address space; this is also meant to minimize pmap
21557 	 * memory usage on 16KB page systems.
21558 	 */
21559 	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
21560 #else
21561 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21562 #endif
21563 }
21564 
21565 uint64_t
vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)21566 vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
21567 {
21568 #if defined(__arm64__)
21569 	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
21570 	 * of independent entropy on 16KB page systems.
21571 	 */
21572 	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
21573 #else
21574 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21575 #endif
21576 }
21577 
21578 boolean_t
vm_map_is_64bit(vm_map_t map)21579 vm_map_is_64bit(
21580 	vm_map_t map)
21581 {
21582 	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
21583 }
21584 
21585 boolean_t
vm_map_has_hard_pagezero(vm_map_t map,vm_map_offset_t pagezero_size)21586 vm_map_has_hard_pagezero(
21587 	vm_map_t        map,
21588 	vm_map_offset_t pagezero_size)
21589 {
21590 	/*
21591 	 * XXX FBDP
21592 	 * We should lock the VM map (for read) here but we can get away
21593 	 * with it for now because there can't really be any race condition:
21594 	 * the VM map's min_offset is changed only when the VM map is created
21595 	 * and when the zero page is established (when the binary gets loaded),
21596 	 * and this routine gets called only when the task terminates and the
21597 	 * VM map is being torn down, and when a new map is created via
21598 	 * load_machfile()/execve().
21599 	 */
21600 	return map->min_offset >= pagezero_size;
21601 }
21602 
21603 /*
21604  * Raise a VM map's maximun offset.
21605  */
21606 kern_return_t
vm_map_raise_max_offset(vm_map_t map,vm_map_offset_t new_max_offset)21607 vm_map_raise_max_offset(
21608 	vm_map_t        map,
21609 	vm_map_offset_t new_max_offset)
21610 {
21611 	kern_return_t   ret;
21612 
21613 	vm_map_lock(map);
21614 	ret = KERN_INVALID_ADDRESS;
21615 
21616 	if (new_max_offset >= map->max_offset) {
21617 		if (!vm_map_is_64bit(map)) {
21618 			if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
21619 				map->max_offset = new_max_offset;
21620 				ret = KERN_SUCCESS;
21621 			}
21622 		} else {
21623 			if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
21624 				map->max_offset = new_max_offset;
21625 				ret = KERN_SUCCESS;
21626 			}
21627 		}
21628 	}
21629 
21630 	vm_map_unlock(map);
21631 	return ret;
21632 }
21633 
21634 
21635 /*
21636  * Raise a VM map's minimum offset.
21637  * To strictly enforce "page zero" reservation.
21638  */
21639 kern_return_t
vm_map_raise_min_offset(vm_map_t map,vm_map_offset_t new_min_offset)21640 vm_map_raise_min_offset(
21641 	vm_map_t        map,
21642 	vm_map_offset_t new_min_offset)
21643 {
21644 	vm_map_entry_t  first_entry;
21645 
21646 	new_min_offset = vm_map_round_page(new_min_offset,
21647 	    VM_MAP_PAGE_MASK(map));
21648 
21649 	vm_map_lock(map);
21650 
21651 	if (new_min_offset < map->min_offset) {
21652 		/*
21653 		 * Can't move min_offset backwards, as that would expose
21654 		 * a part of the address space that was previously, and for
21655 		 * possibly good reasons, inaccessible.
21656 		 */
21657 		vm_map_unlock(map);
21658 		return KERN_INVALID_ADDRESS;
21659 	}
21660 	if (new_min_offset >= map->max_offset) {
21661 		/* can't go beyond the end of the address space */
21662 		vm_map_unlock(map);
21663 		return KERN_INVALID_ADDRESS;
21664 	}
21665 
21666 	first_entry = vm_map_first_entry(map);
21667 	if (first_entry != vm_map_to_entry(map) &&
21668 	    first_entry->vme_start < new_min_offset) {
21669 		/*
21670 		 * Some memory was already allocated below the new
21671 		 * minimun offset.  It's too late to change it now...
21672 		 */
21673 		vm_map_unlock(map);
21674 		return KERN_NO_SPACE;
21675 	}
21676 
21677 	map->min_offset = new_min_offset;
21678 
21679 	if (map->holelistenabled) {
21680 		assert(map->holes_list);
21681 		map->holes_list->start = new_min_offset;
21682 		assert(new_min_offset < map->holes_list->end);
21683 	}
21684 
21685 	vm_map_unlock(map);
21686 
21687 	return KERN_SUCCESS;
21688 }
21689 
21690 /*
21691  * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21692  * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
21693  * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
21694  * have to reach over to the BSD data structures.
21695  */
21696 
21697 uint64_t vm_map_set_size_limit_count = 0;
21698 kern_return_t
vm_map_set_size_limit(vm_map_t map,uint64_t new_size_limit)21699 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
21700 {
21701 	kern_return_t kr;
21702 
21703 	vm_map_lock(map);
21704 	if (new_size_limit < map->size) {
21705 		/* new limit should not be lower than its current size */
21706 		DTRACE_VM2(vm_map_set_size_limit_fail,
21707 		    vm_map_size_t, map->size,
21708 		    uint64_t, new_size_limit);
21709 		kr = KERN_FAILURE;
21710 	} else if (new_size_limit == map->size_limit) {
21711 		/* no change */
21712 		kr = KERN_SUCCESS;
21713 	} else {
21714 		/* set new limit */
21715 		DTRACE_VM2(vm_map_set_size_limit,
21716 		    vm_map_size_t, map->size,
21717 		    uint64_t, new_size_limit);
21718 		if (new_size_limit != RLIM_INFINITY) {
21719 			vm_map_set_size_limit_count++;
21720 		}
21721 		map->size_limit = new_size_limit;
21722 		kr = KERN_SUCCESS;
21723 	}
21724 	vm_map_unlock(map);
21725 	return kr;
21726 }
21727 
21728 uint64_t vm_map_set_data_limit_count = 0;
21729 kern_return_t
vm_map_set_data_limit(vm_map_t map,uint64_t new_data_limit)21730 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
21731 {
21732 	kern_return_t kr;
21733 
21734 	vm_map_lock(map);
21735 	if (new_data_limit < map->size) {
21736 		/* new limit should not be lower than its current size */
21737 		DTRACE_VM2(vm_map_set_data_limit_fail,
21738 		    vm_map_size_t, map->size,
21739 		    uint64_t, new_data_limit);
21740 		kr = KERN_FAILURE;
21741 	} else if (new_data_limit == map->data_limit) {
21742 		/* no change */
21743 		kr = KERN_SUCCESS;
21744 	} else {
21745 		/* set new limit */
21746 		DTRACE_VM2(vm_map_set_data_limit,
21747 		    vm_map_size_t, map->size,
21748 		    uint64_t, new_data_limit);
21749 		if (new_data_limit != RLIM_INFINITY) {
21750 			vm_map_set_data_limit_count++;
21751 		}
21752 		map->data_limit = new_data_limit;
21753 		kr = KERN_SUCCESS;
21754 	}
21755 	vm_map_unlock(map);
21756 	return kr;
21757 }
21758 
21759 void
vm_map_set_user_wire_limit(vm_map_t map,vm_size_t limit)21760 vm_map_set_user_wire_limit(vm_map_t     map,
21761     vm_size_t    limit)
21762 {
21763 	vm_map_lock(map);
21764 	map->user_wire_limit = limit;
21765 	vm_map_unlock(map);
21766 }
21767 
21768 
21769 void
vm_map_switch_protect(vm_map_t map,boolean_t val)21770 vm_map_switch_protect(vm_map_t     map,
21771     boolean_t    val)
21772 {
21773 	vm_map_lock(map);
21774 	map->switch_protect = val;
21775 	vm_map_unlock(map);
21776 }
21777 
21778 extern int cs_process_enforcement_enable;
21779 boolean_t
vm_map_cs_enforcement(vm_map_t map)21780 vm_map_cs_enforcement(
21781 	vm_map_t map)
21782 {
21783 	if (cs_process_enforcement_enable) {
21784 		return TRUE;
21785 	}
21786 	return map->cs_enforcement;
21787 }
21788 
21789 kern_return_t
vm_map_cs_wx_enable(__unused vm_map_t map)21790 vm_map_cs_wx_enable(
21791 	__unused vm_map_t map)
21792 {
21793 #if CODE_SIGNING_MONITOR
21794 	kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
21795 	if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
21796 		return KERN_SUCCESS;
21797 	}
21798 	return ret;
21799 #else
21800 	/* The VM manages WX memory entirely on its own */
21801 	return KERN_SUCCESS;
21802 #endif
21803 }
21804 
21805 kern_return_t
vm_map_csm_allow_jit(__unused vm_map_t map)21806 vm_map_csm_allow_jit(
21807 	__unused vm_map_t map)
21808 {
21809 #if CODE_SIGNING_MONITOR
21810 	return csm_allow_jit_region(vm_map_pmap(map));
21811 #else
21812 	/* No code signing monitor to enforce JIT policy */
21813 	return KERN_SUCCESS;
21814 #endif
21815 }
21816 
21817 void
vm_map_cs_debugged_set(vm_map_t map,boolean_t val)21818 vm_map_cs_debugged_set(
21819 	vm_map_t map,
21820 	boolean_t val)
21821 {
21822 	vm_map_lock(map);
21823 	map->cs_debugged = val;
21824 	vm_map_unlock(map);
21825 }
21826 
21827 void
vm_map_cs_enforcement_set(vm_map_t map,boolean_t val)21828 vm_map_cs_enforcement_set(
21829 	vm_map_t map,
21830 	boolean_t val)
21831 {
21832 	vm_map_lock(map);
21833 	map->cs_enforcement = val;
21834 	pmap_set_vm_map_cs_enforced(map->pmap, val);
21835 	vm_map_unlock(map);
21836 }
21837 
21838 /*
21839  * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21840  * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21841  * bump both counters.
21842  */
21843 void
vm_map_iokit_mapped_region(vm_map_t map,vm_size_t bytes)21844 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
21845 {
21846 	pmap_t pmap = vm_map_pmap(map);
21847 
21848 	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21849 	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21850 }
21851 
21852 void
vm_map_iokit_unmapped_region(vm_map_t map,vm_size_t bytes)21853 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
21854 {
21855 	pmap_t pmap = vm_map_pmap(map);
21856 
21857 	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21858 	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21859 }
21860 
21861 /* Add (generate) code signature for memory range */
21862 #if CONFIG_DYNAMIC_CODE_SIGNING
21863 kern_return_t
vm_map_sign(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)21864 vm_map_sign(vm_map_t map,
21865     vm_map_offset_t start,
21866     vm_map_offset_t end)
21867 {
21868 	vm_map_entry_t entry;
21869 	vm_map_offset_t entry_start;
21870 	vm_object_offset_t entry_offset;
21871 	vm_page_t m;
21872 	vm_object_t object;
21873 
21874 	/*
21875 	 * Vet all the input parameters and current type and state of the
21876 	 * underlaying object.  Return with an error if anything is amiss.
21877 	 */
21878 	if (map == VM_MAP_NULL) {
21879 		return KERN_INVALID_ARGUMENT;
21880 	}
21881 
21882 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
21883 		return KERN_INVALID_ADDRESS;
21884 	}
21885 
21886 	vm_map_lock_read(map);
21887 
21888 	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
21889 		/*
21890 		 * Must pass a valid non-submap address.
21891 		 */
21892 		vm_map_unlock_read(map);
21893 		return KERN_INVALID_ADDRESS;
21894 	}
21895 
21896 	if ((entry->vme_start > start) || (entry->vme_end < end)) {
21897 		/*
21898 		 * Map entry doesn't cover the requested range. Not handling
21899 		 * this situation currently.
21900 		 */
21901 		vm_map_unlock_read(map);
21902 		return KERN_INVALID_ARGUMENT;
21903 	}
21904 
21905 	object = VME_OBJECT(entry);
21906 	if (object == VM_OBJECT_NULL) {
21907 		/*
21908 		 * Object must already be present or we can't sign.
21909 		 */
21910 		vm_map_unlock_read(map);
21911 		return KERN_INVALID_ARGUMENT;
21912 	}
21913 
21914 	vm_object_lock(object);
21915 
21916 	entry_start = entry->vme_start;
21917 	entry_offset = VME_OFFSET(entry);
21918 	vm_map_unlock_read(map);
21919 	entry = VM_MAP_ENTRY_NULL; /* no longer valid after unlocking map */
21920 
21921 	while (start < end) {
21922 		uint32_t refmod;
21923 
21924 		m = vm_page_lookup(object,
21925 		    start - entry_start + entry_offset);
21926 		if (m == VM_PAGE_NULL) {
21927 			/* shoud we try to fault a page here? we can probably
21928 			 * demand it exists and is locked for this request */
21929 			vm_object_unlock(object);
21930 			return KERN_FAILURE;
21931 		}
21932 		/* deal with special page status */
21933 		if (m->vmp_busy ||
21934 		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart ||
21935 		    vm_page_is_private(m) || m->vmp_absent))) {
21936 			vm_object_unlock(object);
21937 			return KERN_FAILURE;
21938 		}
21939 
21940 		/* Page is OK... now "validate" it */
21941 		/* This is the place where we'll call out to create a code
21942 		 * directory, later */
21943 		/* XXX TODO4K: deal with 4k subpages individually? */
21944 		m->vmp_cs_validated = VMP_CS_ALL_TRUE;
21945 
21946 		/* The page is now "clean" for codesigning purposes. That means
21947 		 * we don't consider it as modified (wpmapped) anymore. But
21948 		 * we'll disconnect the page so we note any future modification
21949 		 * attempts. */
21950 		m->vmp_wpmapped = FALSE;
21951 		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
21952 
21953 		/* Pull the dirty status from the pmap, since we cleared the
21954 		 * wpmapped bit */
21955 		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
21956 			SET_PAGE_DIRTY(m, FALSE);
21957 		}
21958 
21959 		/* On to the next page */
21960 		start += PAGE_SIZE;
21961 	}
21962 	vm_object_unlock(object);
21963 
21964 	return KERN_SUCCESS;
21965 }
21966 #endif
21967 
21968 kern_return_t
vm_map_partial_reap(vm_map_t map,unsigned int * reclaimed_resident,unsigned int * reclaimed_compressed)21969 vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
21970 {
21971 	vm_map_entry_t  entry = VM_MAP_ENTRY_NULL;
21972 	vm_map_entry_t  next_entry;
21973 	kern_return_t   kr = KERN_SUCCESS;
21974 	VM_MAP_ZAP_DECLARE(zap_list);
21975 
21976 	vm_map_lock(map);
21977 
21978 	for (entry = vm_map_first_entry(map);
21979 	    entry != vm_map_to_entry(map);
21980 	    entry = next_entry) {
21981 		next_entry = entry->vme_next;
21982 
21983 		if (!entry->is_sub_map &&
21984 		    VME_OBJECT(entry) &&
21985 		    (VME_OBJECT(entry)->internal == TRUE) &&
21986 		    (os_ref_get_count_raw(&VME_OBJECT(entry)->ref_count) == 1)) {
21987 			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
21988 			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);
21989 
21990 			(void)vm_map_delete(map, entry->vme_start,
21991 			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
21992 			    KMEM_GUARD_NONE, &zap_list);
21993 		}
21994 	}
21995 
21996 	vm_map_unlock(map);
21997 
21998 	vm_map_zap_dispose(&zap_list);
21999 
22000 	return kr;
22001 }
22002 
22003 
22004 #if DEVELOPMENT || DEBUG
22005 
22006 int
vm_map_disconnect_page_mappings(vm_map_t map,boolean_t do_unnest)22007 vm_map_disconnect_page_mappings(
22008 	vm_map_t map,
22009 	boolean_t do_unnest)
22010 {
22011 	vm_map_entry_t entry;
22012 	ledger_amount_t byte_count = 0;
22013 
22014 	if (do_unnest == TRUE) {
22015 #ifndef NO_NESTED_PMAP
22016 		vm_map_lock(map);
22017 
22018 		for (entry = vm_map_first_entry(map);
22019 		    entry != vm_map_to_entry(map);
22020 		    entry = entry->vme_next) {
22021 			if (entry->is_sub_map && entry->use_pmap) {
22022 				/*
22023 				 * Make sure the range between the start of this entry and
22024 				 * the end of this entry is no longer nested, so that
22025 				 * we will only remove mappings from the pmap in use by this
22026 				 * this task
22027 				 */
22028 				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
22029 			}
22030 		}
22031 		vm_map_unlock(map);
22032 #endif
22033 	}
22034 	vm_map_lock_read(map);
22035 
22036 	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);
22037 
22038 	for (entry = vm_map_first_entry(map);
22039 	    entry != vm_map_to_entry(map);
22040 	    entry = entry->vme_next) {
22041 		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
22042 		    (VME_OBJECT(entry)->phys_contiguous))) {
22043 			continue;
22044 		}
22045 		if (entry->is_sub_map) {
22046 			assert(!entry->use_pmap);
22047 		}
22048 
22049 		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
22050 	}
22051 	vm_map_unlock_read(map);
22052 
22053 	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
22054 }
22055 
22056 kern_return_t
vm_map_inject_error(vm_map_t map,vm_map_offset_t vaddr)22057 vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
22058 {
22059 	vm_object_t object = NULL;
22060 	vm_object_offset_t offset;
22061 	vm_prot_t prot;
22062 	boolean_t wired;
22063 	vm_map_version_t version;
22064 	vm_map_t real_map;
22065 	int result = KERN_FAILURE;
22066 
22067 	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
22068 	vm_map_lock(map);
22069 
22070 	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
22071 	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
22072 	    NULL, &real_map, NULL);
22073 	if (object == NULL) {
22074 		result = KERN_MEMORY_ERROR;
22075 	} else if (object->pager) {
22076 		result = vm_compressor_pager_inject_error(object->pager,
22077 		    offset);
22078 	} else {
22079 		result = KERN_MEMORY_PRESENT;
22080 	}
22081 
22082 	if (object != NULL) {
22083 		vm_object_unlock(object);
22084 	}
22085 
22086 	if (real_map != map) {
22087 		vm_map_unlock(real_map);
22088 	}
22089 	vm_map_unlock(map);
22090 
22091 	return result;
22092 }
22093 
22094 /* iterate over map entries. Call the first argument block for the number of entries and the second for every entry
22095  * returns: KERN_SUCCESS if iteration completed ok,
22096  *      error code if callback returned an error
22097  *      KERN_FAILURE if there was a race of adding/removing entries during the iteration and the number of entries
22098  *      iterated is different from the number in the first call
22099  */
22100 static kern_return_t
22101 vm_map_entries_foreach_locked(vm_map_t map, kern_return_t (^count_handler)(int nentries),
22102     kern_return_t (^entry_handler)(void* entry))
22103 {
22104 	vm_map_lock_assert_held(map);
22105 	int nentries = map->hdr.nentries;
22106 	kern_return_t error = count_handler(nentries);
22107 	if (error) {
22108 		return error;
22109 	}
22110 
22111 	/* iterate until we loop back to the map, see get_vmmap_entries() */
22112 	vm_map_entry_t entry = vm_map_first_entry(map);
22113 	int count = 0;
22114 	while (entry != vm_map_to_entry(map)) {
22115 		error = entry_handler(entry);
22116 		if (error != KERN_SUCCESS) {
22117 			return error;
22118 		}
22119 		entry = entry->vme_next;
22120 		++count;
22121 		if (count > nentries) {
22122 			/* nentries and entries iteration don't agree on how many entries there are, shouldn't really happen */
22123 			return KERN_FAILURE;
22124 		}
22125 	}
22126 	if (count < nentries) {
22127 		return KERN_FAILURE;
22128 	}
22129 	return KERN_SUCCESS;
22130 }
22131 
22132 kern_return_t
22133 vm_map_entries_foreach(vm_map_t map, kern_return_t (^count_handler)(int nentries),
22134     kern_return_t (^entry_handler)(void* entry))
22135 {
22136 	vm_map_lock_read(map);
22137 	kern_return_t error = vm_map_entries_foreach_locked(map, count_handler, entry_handler);
22138 	vm_map_unlock_read(map);
22139 	return error;
22140 }
22141 
22142 /*
22143  * Dump info about the entry into the given buffer.
22144  * return true on success, false if there was not enough space in the give buffer
22145  * argument size in: bytes free in the given buffer, out: bytes written
22146  */
22147 kern_return_t
vm_map_dump_entry_and_compressor_pager(void * pentry,char * buf,size_t * size)22148 vm_map_dump_entry_and_compressor_pager(void* pentry, char *buf, size_t *size)
22149 {
22150 	size_t insize = *size;
22151 	kern_return_t kr;
22152 	size_t offset = 0;
22153 
22154 	*size = 0;
22155 	if (sizeof(struct vm_map_entry_info) > insize) {
22156 		return KERN_NO_SPACE;
22157 	}
22158 
22159 	vm_map_entry_t entry = (vm_map_entry_t)pentry;
22160 	struct vm_map_entry_info *out_entry = (struct vm_map_entry_info*)buf;
22161 	out_entry->vmei_start = entry->vme_start;
22162 	out_entry->vmei_end = entry->vme_end;
22163 	out_entry->vmei_alias = VME_ALIAS(entry);
22164 	out_entry->vmei_offset = VME_OFFSET(entry);
22165 	out_entry->vmei_is_sub_map = entry->is_sub_map;
22166 	out_entry->vmei_protection = entry->protection;
22167 	offset += sizeof(struct vm_map_entry_info);
22168 
22169 	out_entry->vmei_slot_mapping_count = 0;
22170 	out_entry->vmei_is_compressor_pager = false;
22171 	*size = offset;
22172 	if (out_entry->vmei_is_sub_map) {
22173 		return KERN_SUCCESS; // TODO: sub_map interrogation not supported yet
22174 	}
22175 	/* have a vm_object? */
22176 	vm_object_t object = VME_OBJECT(entry);
22177 	if (object == VM_OBJECT_NULL || !object->internal) {
22178 		return KERN_SUCCESS;
22179 	}
22180 	/* objects has a pager? */
22181 	memory_object_t pager = object->pager;
22182 	if (pager != MEMORY_OBJECT_NULL) {
22183 		return KERN_SUCCESS;
22184 	}
22185 	bool is_compressor = false;
22186 	unsigned int slot_mapping_count = 0;
22187 	size_t pager_info_size = insize - offset;
22188 	kr = vm_compressor_pager_dump(pager, buf + offset, &pager_info_size, &is_compressor, &slot_mapping_count);
22189 	if (kr != KERN_SUCCESS) {
22190 		/* didn't have enough space for everything we want to write, caller needs to retry */
22191 		return kr;
22192 	}
22193 	offset += pager_info_size;
22194 	/* if we got here, is_compressor should be true due to the object->internal check above, so this assignment
22195 	 * is just for sanity sake */
22196 	out_entry->vmei_is_compressor_pager = is_compressor;
22197 	out_entry->vmei_slot_mapping_count = slot_mapping_count;
22198 	*size = offset;
22199 	return KERN_SUCCESS;
22200 }
22201 
22202 
22203 #endif
22204 
22205 
22206 #if CONFIG_FREEZE
22207 
22208 
22209 extern struct freezer_context freezer_context_global;
22210 AbsoluteTime c_freezer_last_yield_ts = 0;
22211 
22212 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
22213 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
22214 
22215 kern_return_t
vm_map_freeze(task_t task,unsigned int * purgeable_count,unsigned int * wired_count,unsigned int * clean_count,unsigned int * dirty_count,unsigned int dirty_budget,unsigned int * shared_count,int * freezer_error_code,boolean_t eval_only)22216 vm_map_freeze(
22217 	task_t       task,
22218 	unsigned int *purgeable_count,
22219 	unsigned int *wired_count,
22220 	unsigned int *clean_count,
22221 	unsigned int *dirty_count,
22222 	unsigned int dirty_budget,
22223 	unsigned int *shared_count,
22224 	int          *freezer_error_code,
22225 	boolean_t    eval_only)
22226 {
22227 	vm_map_entry_t  entry2 = VM_MAP_ENTRY_NULL;
22228 	kern_return_t   kr = KERN_SUCCESS;
22229 	boolean_t       evaluation_phase = TRUE;
22230 	vm_object_t     cur_shared_object = NULL;
22231 	int             cur_shared_obj_ref_cnt = 0;
22232 	unsigned int    dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
22233 
22234 	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
22235 
22236 	/*
22237 	 * We need the exclusive lock here so that we can
22238 	 * block any page faults or lookups while we are
22239 	 * in the middle of freezing this vm map.
22240 	 */
22241 	vm_map_t map = task->map;
22242 
22243 	vm_map_lock(map);
22244 
22245 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
22246 
22247 	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
22248 		if (vm_compressor_low_on_space()) {
22249 			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
22250 		}
22251 
22252 		if (vm_swap_low_on_space()) {
22253 			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
22254 		}
22255 
22256 		kr = KERN_NO_SPACE;
22257 		goto done;
22258 	}
22259 
22260 	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
22261 		/*
22262 		 * In-memory compressor backing the freezer. No disk.
22263 		 * So no need to do the evaluation phase.
22264 		 */
22265 		evaluation_phase = FALSE;
22266 
22267 		if (eval_only == TRUE) {
22268 			/*
22269 			 * We don't support 'eval_only' mode
22270 			 * in this non-swap config.
22271 			 */
22272 			*freezer_error_code = FREEZER_ERROR_GENERIC;
22273 			kr = KERN_INVALID_ARGUMENT;
22274 			goto done;
22275 		}
22276 
22277 		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
22278 		clock_get_uptime(&c_freezer_last_yield_ts);
22279 	}
22280 again:
22281 
22282 	for (entry2 = vm_map_first_entry(map);
22283 	    entry2 != vm_map_to_entry(map);
22284 	    entry2 = entry2->vme_next) {
22285 		vm_object_t src_object;
22286 
22287 		if (entry2->is_sub_map) {
22288 			continue;
22289 		}
22290 
22291 		src_object = VME_OBJECT(entry2);
22292 		if (!src_object ||
22293 		    src_object->phys_contiguous ||
22294 		    !src_object->internal) {
22295 			continue;
22296 		}
22297 
22298 		/* If eligible, scan the entry, moving eligible pages over to our parent object */
22299 
22300 		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
22301 			/*
22302 			 * We skip purgeable objects during evaluation phase only.
22303 			 * If we decide to freeze this process, we'll explicitly
22304 			 * purge these objects before we go around again with
22305 			 * 'evaluation_phase' set to FALSE.
22306 			 */
22307 
22308 			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
22309 				/*
22310 				 * We want to purge objects that may not belong to this task but are mapped
22311 				 * in this task alone. Since we already purged this task's purgeable memory
22312 				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
22313 				 * on this task's purgeable objects. Hence the check for only volatile objects.
22314 				 */
22315 				if (evaluation_phase ||
22316 				    src_object->purgable != VM_PURGABLE_VOLATILE ||
22317 				    os_ref_get_count_raw(&src_object->ref_count) != 1) {
22318 					continue;
22319 				}
22320 				vm_object_lock(src_object);
22321 				if (src_object->purgable == VM_PURGABLE_VOLATILE &&
22322 				    os_ref_get_count_raw(&src_object->ref_count) == 1) {
22323 					purgeable_q_t old_queue;
22324 
22325 					/* object should be on a purgeable queue */
22326 					assert(src_object->objq.next != NULL &&
22327 					    src_object->objq.prev != NULL);
22328 					/* move object from its volatile queue to the nonvolatile queue */
22329 					old_queue = vm_purgeable_object_remove(src_object);
22330 					assert(old_queue);
22331 					if (src_object->purgeable_when_ripe) {
22332 						/* remove a token from that volatile queue */
22333 						vm_page_lock_queues();
22334 						vm_purgeable_token_delete_first(old_queue);
22335 						vm_page_unlock_queues();
22336 					}
22337 					/* purge the object */
22338 					vm_object_purge(src_object, 0);
22339 				}
22340 				vm_object_unlock(src_object);
22341 				continue;
22342 			}
22343 
22344 			/*
22345 			 * Pages belonging to this object could be swapped to disk.
22346 			 * Make sure it's not a shared object because we could end
22347 			 * up just bringing it back in again.
22348 			 *
22349 			 * We try to optimize somewhat by checking for objects that are mapped
22350 			 * more than once within our own map. But we don't do full searches,
22351 			 * we just look at the entries following our current entry.
22352 			 */
22353 
22354 			if (os_ref_get_count_raw(&src_object->ref_count) > 1) {
22355 				if (src_object != cur_shared_object) {
22356 					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
22357 					dirty_shared_count += obj_pages_snapshot;
22358 
22359 					cur_shared_object = src_object;
22360 					cur_shared_obj_ref_cnt = 1;
22361 					continue;
22362 				} else {
22363 					cur_shared_obj_ref_cnt++;
22364 					if (os_ref_get_count_raw(&src_object->ref_count) == cur_shared_obj_ref_cnt) {
22365 						/*
22366 						 * Fall through to below and treat this object as private.
22367 						 * So deduct its pages from our shared total and add it to the
22368 						 * private total.
22369 						 */
22370 
22371 						dirty_shared_count -= obj_pages_snapshot;
22372 						dirty_private_count += obj_pages_snapshot;
22373 					} else {
22374 						continue;
22375 					}
22376 				}
22377 			}
22378 
22379 
22380 			if (os_ref_get_count_raw(&src_object->ref_count) == 1) {
22381 				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
22382 			}
22383 
22384 			if (evaluation_phase == TRUE) {
22385 				continue;
22386 			}
22387 		}
22388 
22389 		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
22390 		*wired_count += src_object->wired_page_count;
22391 
22392 		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
22393 			if (vm_compressor_low_on_space()) {
22394 				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
22395 			}
22396 
22397 			if (vm_swap_low_on_space()) {
22398 				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
22399 			}
22400 
22401 			kr = KERN_NO_SPACE;
22402 			break;
22403 		}
22404 		if (paged_out_count >= dirty_budget) {
22405 			break;
22406 		}
22407 		dirty_budget -= paged_out_count;
22408 	}
22409 
22410 	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
22411 	if (evaluation_phase) {
22412 		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
22413 
22414 		if (dirty_shared_count > shared_pages_threshold) {
22415 			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
22416 			kr = KERN_FAILURE;
22417 			goto done;
22418 		}
22419 
22420 		if (dirty_shared_count &&
22421 		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
22422 			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
22423 			kr = KERN_FAILURE;
22424 			goto done;
22425 		}
22426 
22427 		evaluation_phase = FALSE;
22428 		dirty_shared_count = dirty_private_count = 0;
22429 
22430 		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
22431 		clock_get_uptime(&c_freezer_last_yield_ts);
22432 
22433 		if (eval_only) {
22434 			kr = KERN_SUCCESS;
22435 			goto done;
22436 		}
22437 
22438 		vm_purgeable_purge_task_owned(task);
22439 
22440 		goto again;
22441 	} else {
22442 		kr = KERN_SUCCESS;
22443 	}
22444 
22445 done:
22446 	vm_map_unlock(map);
22447 
22448 	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
22449 		vm_object_compressed_freezer_done();
22450 	}
22451 	return kr;
22452 }
22453 
22454 #endif
22455 
22456 /*
22457  * vm_map_entry_should_cow_for_true_share:
22458  *
22459  * Determines if the map entry should be clipped and setup for copy-on-write
22460  * to avoid applying "true_share" to a large VM object when only a subset is
22461  * targeted.
22462  *
22463  * For now, we target only the map entries created for the Objective C
22464  * Garbage Collector, which initially have the following properties:
22465  *	- alias == VM_MEMORY_MALLOC
22466  *      - wired_count == 0
22467  *      - !needs_copy
22468  * and a VM object with:
22469  *      - internal
22470  *      - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
22471  *      - !true_share
22472  *      - vo_size == ANON_CHUNK_SIZE
22473  *
22474  * Only non-kernel map entries.
22475  */
22476 boolean_t
vm_map_entry_should_cow_for_true_share(vm_map_entry_t entry)22477 vm_map_entry_should_cow_for_true_share(
22478 	vm_map_entry_t  entry)
22479 {
22480 	vm_object_t     object;
22481 
22482 	if (entry->is_sub_map) {
22483 		/* entry does not point at a VM object */
22484 		return FALSE;
22485 	}
22486 
22487 	if (entry->needs_copy) {
22488 		/* already set for copy_on_write: done! */
22489 		return FALSE;
22490 	}
22491 
22492 	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
22493 	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
22494 		/* not a malloc heap or Obj-C Garbage Collector heap */
22495 		return FALSE;
22496 	}
22497 
22498 	if (entry->wired_count) {
22499 		/* wired: can't change the map entry... */
22500 		vm_counters.should_cow_but_wired++;
22501 		return FALSE;
22502 	}
22503 
22504 	object = VME_OBJECT(entry);
22505 
22506 	if (object == VM_OBJECT_NULL) {
22507 		/* no object yet... */
22508 		return FALSE;
22509 	}
22510 
22511 	if (!object->internal) {
22512 		/* not an internal object */
22513 		return FALSE;
22514 	}
22515 
22516 	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
22517 		/* not the default copy strategy */
22518 		return FALSE;
22519 	}
22520 
22521 	if (object->true_share) {
22522 		/* already true_share: too late to avoid it */
22523 		return FALSE;
22524 	}
22525 
22526 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
22527 	    object->vo_size != ANON_CHUNK_SIZE) {
22528 		/* ... not an object created for the ObjC Garbage Collector */
22529 		return FALSE;
22530 	}
22531 
22532 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
22533 	    object->vo_size != 2048 * 4096) {
22534 		/* ... not a "MALLOC_SMALL" heap */
22535 		return FALSE;
22536 	}
22537 
22538 	/*
22539 	 * All the criteria match: we have a large object being targeted for "true_share".
22540 	 * To limit the adverse side-effects linked with "true_share", tell the caller to
22541 	 * try and avoid setting up the entire object for "true_share" by clipping the
22542 	 * targeted range and setting it up for copy-on-write.
22543 	 */
22544 	return TRUE;
22545 }
22546 
22547 uint64_t vm_map_range_overflows_count = 0;
22548 TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
22549 bool
vm_map_range_overflows(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size)22550 vm_map_range_overflows(
22551 	vm_map_t map,
22552 	vm_map_offset_t addr,
22553 	vm_map_size_t size)
22554 {
22555 	vm_map_offset_t start, end, sum;
22556 	vm_map_offset_t pgmask;
22557 
22558 	if (size == 0) {
22559 		/* empty range -> no overflow */
22560 		return false;
22561 	}
22562 	pgmask = vm_map_page_mask(map);
22563 	start = vm_map_trunc_page_mask(addr, pgmask);
22564 	end = vm_map_round_page_mask(addr + size, pgmask);
22565 	if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
22566 		vm_map_range_overflows_count++;
22567 		if (vm_map_range_overflows_log) {
22568 			printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
22569 			    proc_selfpid(),
22570 			    proc_best_name(current_proc()),
22571 			    (uint64_t)addr,
22572 			    (uint64_t)size,
22573 			    (uint64_t)pgmask);
22574 		}
22575 		DTRACE_VM4(vm_map_range_overflows,
22576 		    vm_map_t, map,
22577 		    uint32_t, pgmask,
22578 		    uint64_t, (uint64_t)addr,
22579 		    uint64_t, (uint64_t)size);
22580 		return true;
22581 	}
22582 	return false;
22583 }
22584 
22585 vm_map_offset_t
vm_map_round_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)22586 vm_map_round_page_mask(
22587 	vm_map_offset_t offset,
22588 	vm_map_offset_t mask)
22589 {
22590 	return VM_MAP_ROUND_PAGE(offset, mask);
22591 }
22592 
22593 vm_map_offset_t
vm_map_trunc_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)22594 vm_map_trunc_page_mask(
22595 	vm_map_offset_t offset,
22596 	vm_map_offset_t mask)
22597 {
22598 	return VM_MAP_TRUNC_PAGE(offset, mask);
22599 }
22600 
22601 boolean_t
vm_map_page_aligned(vm_map_offset_t offset,vm_map_offset_t mask)22602 vm_map_page_aligned(
22603 	vm_map_offset_t offset,
22604 	vm_map_offset_t mask)
22605 {
22606 	return ((offset) & mask) == 0;
22607 }
22608 
22609 int
vm_map_page_shift(vm_map_t map)22610 vm_map_page_shift(
22611 	vm_map_t map)
22612 {
22613 	return VM_MAP_PAGE_SHIFT(map);
22614 }
22615 
22616 int
vm_map_page_size(vm_map_t map)22617 vm_map_page_size(
22618 	vm_map_t map)
22619 {
22620 	return VM_MAP_PAGE_SIZE(map);
22621 }
22622 
22623 vm_map_offset_t
vm_map_page_mask(vm_map_t map)22624 vm_map_page_mask(
22625 	vm_map_t map)
22626 {
22627 	return VM_MAP_PAGE_MASK(map);
22628 }
22629 
22630 kern_return_t
vm_map_set_page_shift(vm_map_t map,int pageshift)22631 vm_map_set_page_shift(
22632 	vm_map_t        map,
22633 	int             pageshift)
22634 {
22635 	if (map->hdr.nentries != 0) {
22636 		/* too late to change page size */
22637 		return KERN_FAILURE;
22638 	}
22639 
22640 	map->hdr.page_shift = (uint16_t)pageshift;
22641 
22642 	return KERN_SUCCESS;
22643 }
22644 
22645 kern_return_t
vm_map_query_volatile(vm_map_t map,mach_vm_size_t * volatile_virtual_size_p,mach_vm_size_t * volatile_resident_size_p,mach_vm_size_t * volatile_compressed_size_p,mach_vm_size_t * volatile_pmap_size_p,mach_vm_size_t * volatile_compressed_pmap_size_p)22646 vm_map_query_volatile(
22647 	vm_map_t        map,
22648 	mach_vm_size_t  *volatile_virtual_size_p,
22649 	mach_vm_size_t  *volatile_resident_size_p,
22650 	mach_vm_size_t  *volatile_compressed_size_p,
22651 	mach_vm_size_t  *volatile_pmap_size_p,
22652 	mach_vm_size_t  *volatile_compressed_pmap_size_p)
22653 {
22654 	mach_vm_size_t  volatile_virtual_size;
22655 	mach_vm_size_t  volatile_resident_count;
22656 	mach_vm_size_t  volatile_compressed_count;
22657 	mach_vm_size_t  volatile_pmap_count;
22658 	mach_vm_size_t  volatile_compressed_pmap_count;
22659 	mach_vm_size_t  resident_count;
22660 	vm_map_entry_t  entry;
22661 	vm_object_t     object;
22662 
22663 	/* map should be locked by caller */
22664 
22665 	volatile_virtual_size = 0;
22666 	volatile_resident_count = 0;
22667 	volatile_compressed_count = 0;
22668 	volatile_pmap_count = 0;
22669 	volatile_compressed_pmap_count = 0;
22670 
22671 	for (entry = vm_map_first_entry(map);
22672 	    entry != vm_map_to_entry(map);
22673 	    entry = entry->vme_next) {
22674 		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;
22675 
22676 		if (entry->is_sub_map) {
22677 			continue;
22678 		}
22679 		if (!(entry->protection & VM_PROT_WRITE)) {
22680 			continue;
22681 		}
22682 		object = VME_OBJECT(entry);
22683 		if (object == VM_OBJECT_NULL) {
22684 			continue;
22685 		}
22686 		if (object->purgable != VM_PURGABLE_VOLATILE &&
22687 		    object->purgable != VM_PURGABLE_EMPTY) {
22688 			continue;
22689 		}
22690 		if (VME_OFFSET(entry)) {
22691 			/*
22692 			 * If the map entry has been split and the object now
22693 			 * appears several times in the VM map, we don't want
22694 			 * to count the object's resident_page_count more than
22695 			 * once.  We count it only for the first one, starting
22696 			 * at offset 0 and ignore the other VM map entries.
22697 			 */
22698 			continue;
22699 		}
22700 		resident_count = object->resident_page_count;
22701 		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
22702 			resident_count = 0;
22703 		} else {
22704 			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
22705 		}
22706 
22707 		volatile_virtual_size += entry->vme_end - entry->vme_start;
22708 		volatile_resident_count += resident_count;
22709 		if (object->pager) {
22710 			volatile_compressed_count +=
22711 			    vm_compressor_pager_get_count(object->pager);
22712 		}
22713 		pmap_compressed_bytes = 0;
22714 		pmap_resident_bytes =
22715 		    pmap_query_resident(map->pmap,
22716 		    entry->vme_start,
22717 		    entry->vme_end,
22718 		    &pmap_compressed_bytes);
22719 		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
22720 		volatile_compressed_pmap_count += (pmap_compressed_bytes
22721 		    / PAGE_SIZE);
22722 	}
22723 
22724 	/* map is still locked on return */
22725 
22726 	*volatile_virtual_size_p = volatile_virtual_size;
22727 	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
22728 	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
22729 	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
22730 	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;
22731 
22732 	return KERN_SUCCESS;
22733 }
22734 
22735 void
vm_map_sizes(vm_map_t map,vm_map_size_t * psize,vm_map_size_t * pfree,vm_map_size_t * plargest_free)22736 vm_map_sizes(vm_map_t map,
22737     vm_map_size_t * psize,
22738     vm_map_size_t * pfree,
22739     vm_map_size_t * plargest_free)
22740 {
22741 	vm_map_entry_t  entry;
22742 	vm_map_offset_t prev;
22743 	vm_map_size_t   free, total_free, largest_free;
22744 	boolean_t       end;
22745 
22746 	if (!map) {
22747 		*psize = *pfree = *plargest_free = 0;
22748 		return;
22749 	}
22750 	total_free = largest_free = 0;
22751 
22752 	vm_map_lock_read(map);
22753 	if (psize) {
22754 		*psize = map->max_offset - map->min_offset;
22755 	}
22756 
22757 	prev = map->min_offset;
22758 	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
22759 		end = (entry == vm_map_to_entry(map));
22760 
22761 		if (end) {
22762 			free = entry->vme_end   - prev;
22763 		} else {
22764 			free = entry->vme_start - prev;
22765 		}
22766 
22767 		total_free += free;
22768 		if (free > largest_free) {
22769 			largest_free = free;
22770 		}
22771 
22772 		if (end) {
22773 			break;
22774 		}
22775 		prev = entry->vme_end;
22776 	}
22777 	vm_map_unlock_read(map);
22778 	if (pfree) {
22779 		*pfree = total_free;
22780 	}
22781 	if (plargest_free) {
22782 		*plargest_free = largest_free;
22783 	}
22784 }
22785 
22786 #if VM_SCAN_FOR_SHADOW_CHAIN
22787 int
vm_map_shadow_max(vm_map_t map)22788 vm_map_shadow_max(
22789 	vm_map_t map)
22790 {
22791 	int             shadows, shadows_max;
22792 	vm_map_entry_t  entry;
22793 	vm_object_t     object, next_object;
22794 
22795 	if (map == NULL) {
22796 		return 0;
22797 	}
22798 
22799 	shadows_max = 0;
22800 
22801 	vm_map_lock_read(map);
22802 
22803 	for (entry = vm_map_first_entry(map);
22804 	    entry != vm_map_to_entry(map);
22805 	    entry = entry->vme_next) {
22806 		if (entry->is_sub_map) {
22807 			continue;
22808 		}
22809 		object = VME_OBJECT(entry);
22810 		if (object == NULL) {
22811 			continue;
22812 		}
22813 		vm_object_lock_shared(object);
22814 		for (shadows = 0;
22815 		    object->shadow != NULL;
22816 		    shadows++, object = next_object) {
22817 			next_object = object->shadow;
22818 			vm_object_lock_shared(next_object);
22819 			vm_object_unlock(object);
22820 		}
22821 		vm_object_unlock(object);
22822 		if (shadows > shadows_max) {
22823 			shadows_max = shadows;
22824 		}
22825 	}
22826 
22827 	vm_map_unlock_read(map);
22828 
22829 	return shadows_max;
22830 }
22831 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22832 
22833 void
vm_commit_pagezero_status(vm_map_t lmap)22834 vm_commit_pagezero_status(vm_map_t lmap)
22835 {
22836 	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
22837 }
22838 
22839 #if __x86_64__
22840 void
vm_map_set_high_start(vm_map_t map,vm_map_offset_t high_start)22841 vm_map_set_high_start(
22842 	vm_map_t        map,
22843 	vm_map_offset_t high_start)
22844 {
22845 	map->vmmap_high_start = high_start;
22846 }
22847 #endif /* __x86_64__ */
22848 
22849 #if CODE_SIGNING_MONITOR
22850 
22851 kern_return_t
vm_map_entry_cs_associate(vm_map_t map,vm_map_entry_t entry,vm_map_kernel_flags_t vmk_flags)22852 vm_map_entry_cs_associate(
22853 	vm_map_t                map,
22854 	vm_map_entry_t          entry,
22855 	vm_map_kernel_flags_t   vmk_flags)
22856 {
22857 	vm_object_t cs_object, cs_shadow, backing_object;
22858 	vm_object_offset_t cs_offset, backing_offset;
22859 	void *cs_blobs;
22860 	struct vnode *cs_vnode;
22861 	kern_return_t cs_ret;
22862 
22863 	if (map->pmap == NULL ||
22864 	    entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
22865 	    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
22866 	    VME_OBJECT(entry) == VM_OBJECT_NULL) {
22867 		return KERN_SUCCESS;
22868 	}
22869 
22870 	if (!(entry->protection & VM_PROT_EXECUTE)) {
22871 		/*
22872 		 * This memory region is not executable, so the code-signing
22873 		 * monitor would usually not care about it...
22874 		 */
22875 		if (vmk_flags.vmkf_remap_prot_copy &&
22876 		    (entry->max_protection & VM_PROT_EXECUTE)) {
22877 			/*
22878 			 * ... except if the memory region is being remapped
22879 			 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
22880 			 * which is what a debugger or dtrace would be doing
22881 			 * to prepare to modify an executable page to insert
22882 			 * a breakpoint or activate a probe.
22883 			 * In that case, fall through so that we can mark
22884 			 * this region as being "debugged" and no longer
22885 			 * strictly code-signed.
22886 			 */
22887 		} else {
22888 			/*
22889 			 * Really not executable, so no need to tell the
22890 			 * code-signing monitor.
22891 			 */
22892 			return KERN_SUCCESS;
22893 		}
22894 	}
22895 
22896 	vm_map_lock_assert_exclusive(map);
22897 
22898 	/*
22899 	 * Check for a debug association mapping before we check for used_for_jit. This
22900 	 * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG
22901 	 * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT
22902 	 * since they are mapped with RW or RX permissions, which the page table monitor
22903 	 * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG,
22904 	 * they will be mapped as USER_EXEC, and that will cause another page table monitor
22905 	 * violation when those USER_EXEC pages are mapped as RW.
22906 	 *
22907 	 * Since these pages switch between RW and RX through mprotect, they mimic what
22908 	 * we expect a debugger to do. As the code signing monitor does not enforce mappings
22909 	 * on macOS systems, this works in our favor here and allows us to continue to
22910 	 * support these legacy-programmed applications without sacrificing security on
22911 	 * the page table or the code signing monitor. We don't need to explicitly check
22912 	 * for entry_for_jit here and the mapping permissions. If the initial mapping is
22913 	 * created with RX, then the application must map it as RW in order to first write
22914 	 * to the page (MAP_JIT mappings must be private and anonymous). The switch to
22915 	 * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy.
22916 	 * Similarly, if the mapping was created as RW, and then switched to RX,
22917 	 * vm_map_protect will again mark the entry as a copy, and both these cases
22918 	 * lead to this if-statement being entered.
22919 	 *
22920 	 * For more information: rdar://115313336.
22921 	 */
22922 	if (vmk_flags.vmkf_remap_prot_copy) {
22923 		cs_ret = csm_associate_debug_region(
22924 			map->pmap,
22925 			entry->vme_start,
22926 			entry->vme_end - entry->vme_start);
22927 
22928 		/*
22929 		 * csm_associate_debug_region returns not supported when the code signing
22930 		 * monitor is disabled. This is intentional, since cs_ret is checked towards
22931 		 * the end of the function, and if it is not supported, then we still want the
22932 		 * VM to perform code-signing enforcement on this entry. That said, if we don't
22933 		 * mark this as a xnu_user_debug page when the code-signing monitor is disabled,
22934 		 * then it never gets retyped to XNU_USER_DEBUG frame type, which then causes
22935 		 * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some
22936 		 * cases, which will cause a violation when attempted to be mapped as writable).
22937 		 */
22938 		if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) {
22939 			entry->vme_xnu_user_debug = TRUE;
22940 		}
22941 #if DEVELOPMENT || DEBUG
22942 		if (vm_log_xnu_user_debug) {
22943 			printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ]  vme_xnu_user_debug=%d cs_ret %d\n",
22944 			    proc_selfpid(),
22945 			    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
22946 			    __FUNCTION__, __LINE__,
22947 			    map, entry,
22948 			    (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
22949 			    entry->vme_xnu_user_debug,
22950 			    cs_ret);
22951 		}
22952 #endif /* DEVELOPMENT || DEBUG */
22953 		goto done;
22954 	}
22955 
22956 	if (entry->used_for_jit) {
22957 		cs_ret = csm_associate_jit_region(
22958 			map->pmap,
22959 			entry->vme_start,
22960 			entry->vme_end - entry->vme_start);
22961 		goto done;
22962 	}
22963 
22964 	cs_object = VME_OBJECT(entry);
22965 	vm_object_lock_shared(cs_object);
22966 	cs_offset = VME_OFFSET(entry);
22967 
22968 	/* find the VM object backed by the code-signed vnode */
22969 	for (;;) {
22970 		/* go to the bottom of cs_object's shadow chain */
22971 		for (;
22972 		    cs_object->shadow != VM_OBJECT_NULL;
22973 		    cs_object = cs_shadow) {
22974 			cs_shadow = cs_object->shadow;
22975 			cs_offset += cs_object->vo_shadow_offset;
22976 			vm_object_lock_shared(cs_shadow);
22977 			vm_object_unlock(cs_object);
22978 		}
22979 		if (cs_object->internal ||
22980 		    cs_object->pager == MEMORY_OBJECT_NULL) {
22981 			vm_object_unlock(cs_object);
22982 			return KERN_SUCCESS;
22983 		}
22984 
22985 		cs_offset += cs_object->paging_offset;
22986 
22987 		/*
22988 		 * cs_object could be backed by a:
22989 		 *      vnode_pager
22990 		 *	apple_protect_pager
22991 		 *      shared_region_pager
22992 		 *	fourk_pager (multiple backing objects -> fail?)
22993 		 * ask the pager if it has a backing VM object
22994 		 */
22995 		if (!memory_object_backing_object(cs_object->pager,
22996 		    cs_offset,
22997 		    &backing_object,
22998 		    &backing_offset)) {
22999 			/* no backing object: cs_object is it */
23000 			break;
23001 		}
23002 
23003 		/* look down the backing object's shadow chain */
23004 		vm_object_lock_shared(backing_object);
23005 		vm_object_unlock(cs_object);
23006 		cs_object = backing_object;
23007 		cs_offset = backing_offset;
23008 	}
23009 
23010 	cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
23011 	if (cs_vnode == NULL) {
23012 		/* no vnode, no code signatures to associate */
23013 		cs_ret = KERN_SUCCESS;
23014 	} else {
23015 		cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
23016 		    &cs_blobs);
23017 		assert(cs_ret == KERN_SUCCESS);
23018 		cs_ret = cs_associate_blob_with_mapping(map->pmap,
23019 		    entry->vme_start,
23020 		    (entry->vme_end - entry->vme_start),
23021 		    cs_offset,
23022 		    cs_blobs);
23023 	}
23024 	vm_object_unlock(cs_object);
23025 	cs_object = VM_OBJECT_NULL;
23026 
23027 done:
23028 	if (cs_ret == KERN_SUCCESS) {
23029 		DTRACE_VM2(vm_map_entry_cs_associate_success,
23030 		    vm_map_offset_t, entry->vme_start,
23031 		    vm_map_offset_t, entry->vme_end);
23032 		if (vm_map_executable_immutable) {
23033 			/*
23034 			 * Prevent this executable
23035 			 * mapping from being unmapped
23036 			 * or modified.
23037 			 */
23038 			entry->vme_permanent = TRUE;
23039 		}
23040 		/*
23041 		 * pmap says it will validate the
23042 		 * code-signing validity of pages
23043 		 * faulted in via this mapping, so
23044 		 * this map entry should be marked so
23045 		 * that vm_fault() bypasses code-signing
23046 		 * validation for faults coming through
23047 		 * this mapping.
23048 		 */
23049 		entry->csm_associated = TRUE;
23050 	} else if (cs_ret == KERN_NOT_SUPPORTED) {
23051 		/*
23052 		 * pmap won't check the code-signing
23053 		 * validity of pages faulted in via
23054 		 * this mapping, so VM should keep
23055 		 * doing it.
23056 		 */
23057 		DTRACE_VM3(vm_map_entry_cs_associate_off,
23058 		    vm_map_offset_t, entry->vme_start,
23059 		    vm_map_offset_t, entry->vme_end,
23060 		    int, cs_ret);
23061 	} else {
23062 		/*
23063 		 * A real error: do not allow
23064 		 * execution in this mapping.
23065 		 */
23066 		DTRACE_VM3(vm_map_entry_cs_associate_failure,
23067 		    vm_map_offset_t, entry->vme_start,
23068 		    vm_map_offset_t, entry->vme_end,
23069 		    int, cs_ret);
23070 		if (vmk_flags.vmkf_overwrite_immutable) {
23071 			/*
23072 			 * We can get here when we remap an apple_protect pager
23073 			 * on top of an already cs_associated executable mapping
23074 			 * with the same code signatures, so we don't want to
23075 			 * lose VM_PROT_EXECUTE in that case...
23076 			 */
23077 		} else {
23078 			entry->protection &= ~VM_PROT_ALLEXEC;
23079 			entry->max_protection &= ~VM_PROT_ALLEXEC;
23080 		}
23081 	}
23082 
23083 	return cs_ret;
23084 }
23085 
23086 #endif /* CODE_SIGNING_MONITOR */
23087 
23088 inline bool
vm_map_is_corpse_source(vm_map_t map)23089 vm_map_is_corpse_source(vm_map_t map)
23090 {
23091 	bool status = false;
23092 	if (map) {
23093 		vm_map_lock_read(map);
23094 		status = map->corpse_source;
23095 		vm_map_unlock_read(map);
23096 	}
23097 	return status;
23098 }
23099 
23100 inline void
vm_map_set_corpse_source(vm_map_t map)23101 vm_map_set_corpse_source(vm_map_t map)
23102 {
23103 	if (map) {
23104 		vm_map_lock(map);
23105 		map->corpse_source = true;
23106 		vm_map_unlock(map);
23107 	}
23108 }
23109 
23110 inline void
vm_map_unset_corpse_source(vm_map_t map)23111 vm_map_unset_corpse_source(vm_map_t map)
23112 {
23113 	if (map) {
23114 		vm_map_lock(map);
23115 		map->corpse_source = false;
23116 		vm_map_unlock(map);
23117 	}
23118 }
23119 /*
23120  * FORKED CORPSE FOOTPRINT
23121  *
23122  * A forked corpse gets a copy of the original VM map but its pmap is mostly
23123  * empty since it never ran and never got to fault in any pages.
23124  * Collecting footprint info (via "sysctl vm.self_region_footprint") for
23125  * a forked corpse would therefore return very little information.
23126  *
23127  * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
23128  * to vm_map_fork() to collect footprint information from the original VM map
23129  * and its pmap, and store it in the forked corpse's VM map.  That information
23130  * is stored in place of the VM map's "hole list" since we'll never need to
23131  * lookup for holes in the corpse's map.
23132  *
23133  * The corpse's footprint info looks like this:
23134  *
23135  * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
23136  * as follows:
23137  *                     +---------------------------------------+
23138  *            header-> | cf_size                               |
23139  *                     +-------------------+-------------------+
23140  *                     | cf_last_region    | cf_last_zeroes    |
23141  *                     +-------------------+-------------------+
23142  *           region1-> | cfr_vaddr                             |
23143  *                     +-------------------+-------------------+
23144  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
23145  *                     +---------------------------------------+
23146  *                     | d4 | d5 | ...                         |
23147  *                     +---------------------------------------+
23148  *                     | ...                                   |
23149  *                     +-------------------+-------------------+
23150  *                     | dy | dz | na | na | cfr_vaddr...      | <-region2
23151  *                     +-------------------+-------------------+
23152  *                     | cfr_vaddr (ctd)   | cfr_num_pages     |
23153  *                     +---------------------------------------+
23154  *                     | d0 | d1 ...                           |
23155  *                     +---------------------------------------+
23156  *                       ...
23157  *                     +---------------------------------------+
23158  *       last region-> | cfr_vaddr                             |
23159  *                     +---------------------------------------+
23160  *                     + cfr_num_pages     | d0 | d1 | d2 | d3 |
23161  *                     +---------------------------------------+
23162  *                       ...
23163  *                     +---------------------------------------+
23164  *                     | dx | dy | dz | na | na | na | na | na |
23165  *                     +---------------------------------------+
23166  *
23167  * where:
23168  *      cf_size:	total size of the buffer (rounded to page size)
23169  *      cf_last_region:	offset in the buffer of the last "region" sub-header
23170  *	cf_last_zeroes: number of trailing "zero" dispositions at the end
23171  *			of last region
23172  *	cfr_vaddr:	virtual address of the start of the covered "region"
23173  *	cfr_num_pages:	number of pages in the covered "region"
23174  *	d*:		disposition of the page at that virtual address
23175  * Regions in the buffer are word-aligned.
23176  *
23177  * We estimate the size of the buffer based on the number of memory regions
23178  * and the virtual size of the address space.  While copying each memory region
23179  * during vm_map_fork(), we also collect the footprint info for that region
23180  * and store it in the buffer, packing it as much as possible (coalescing
23181  * contiguous memory regions to avoid having too many region headers and
23182  * avoiding long streaks of "zero" page dispositions by splitting footprint
23183  * "regions", so the number of regions in the footprint buffer might not match
23184  * the number of memory regions in the address space.
23185  *
23186  * We also have to copy the original task's "nonvolatile" ledgers since that's
23187  * part of the footprint and will need to be reported to any tool asking for
23188  * the footprint information of the forked corpse.
23189  */
23190 
23191 uint64_t vm_map_corpse_footprint_count = 0;
23192 uint64_t vm_map_corpse_footprint_size_avg = 0;
23193 uint64_t vm_map_corpse_footprint_size_max = 0;
23194 uint64_t vm_map_corpse_footprint_full = 0;
23195 uint64_t vm_map_corpse_footprint_no_buf = 0;
23196 
23197 struct vm_map_corpse_footprint_header {
23198 	vm_size_t       cf_size;        /* allocated buffer size */
23199 	uint32_t        cf_last_region; /* offset of last region in buffer */
23200 	union {
23201 		uint32_t cfu_last_zeroes; /* during creation:
23202 		                           * number of "zero" dispositions at
23203 		                           * end of last region */
23204 		uint32_t cfu_hint_region; /* during lookup:
23205 		                           * offset of last looked up region */
23206 #define cf_last_zeroes cfu.cfu_last_zeroes
23207 #define cf_hint_region cfu.cfu_hint_region
23208 	} cfu;
23209 };
23210 typedef uint8_t cf_disp_t;
23211 struct vm_map_corpse_footprint_region {
23212 	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
23213 	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
23214 	cf_disp_t   cfr_disposition[0]; /* disposition of each page */
23215 } __attribute__((packed));
23216 
23217 static cf_disp_t
vm_page_disposition_to_cf_disp(int disposition)23218 vm_page_disposition_to_cf_disp(
23219 	int disposition)
23220 {
23221 	assert(sizeof(cf_disp_t) == 1);
23222 	/* relocate bits that don't fit in a "uint8_t" */
23223 	if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
23224 		disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
23225 	}
23226 	/* cast gets rid of extra bits */
23227 	return (cf_disp_t) disposition;
23228 }
23229 
23230 static int
vm_page_cf_disp_to_disposition(cf_disp_t cf_disp)23231 vm_page_cf_disp_to_disposition(
23232 	cf_disp_t cf_disp)
23233 {
23234 	int disposition;
23235 
23236 	assert(sizeof(cf_disp_t) == 1);
23237 	disposition = (int) cf_disp;
23238 	/* move relocated bits back in place */
23239 	if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
23240 		disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
23241 		disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
23242 	}
23243 	return disposition;
23244 }
23245 
23246 static kmem_guard_t
vm_map_corpse_footprint_guard(vm_map_t map)23247 vm_map_corpse_footprint_guard(vm_map_t map)
23248 {
23249 	return (kmem_guard_t){
23250 		       .kmg_atomic = true,
23251 		       .kmg_tag = VM_KERN_MEMORY_DIAG,
23252 		       .kmg_context = os_hash_kernel_pointer(&map->vmmap_corpse_footprint),
23253 	};
23254 }
23255 
23256 /*
23257  * vm_map_corpse_footprint_new_region:
23258  *      closes the current footprint "region" and creates a new one
23259  *
23260  * Returns NULL if there's not enough space in the buffer for a new region.
23261  */
23262 static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(struct vm_map_corpse_footprint_header * footprint_header)23263 vm_map_corpse_footprint_new_region(
23264 	struct vm_map_corpse_footprint_header *footprint_header)
23265 {
23266 	uintptr_t       footprint_edge;
23267 	uint32_t        new_region_offset;
23268 	struct vm_map_corpse_footprint_region *footprint_region;
23269 	struct vm_map_corpse_footprint_region *new_footprint_region;
23270 
23271 	footprint_edge = ((uintptr_t)footprint_header +
23272 	    footprint_header->cf_size);
23273 	footprint_region = ((struct vm_map_corpse_footprint_region *)
23274 	    ((char *)footprint_header +
23275 	    footprint_header->cf_last_region));
23276 	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
23277 	    footprint_edge);
23278 
23279 	/* get rid of trailing zeroes in the last region */
23280 	assert(footprint_region->cfr_num_pages >=
23281 	    footprint_header->cf_last_zeroes);
23282 	footprint_region->cfr_num_pages -=
23283 	    footprint_header->cf_last_zeroes;
23284 	footprint_header->cf_last_zeroes = 0;
23285 
23286 	/* reuse this region if it's now empty */
23287 	if (footprint_region->cfr_num_pages == 0) {
23288 		return footprint_region;
23289 	}
23290 
23291 	/* compute offset of new region */
23292 	new_region_offset = footprint_header->cf_last_region;
23293 	new_region_offset += sizeof(*footprint_region);
23294 	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23295 	new_region_offset = roundup(new_region_offset, sizeof(int));
23296 
23297 	/* check if we're going over the edge */
23298 	if (((uintptr_t)footprint_header +
23299 	    new_region_offset +
23300 	    sizeof(*footprint_region)) >=
23301 	    footprint_edge) {
23302 		/* over the edge: no new region */
23303 		return NULL;
23304 	}
23305 
23306 	/* adjust offset of last region in header */
23307 	footprint_header->cf_last_region = new_region_offset;
23308 
23309 	new_footprint_region = (struct vm_map_corpse_footprint_region *)
23310 	    ((char *)footprint_header +
23311 	    footprint_header->cf_last_region);
23312 	new_footprint_region->cfr_vaddr = 0;
23313 	new_footprint_region->cfr_num_pages = 0;
23314 	/* caller needs to initialize new region */
23315 
23316 	return new_footprint_region;
23317 }
23318 
23319 /*
23320  * vm_map_corpse_footprint_collect:
23321  *	collect footprint information for "old_entry" in "old_map" and
23322  *	stores it in "new_map"'s vmmap_footprint_info.
23323  */
23324 kern_return_t
vm_map_corpse_footprint_collect(vm_map_t old_map,vm_map_entry_t old_entry,vm_map_t new_map)23325 vm_map_corpse_footprint_collect(
23326 	vm_map_t        old_map,
23327 	vm_map_entry_t  old_entry,
23328 	vm_map_t        new_map)
23329 {
23330 	vm_map_offset_t va;
23331 	kmem_return_t kmr;
23332 	struct vm_map_corpse_footprint_header *footprint_header;
23333 	struct vm_map_corpse_footprint_region *footprint_region;
23334 	struct vm_map_corpse_footprint_region *new_footprint_region;
23335 	cf_disp_t       *next_disp_p;
23336 	uintptr_t       footprint_edge;
23337 	uint32_t        num_pages_tmp;
23338 	int             effective_page_size;
23339 
23340 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
23341 
23342 	va = old_entry->vme_start;
23343 
23344 	vm_map_lock_assert_exclusive(old_map);
23345 	vm_map_lock_assert_exclusive(new_map);
23346 
23347 	assert(new_map->has_corpse_footprint);
23348 	assert(!old_map->has_corpse_footprint);
23349 	if (!new_map->has_corpse_footprint ||
23350 	    old_map->has_corpse_footprint) {
23351 		/*
23352 		 * This can only transfer footprint info from a
23353 		 * map with a live pmap to a map with a corpse footprint.
23354 		 */
23355 		return KERN_NOT_SUPPORTED;
23356 	}
23357 
23358 	if (new_map->vmmap_corpse_footprint == NULL) {
23359 		vm_size_t buf_size;
23360 
23361 		buf_size = (sizeof(*footprint_header) +
23362 		    (old_map->hdr.nentries
23363 		    *
23364 		    (sizeof(*footprint_region) +
23365 		    +3))            /* potential alignment for each region */
23366 		    +
23367 		    ((old_map->size / effective_page_size)
23368 		    *
23369 		    sizeof(cf_disp_t)));      /* disposition for each page */
23370 //		printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
23371 		buf_size = round_page(buf_size);
23372 
23373 		/* limit buffer to 1 page to validate overflow detection */
23374 //		buf_size = PAGE_SIZE;
23375 
23376 		/* limit size to a somewhat sane amount */
23377 #if XNU_TARGET_OS_OSX
23378 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
23379 #else /* XNU_TARGET_OS_OSX */
23380 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
23381 #endif /* XNU_TARGET_OS_OSX */
23382 		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
23383 			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
23384 		}
23385 		kmem_guard_t guard = vm_map_corpse_footprint_guard(new_map);
23386 		kmr = kmem_alloc_guard(kernel_map, buf_size + PAGE_SIZE, 0,
23387 		    KMA_DATA | KMA_GUARD_LAST | KMA_KOBJECT | KMA_ZERO,
23388 		    guard);
23389 		if (kmr.kmr_return != KERN_SUCCESS) {
23390 			vm_map_corpse_footprint_no_buf++;
23391 			return kmr.kmr_return;
23392 		}
23393 
23394 		/* initialize header and 1st region */
23395 		footprint_header = (struct vm_map_corpse_footprint_header *)kmr.kmr_ptr;
23396 		assert3p(footprint_header, !=, NULL);
23397 		new_map->vmmap_corpse_footprint = footprint_header;
23398 
23399 		footprint_header->cf_size = buf_size;
23400 		footprint_header->cf_last_region =
23401 		    sizeof(*footprint_header);
23402 		footprint_header->cf_last_zeroes = 0;
23403 
23404 		footprint_region = (struct vm_map_corpse_footprint_region *)
23405 		    ((char *)footprint_header +
23406 		    footprint_header->cf_last_region);
23407 		footprint_region->cfr_vaddr = 0;
23408 		footprint_region->cfr_num_pages = 0;
23409 	} else {
23410 		/* retrieve header and last region */
23411 		footprint_header = (struct vm_map_corpse_footprint_header *)
23412 		    new_map->vmmap_corpse_footprint;
23413 		footprint_region = (struct vm_map_corpse_footprint_region *)
23414 		    ((char *)footprint_header +
23415 		    footprint_header->cf_last_region);
23416 	}
23417 	footprint_edge = ((uintptr_t)footprint_header +
23418 	    footprint_header->cf_size);
23419 
23420 	if ((footprint_region->cfr_vaddr +
23421 	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
23422 	    effective_page_size))
23423 	    != old_entry->vme_start) {
23424 		uint64_t num_pages_delta, num_pages_delta_size;
23425 		uint32_t region_offset_delta_size;
23426 
23427 		/*
23428 		 * Not the next contiguous virtual address:
23429 		 * start a new region or store "zero" dispositions for
23430 		 * the missing pages?
23431 		 */
23432 		/* size of gap in actual page dispositions */
23433 		num_pages_delta = ((old_entry->vme_start -
23434 		    footprint_region->cfr_vaddr) / effective_page_size)
23435 		    - footprint_region->cfr_num_pages;
23436 		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
23437 		/* size of gap as a new footprint region header */
23438 		region_offset_delta_size =
23439 		    (sizeof(*footprint_region) +
23440 		    roundup(((footprint_region->cfr_num_pages -
23441 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
23442 		    sizeof(int)) -
23443 		    ((footprint_region->cfr_num_pages -
23444 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
23445 //		printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
23446 		if (region_offset_delta_size < num_pages_delta_size ||
23447 		    os_add3_overflow(footprint_region->cfr_num_pages,
23448 		    (uint32_t) num_pages_delta,
23449 		    1,
23450 		    &num_pages_tmp)) {
23451 			/*
23452 			 * Storing data for this gap would take more space
23453 			 * than inserting a new footprint region header:
23454 			 * let's start a new region and save space. If it's a
23455 			 * tie, let's avoid using a new region, since that
23456 			 * would require more region hops to find the right
23457 			 * range during lookups.
23458 			 *
23459 			 * If the current region's cfr_num_pages would overflow
23460 			 * if we added "zero" page dispositions for the gap,
23461 			 * no choice but to start a new region.
23462 			 */
23463 //			printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
23464 			new_footprint_region =
23465 			    vm_map_corpse_footprint_new_region(footprint_header);
23466 			/* check that we're not going over the edge */
23467 			if (new_footprint_region == NULL) {
23468 				goto over_the_edge;
23469 			}
23470 			footprint_region = new_footprint_region;
23471 			/* initialize new region as empty */
23472 			footprint_region->cfr_vaddr = old_entry->vme_start;
23473 			footprint_region->cfr_num_pages = 0;
23474 		} else {
23475 			/*
23476 			 * Store "zero" page dispositions for the missing
23477 			 * pages.
23478 			 */
23479 //			printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
23480 			for (; num_pages_delta > 0; num_pages_delta--) {
23481 				next_disp_p = (cf_disp_t *)
23482 				    ((uintptr_t) footprint_region +
23483 				    sizeof(*footprint_region));
23484 				next_disp_p += footprint_region->cfr_num_pages;
23485 				/* check that we're not going over the edge */
23486 				if ((uintptr_t)next_disp_p >= footprint_edge) {
23487 					goto over_the_edge;
23488 				}
23489 				/* store "zero" disposition for this gap page */
23490 				footprint_region->cfr_num_pages++;
23491 				*next_disp_p = (cf_disp_t) 0;
23492 				footprint_header->cf_last_zeroes++;
23493 			}
23494 		}
23495 	}
23496 
23497 	for (va = old_entry->vme_start;
23498 	    va < old_entry->vme_end;
23499 	    va += effective_page_size) {
23500 		int             disposition;
23501 		cf_disp_t       cf_disp;
23502 
23503 		vm_map_footprint_query_page_info(old_map,
23504 		    old_entry,
23505 		    va,
23506 		    &disposition);
23507 		cf_disp = vm_page_disposition_to_cf_disp(disposition);
23508 
23509 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
23510 
23511 		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
23512 			/*
23513 			 * Ignore "zero" dispositions at start of
23514 			 * region: just move start of region.
23515 			 */
23516 			footprint_region->cfr_vaddr += effective_page_size;
23517 			continue;
23518 		}
23519 
23520 		/* would region's cfr_num_pages overflow? */
23521 		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
23522 		    &num_pages_tmp)) {
23523 			/* overflow: create a new region */
23524 			new_footprint_region =
23525 			    vm_map_corpse_footprint_new_region(
23526 				footprint_header);
23527 			if (new_footprint_region == NULL) {
23528 				goto over_the_edge;
23529 			}
23530 			footprint_region = new_footprint_region;
23531 			footprint_region->cfr_vaddr = va;
23532 			footprint_region->cfr_num_pages = 0;
23533 		}
23534 
23535 		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
23536 		    sizeof(*footprint_region));
23537 		next_disp_p += footprint_region->cfr_num_pages;
23538 		/* check that we're not going over the edge */
23539 		if ((uintptr_t)next_disp_p >= footprint_edge) {
23540 			goto over_the_edge;
23541 		}
23542 		/* store this dispostion */
23543 		*next_disp_p = cf_disp;
23544 		footprint_region->cfr_num_pages++;
23545 
23546 		if (cf_disp != 0) {
23547 			/* non-zero disp: break the current zero streak */
23548 			footprint_header->cf_last_zeroes = 0;
23549 			/* done */
23550 			continue;
23551 		}
23552 
23553 		/* zero disp: add to the current streak of zeroes */
23554 		footprint_header->cf_last_zeroes++;
23555 		if ((footprint_header->cf_last_zeroes +
23556 		    roundup(((footprint_region->cfr_num_pages -
23557 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
23558 		    (sizeof(int) - 1),
23559 		    sizeof(int))) <
23560 		    (sizeof(*footprint_header))) {
23561 			/*
23562 			 * There are not enough trailing "zero" dispositions
23563 			 * (+ the extra padding we would need for the previous
23564 			 * region); creating a new region would not save space
23565 			 * at this point, so let's keep this "zero" disposition
23566 			 * in this region and reconsider later.
23567 			 */
23568 			continue;
23569 		}
23570 		/*
23571 		 * Create a new region to avoid having too many consecutive
23572 		 * "zero" dispositions.
23573 		 */
23574 		new_footprint_region =
23575 		    vm_map_corpse_footprint_new_region(footprint_header);
23576 		if (new_footprint_region == NULL) {
23577 			goto over_the_edge;
23578 		}
23579 		footprint_region = new_footprint_region;
23580 		/* initialize the new region as empty ... */
23581 		footprint_region->cfr_num_pages = 0;
23582 		/* ... and skip this "zero" disp */
23583 		footprint_region->cfr_vaddr = va + effective_page_size;
23584 	}
23585 
23586 	return KERN_SUCCESS;
23587 
23588 over_the_edge:
23589 //	printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
23590 	vm_map_corpse_footprint_full++;
23591 	return KERN_RESOURCE_SHORTAGE;
23592 }
23593 
23594 /*
23595  * vm_map_corpse_footprint_collect_done:
23596  *	completes the footprint collection by getting rid of any remaining
23597  *	trailing "zero" dispositions and trimming the unused part of the
23598  *	kernel buffer
23599  */
23600 void
vm_map_corpse_footprint_collect_done(vm_map_t new_map)23601 vm_map_corpse_footprint_collect_done(
23602 	vm_map_t        new_map)
23603 {
23604 	struct vm_map_corpse_footprint_header *footprint_header;
23605 	struct vm_map_corpse_footprint_region *footprint_region;
23606 	vm_size_t       buf_size, actual_size;
23607 
23608 	assert(new_map->has_corpse_footprint);
23609 	if (!new_map->has_corpse_footprint ||
23610 	    new_map->vmmap_corpse_footprint == NULL) {
23611 		return;
23612 	}
23613 
23614 	footprint_header = (struct vm_map_corpse_footprint_header *)
23615 	    new_map->vmmap_corpse_footprint;
23616 	buf_size = footprint_header->cf_size;
23617 
23618 	footprint_region = (struct vm_map_corpse_footprint_region *)
23619 	    ((char *)footprint_header +
23620 	    footprint_header->cf_last_region);
23621 
23622 	/* get rid of trailing zeroes in last region */
23623 	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
23624 	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
23625 	footprint_header->cf_last_zeroes = 0;
23626 
23627 	actual_size = (vm_size_t)(footprint_header->cf_last_region +
23628 	    sizeof(*footprint_region) +
23629 	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
23630 
23631 //	printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
23632 	vm_map_corpse_footprint_size_avg =
23633 	    (((vm_map_corpse_footprint_size_avg *
23634 	    vm_map_corpse_footprint_count) +
23635 	    actual_size) /
23636 	    (vm_map_corpse_footprint_count + 1));
23637 	vm_map_corpse_footprint_count++;
23638 	if (actual_size > vm_map_corpse_footprint_size_max) {
23639 		vm_map_corpse_footprint_size_max = actual_size;
23640 	}
23641 
23642 	actual_size = round_page(actual_size);
23643 	assert3u(buf_size, >=, actual_size);
23644 	if (buf_size > actual_size) {
23645 		/*
23646 		 * Free unused space at the end of the buffer
23647 		 */
23648 		kmem_guard_t guard = vm_map_corpse_footprint_guard(new_map);
23649 		kmem_return_t kmr = kmem_realloc_guard(kernel_map,
23650 		    (vm_offset_t)footprint_header,
23651 		    /* Account for guard page */
23652 		    buf_size + PAGE_SIZE,
23653 		    actual_size + PAGE_SIZE,
23654 		    KMR_DATA | KMR_GUARD_LAST | KMR_FREEOLD | KMR_KOBJECT,
23655 		    guard);
23656 		assertf(kmr.kmr_return == KERN_SUCCESS,
23657 		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
23658 		    footprint_header,
23659 		    (uint64_t) buf_size,
23660 		    (uint64_t) actual_size,
23661 		    kmr.kmr_return);
23662 		footprint_header = (struct vm_map_corpse_footprint_header *)kmr.kmr_ptr;
23663 		assert3p(footprint_header, !=, NULL);
23664 		new_map->vmmap_corpse_footprint = footprint_header;
23665 		footprint_region = NULL;
23666 	}
23667 
23668 	footprint_header->cf_size = actual_size;
23669 }
23670 
23671 /*
23672  * vm_map_corpse_footprint_query_page_info:
23673  *	retrieves the disposition of the page at virtual address "vaddr"
23674  *	in the forked corpse's VM map
23675  *
23676  * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
23677  */
23678 kern_return_t
vm_map_corpse_footprint_query_page_info(vm_map_t map,vm_map_offset_t va,int * disposition_p)23679 vm_map_corpse_footprint_query_page_info(
23680 	vm_map_t        map,
23681 	vm_map_offset_t va,
23682 	int             *disposition_p)
23683 {
23684 	struct vm_map_corpse_footprint_header *footprint_header;
23685 	struct vm_map_corpse_footprint_region *footprint_region;
23686 	uint32_t        footprint_region_offset;
23687 	vm_map_offset_t region_start, region_end;
23688 	int             disp_idx;
23689 	kern_return_t   kr;
23690 	int             effective_page_size;
23691 	cf_disp_t       cf_disp;
23692 
23693 	if (!map->has_corpse_footprint) {
23694 		*disposition_p = 0;
23695 		kr = KERN_INVALID_ARGUMENT;
23696 		goto done;
23697 	}
23698 
23699 	footprint_header = map->vmmap_corpse_footprint;
23700 	if (footprint_header == NULL) {
23701 		*disposition_p = 0;
23702 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23703 		kr = KERN_INVALID_ARGUMENT;
23704 		goto done;
23705 	}
23706 
23707 	/* start looking at the hint ("cf_hint_region") */
23708 	footprint_region_offset = footprint_header->cf_hint_region;
23709 
23710 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
23711 
23712 lookup_again:
23713 	if (footprint_region_offset < sizeof(*footprint_header)) {
23714 		/* hint too low: start from 1st region */
23715 		footprint_region_offset = sizeof(*footprint_header);
23716 	}
23717 	if (footprint_region_offset > footprint_header->cf_last_region) {
23718 		/* hint too high: re-start from 1st region */
23719 		footprint_region_offset = sizeof(*footprint_header);
23720 	}
23721 	footprint_region = (struct vm_map_corpse_footprint_region *)
23722 	    ((char *)footprint_header + footprint_region_offset);
23723 	region_start = footprint_region->cfr_vaddr;
23724 	region_end = (region_start +
23725 	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23726 	    effective_page_size));
23727 	if (va < region_start &&
23728 	    footprint_region_offset != sizeof(*footprint_header)) {
23729 		/* our range starts before the hint region */
23730 
23731 		/* reset the hint (in a racy way...) */
23732 		footprint_header->cf_hint_region = sizeof(*footprint_header);
23733 		/* lookup "va" again from 1st region */
23734 		footprint_region_offset = sizeof(*footprint_header);
23735 		goto lookup_again;
23736 	}
23737 
23738 	while (va >= region_end) {
23739 		if (footprint_region_offset >= footprint_header->cf_last_region) {
23740 			break;
23741 		}
23742 		/* skip the region's header */
23743 		footprint_region_offset += sizeof(*footprint_region);
23744 		/* skip the region's page dispositions */
23745 		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23746 		/* align to next word boundary */
23747 		footprint_region_offset =
23748 		    roundup(footprint_region_offset,
23749 		    sizeof(int));
23750 		footprint_region = (struct vm_map_corpse_footprint_region *)
23751 		    ((char *)footprint_header + footprint_region_offset);
23752 		region_start = footprint_region->cfr_vaddr;
23753 		region_end = (region_start +
23754 		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23755 		    effective_page_size));
23756 	}
23757 	if (va < region_start || va >= region_end) {
23758 		/* page not found */
23759 		*disposition_p = 0;
23760 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23761 		kr = KERN_SUCCESS;
23762 		goto done;
23763 	}
23764 
23765 	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
23766 	footprint_header->cf_hint_region = footprint_region_offset;
23767 
23768 	/* get page disposition for "va" in this region */
23769 	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
23770 	cf_disp = footprint_region->cfr_disposition[disp_idx];
23771 	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
23772 	kr = KERN_SUCCESS;
23773 done:
23774 //	if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23775 	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
23776 	DTRACE_VM4(footprint_query_page_info,
23777 	    vm_map_t, map,
23778 	    vm_map_offset_t, va,
23779 	    int, *disposition_p,
23780 	    kern_return_t, kr);
23781 
23782 	return kr;
23783 }
23784 
23785 void
vm_map_corpse_footprint_destroy(vm_map_t map)23786 vm_map_corpse_footprint_destroy(
23787 	vm_map_t        map)
23788 {
23789 	if (map->has_corpse_footprint &&
23790 	    map->vmmap_corpse_footprint != NULL) {
23791 		struct vm_map_corpse_footprint_header *footprint_header;
23792 		vm_size_t buf_size;
23793 
23794 		footprint_header = map->vmmap_corpse_footprint;
23795 		buf_size = footprint_header->cf_size;
23796 		kmem_guard_t guard = vm_map_corpse_footprint_guard(map);
23797 		kmem_free_guard(kernel_map, (vm_offset_t)footprint_header,
23798 		    buf_size + PAGE_SIZE,
23799 		    KMF_GUARD_LAST, guard);
23800 		map->vmmap_corpse_footprint = NULL;
23801 		map->has_corpse_footprint = FALSE;
23802 	}
23803 }
23804 
23805 /*
23806  * vm_map_copy_footprint_ledgers:
23807  *	copies any ledger that's relevant to the memory footprint of "old_task"
23808  *	into the forked corpse's task ("new_task")
23809  */
23810 void
vm_map_copy_footprint_ledgers(task_t old_task,task_t new_task)23811 vm_map_copy_footprint_ledgers(
23812 	task_t  old_task,
23813 	task_t  new_task)
23814 {
23815 	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
23816 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
23817 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
23818 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
23819 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
23820 	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
23821 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
23822 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
23823 	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
23824 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
23825 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
23826 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
23827 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
23828 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
23829 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
23830 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
23831 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
23832 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
23833 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
23834 	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
23835 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_nofootprint_total);
23836 }
23837 
23838 /*
23839  * vm_map_copy_ledger:
23840  *	copy a single ledger from "old_task" to "new_task"
23841  */
23842 void
vm_map_copy_ledger(task_t old_task,task_t new_task,int ledger_entry)23843 vm_map_copy_ledger(
23844 	task_t  old_task,
23845 	task_t  new_task,
23846 	int     ledger_entry)
23847 {
23848 	ledger_amount_t old_balance, new_balance, delta;
23849 
23850 	assert(new_task->map->has_corpse_footprint);
23851 	if (!new_task->map->has_corpse_footprint) {
23852 		return;
23853 	}
23854 
23855 	/* turn off sanity checks for the ledger we're about to mess with */
23856 	ledger_disable_panic_on_negative(new_task->ledger,
23857 	    ledger_entry);
23858 
23859 	/* adjust "new_task" to match "old_task" */
23860 	ledger_get_balance(old_task->ledger,
23861 	    ledger_entry,
23862 	    &old_balance);
23863 	ledger_get_balance(new_task->ledger,
23864 	    ledger_entry,
23865 	    &new_balance);
23866 	if (new_balance == old_balance) {
23867 		/* new == old: done */
23868 	} else if (new_balance > old_balance) {
23869 		/* new > old ==> new -= new - old */
23870 		delta = new_balance - old_balance;
23871 		ledger_debit(new_task->ledger,
23872 		    ledger_entry,
23873 		    delta);
23874 	} else {
23875 		/* new < old ==> new += old - new */
23876 		delta = old_balance - new_balance;
23877 		ledger_credit(new_task->ledger,
23878 		    ledger_entry,
23879 		    delta);
23880 	}
23881 }
23882 
23883 /*
23884  * vm_map_get_pmap:
23885  * returns the pmap associated with the vm_map
23886  */
23887 pmap_t
vm_map_get_pmap(vm_map_t map)23888 vm_map_get_pmap(vm_map_t map)
23889 {
23890 	return vm_map_pmap(map);
23891 }
23892 
23893 ppnum_t
vm_map_get_phys_page(vm_map_t map,vm_offset_t addr)23894 vm_map_get_phys_page(
23895 	vm_map_t                map,
23896 	vm_offset_t             addr)
23897 {
23898 	vm_object_offset_t      offset;
23899 	vm_object_t             object;
23900 	vm_map_offset_t         map_offset;
23901 	vm_map_entry_t          entry;
23902 	ppnum_t                 phys_page = 0;
23903 
23904 	map_offset = vm_map_trunc_page(addr, PAGE_MASK);
23905 
23906 	vm_map_lock(map);
23907 	while (vm_map_lookup_entry(map, map_offset, &entry)) {
23908 		if (entry->is_sub_map) {
23909 			vm_map_t        old_map;
23910 			vm_map_lock(VME_SUBMAP(entry));
23911 			old_map = map;
23912 			map = VME_SUBMAP(entry);
23913 			map_offset = (VME_OFFSET(entry) +
23914 			    (map_offset - entry->vme_start));
23915 			vm_map_unlock(old_map);
23916 			continue;
23917 		}
23918 		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
23919 			vm_map_unlock(map);
23920 			return (ppnum_t) 0;
23921 		}
23922 		if (VME_OBJECT(entry)->phys_contiguous) {
23923 			/* These are  not standard pageable memory mappings */
23924 			/* If they are not present in the object they will  */
23925 			/* have to be picked up from the pager through the  */
23926 			/* fault mechanism.  */
23927 			if (VME_OBJECT(entry)->vo_shadow_offset == 0) {
23928 				/* need to call vm_fault */
23929 				vm_map_unlock(map);
23930 				vm_fault(map, map_offset, VM_PROT_NONE,
23931 				    FALSE /* change_wiring */, VM_KERN_MEMORY_NONE,
23932 				    THREAD_UNINT, NULL, 0);
23933 				vm_map_lock(map);
23934 				continue;
23935 			}
23936 			offset = (VME_OFFSET(entry) +
23937 			    (map_offset - entry->vme_start));
23938 			phys_page = (ppnum_t)
23939 			    ((VME_OBJECT(entry)->vo_shadow_offset
23940 			    + offset) >> PAGE_SHIFT);
23941 			break;
23942 		}
23943 		offset = (VME_OFFSET(entry) + (map_offset - entry->vme_start));
23944 		object = VME_OBJECT(entry);
23945 		vm_object_lock(object);
23946 		while (TRUE) {
23947 			vm_page_t dst_page = vm_page_lookup(object, offset);
23948 			if (dst_page == VM_PAGE_NULL) {
23949 				if (object->shadow) {
23950 					vm_object_t old_object;
23951 					vm_object_lock(object->shadow);
23952 					old_object = object;
23953 					offset = offset + object->vo_shadow_offset;
23954 					object = object->shadow;
23955 					vm_object_unlock(old_object);
23956 				} else {
23957 					vm_object_unlock(object);
23958 					break;
23959 				}
23960 			} else {
23961 				phys_page = (ppnum_t)(VM_PAGE_GET_PHYS_PAGE(dst_page));
23962 				vm_object_unlock(object);
23963 				break;
23964 			}
23965 		}
23966 		break;
23967 	}
23968 
23969 	vm_map_unlock(map);
23970 	return phys_page;
23971 }
23972 
23973 #if CONFIG_MAP_RANGES
23974 static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
23975 static bitmap_t vm_map_user_range_large_file_map[BITMAP_LEN(VM_MEMORY_COUNT)];
23976 
23977 static_assert((int)UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
23978 static_assert((int)UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23979 
23980 /*
23981  * vm_map_range_map_init:
23982  *  initializes the VM range ID map to enable index lookup
23983  *  of user VM ranges based on VM tag from userspace.
23984  */
23985 static void
vm_map_range_map_init(void)23986 vm_map_range_map_init(void)
23987 {
23988 	/*
23989 	 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
23990 	 * - the former is malloc metadata which should be kept separate
23991 	 * - the latter has its own ranges
23992 	 */
23993 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
23994 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
23995 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
23996 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
23997 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
23998 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
23999 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
24000 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
24001 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
24002 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
24003 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
24004 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
24005 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
24006 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
24007 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
24008 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
24009 	bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOACCELERATOR);
24010 	bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOSURFACE);
24011 }
24012 
24013 static struct mach_vm_range
vm_map_range_random_uniform(vm_map_size_t req_size,vm_map_offset_t min_addr,vm_map_offset_t max_addr,vm_map_offset_t offmask)24014 vm_map_range_random_uniform(
24015 	vm_map_size_t           req_size,
24016 	vm_map_offset_t         min_addr,
24017 	vm_map_offset_t         max_addr,
24018 	vm_map_offset_t         offmask)
24019 {
24020 	vm_map_offset_t random_addr;
24021 	struct mach_vm_range alloc;
24022 
24023 	req_size = (req_size + offmask) & ~offmask;
24024 	min_addr = (min_addr + offmask) & ~offmask;
24025 	max_addr = max_addr & ~offmask;
24026 
24027 	read_random(&random_addr, sizeof(random_addr));
24028 	random_addr %= (max_addr - req_size - min_addr);
24029 	random_addr &= ~offmask;
24030 
24031 	alloc.min_address = min_addr + random_addr;
24032 	alloc.max_address = min_addr + random_addr + req_size;
24033 	return alloc;
24034 }
24035 
24036 static vm_map_offset_t
vm_map_range_offmask(void)24037 vm_map_range_offmask(void)
24038 {
24039 	uint32_t pte_depth;
24040 
24041 	/*
24042 	 * PTE optimizations
24043 	 *
24044 	 *
24045 	 * 16k pages systems
24046 	 * ~~~~~~~~~~~~~~~~~
24047 	 *
24048 	 * A single L1 (sub-)page covers the address space.
24049 	 * - L2 pages cover 64G,
24050 	 * - L3 pages cover 32M.
24051 	 *
24052 	 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
24053 	 * As a result, we really only need to align the ranges to 32M to avoid
24054 	 * partial L3 pages.
24055 	 *
24056 	 * On macOS, the usage of L2 pages will increase, so as a result we will
24057 	 * want to align ranges to 64G in order to utilize them fully.
24058 	 *
24059 	 *
24060 	 * 4k pages systems
24061 	 * ~~~~~~~~~~~~~~~~
24062 	 *
24063 	 * A single L0 (sub-)page covers the address space.
24064 	 * - L1 pages cover 512G,
24065 	 * - L2 pages cover 1G,
24066 	 * - L3 pages cover 2M.
24067 	 *
24068 	 * The long tail of processes on a system will tend to have a VA usage
24069 	 * (ignoring the shared regions) in the 100s of MB order of magnitnude.
24070 	 * This is achievable with a single L1 and a few L2s without
24071 	 * randomization.
24072 	 *
24073 	 * However once randomization is introduced, the system will immediately
24074 	 * need several L1s and many more L2s. As a result:
24075 	 *
24076 	 * - on embedded devices, the cost of these extra pages isn't
24077 	 *   sustainable, and we just disable the feature entirely,
24078 	 *
24079 	 * - on macOS we align ranges to a 512G boundary so that the extra L1
24080 	 *   pages can be used to their full potential.
24081 	 */
24082 
24083 	/*
24084 	 * note, this function assumes _non exotic mappings_
24085 	 * which is why it uses the native kernel's PAGE_SHIFT.
24086 	 */
24087 #if XNU_PLATFORM_MacOSX
24088 	pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
24089 #else /* !XNU_PLATFORM_MacOSX */
24090 	pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
24091 #endif /* !XNU_PLATFORM_MacOSX */
24092 
24093 	if (pte_depth == 0) {
24094 		return 0;
24095 	}
24096 
24097 	return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
24098 }
24099 
24100 /*
24101  * vm_map_range_configure:
24102  *	configures the user vm_map ranges by increasing the maximum VA range of
24103  *  the map and carving out a range at the end of VA space (searching backwards
24104  *  in the newly expanded map).
24105  */
24106 kern_return_t
vm_map_range_configure(vm_map_t map,__unused bool needs_extra_jumbo_va)24107 vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va)
24108 {
24109 	const vm_map_offset_t offmask = vm_map_range_offmask();
24110 	struct mach_vm_range data_range;
24111 	vm_map_offset_t default_end;
24112 	kern_return_t kr;
24113 
24114 	if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
24115 		/*
24116 		 * No point doing vm ranges in a 32bit address space.
24117 		 */
24118 		return KERN_NOT_SUPPORTED;
24119 	}
24120 
24121 	/* Should not be applying ranges to kernel map or kernel map submaps */
24122 	assert(vm_map_pmap(map) != kernel_pmap);
24123 
24124 #if XNU_PLATFORM_MacOSX
24125 
24126 	/*
24127 	 * on macOS, the address space is a massive 47 bits (128T),
24128 	 * with several carve outs that processes can't use:
24129 	 * - the shared region
24130 	 * - the commpage region
24131 	 * - the GPU carve out (if applicable)
24132 	 *
24133 	 * and when nano-malloc is in use it desires memory at the 96T mark.
24134 	 *
24135 	 * However, their location is architecture dependent:
24136 	 * - On intel, the shared region and commpage are
24137 	 *   at the very end of the usable address space (above +127T),
24138 	 *   and there is no GPU carve out, and pthread wants to place
24139 	 *   threads at the 112T mark (0x70T).
24140 	 *
24141 	 * - On arm64, these are in the same spot as on embedded devices:
24142 	 *   o shared region:   [ 6G,  10G)  [ will likely grow over time ]
24143 	 *   o commpage region: [63G,  64G)
24144 	 *   o GPU carve out:   [64G, 448G)
24145 	 *
24146 	 * This is conveninent because the mappings at the end of the address
24147 	 * space (when they exist) are made by the kernel.
24148 	 *
24149 	 * The policy is to allocate a random 1T for the data heap
24150 	 * in the end of the address-space in the:
24151 	 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
24152 	 * - [0x61, 0x7f) range on ASM (to leave space for Nano malloc).
24153 	 */
24154 
24155 	/* see NANOZONE_SIGNATURE in libmalloc */
24156 #if __x86_64__
24157 	default_end = 0x71ull << 40;
24158 #else
24159 	default_end = 0x61ull << 40;
24160 #endif
24161 	data_range  = vm_map_range_random_uniform(1ull << 40,
24162 	        default_end, 0x7full << 40, offmask);
24163 
24164 #else /* !XNU_PLATFORM_MacOSX */
24165 
24166 	/*
24167 	 * Embedded devices:
24168 	 *
24169 	 *   The default VA Size scales with the device physical memory.
24170 	 *
24171 	 *   Out of that:
24172 	 *   - the "zero" page typically uses 4G + some slide
24173 	 *   - the shared region uses SHARED_REGION_SIZE bytes (4G)
24174 	 *
24175 	 *   Without the use of jumbo or any adjustment to the address space,
24176 	 *   a default VM map typically looks like this:
24177 	 *
24178 	 *       0G -->╒════════════╕
24179 	 *             │  pagezero  │
24180 	 *             │  + slide   │
24181 	 *      ~4G -->╞════════════╡<-- vm_map_min(map)
24182 	 *             │            │
24183 	 *       6G -->├────────────┤
24184 	 *             │   shared   │
24185 	 *             │   region   │
24186 	 *      10G -->├────────────┤
24187 	 *             │            │
24188 	 *   max_va -->├────────────┤<-- vm_map_max(map)
24189 	 *             │            │
24190 	 *             ╎   jumbo    ╎
24191 	 *             ╎            ╎
24192 	 *             │            │
24193 	 *      63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
24194 	 *             │  commpage  │
24195 	 *      64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
24196 	 *             │            │
24197 	 *             ╎    GPU     ╎
24198 	 *             ╎  carveout  ╎
24199 	 *             │            │
24200 	 *     448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
24201 	 *             │            │
24202 	 *             ╎            ╎
24203 	 *             ╎            ╎
24204 	 *             │            │
24205 	 *     512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
24206 	 *
24207 	 *   When this drawing was made, "max_va" was smaller than
24208 	 *   ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
24209 	 *   12G of address space for the zero-page, slide, files,
24210 	 *   binaries, heap ...
24211 	 *
24212 	 *   We will want to make a "heap/data" carve out inside
24213 	 *   the jumbo range of half of that usable space, assuming
24214 	 *   that this is less than a forth of the jumbo range.
24215 	 *
24216 	 *   The assert below intends to catch when max_va grows
24217 	 *   too large for this heuristic.
24218 	 */
24219 
24220 	vm_map_lock_read(map);
24221 	default_end = vm_map_max(map);
24222 	vm_map_unlock_read(map);
24223 
24224 	/*
24225 	 * Check that we're not already jumbo'd,
24226 	 * or our address space was somehow modified.
24227 	 *
24228 	 * If so we cannot guarantee that we can set up the ranges
24229 	 * safely without interfering with the existing map.
24230 	 */
24231 	if (default_end > vm_compute_max_offset(true)) {
24232 		return KERN_NO_SPACE;
24233 	}
24234 
24235 	if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
24236 		/*
24237 		 * an override boot-arg was set, disable user-ranges
24238 		 *
24239 		 * XXX: this is problematic because it means these boot-args
24240 		 *      no longer test the behavior changing the value
24241 		 *      of ARM64_MAX_OFFSET_DEVICE_* would have.
24242 		 */
24243 		return KERN_NOT_SUPPORTED;
24244 	}
24245 
24246 	/* expand the default VM space to 64GB */
24247 	vm_map_set_jumbo(map);
24248 
24249 	assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end);
24250 	data_range = vm_map_range_random_uniform(GiB(10),
24251 	    default_end + PAGE_SIZE, vm_map_max(map), offmask);
24252 
24253 #endif /* !XNU_PLATFORM_MacOSX */
24254 
24255 	/*
24256 	 * Poke holes so that ASAN or people listing regions
24257 	 * do not think this space is free.
24258 	 */
24259 
24260 	if (default_end != data_range.min_address) {
24261 		kr = vm_map_enter(map, &default_end,
24262 		    data_range.min_address - default_end,
24263 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24264 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24265 		assert(kr == KERN_SUCCESS);
24266 	}
24267 
24268 	if (data_range.max_address != vm_map_max(map)) {
24269 		vm_map_entry_t entry;
24270 		vm_size_t size;
24271 
24272 		/*
24273 		 * Extend the end of the hole to the next VM entry or the end of the map,
24274 		 * whichever comes first.
24275 		 */
24276 		vm_map_lock_read(map);
24277 		vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
24278 		if (entry == vm_map_to_entry(map) || entry->vme_start > vm_map_max(map)) {
24279 			size = vm_map_max(map) - data_range.max_address;
24280 		} else {
24281 			size = entry->vme_start - data_range.max_address;
24282 		}
24283 		vm_map_unlock_read(map);
24284 
24285 		kr = vm_map_enter(map, &data_range.max_address, size,
24286 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24287 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24288 		assert(kr == KERN_SUCCESS);
24289 	}
24290 
24291 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24292 	if (needs_extra_jumbo_va) {
24293 		/* This will grow the address space to MACH_VM_MAX_ADDRESS */
24294 		vm_map_set_extra_jumbo(map);
24295 	}
24296 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24297 
24298 	vm_map_lock(map);
24299 	map->default_range.min_address = vm_map_min(map);
24300 	map->default_range.max_address = default_end;
24301 	map->data_range = data_range;
24302 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24303 	/* If process has "extra jumbo" entitlement, enable large file range */
24304 	if (needs_extra_jumbo_va) {
24305 		map->large_file_range = vm_map_range_random_uniform(TiB(1),
24306 		    MACH_VM_JUMBO_ADDRESS, MACH_VM_MAX_ADDRESS, offmask);
24307 	}
24308 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24309 	map->uses_user_ranges = true;
24310 	vm_map_unlock(map);
24311 
24312 	return KERN_SUCCESS;
24313 }
24314 
24315 /*
24316  * vm_map_range_fork:
24317  *	clones the array of ranges from old_map to new_map in support
24318  *  of a VM map fork.
24319  */
24320 void
vm_map_range_fork(vm_map_t new_map,vm_map_t old_map)24321 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
24322 {
24323 	if (!old_map->uses_user_ranges) {
24324 		/* nothing to do */
24325 		return;
24326 	}
24327 
24328 	new_map->default_range = old_map->default_range;
24329 	new_map->data_range = old_map->data_range;
24330 
24331 	if (old_map->extra_ranges_count) {
24332 		vm_map_user_range_t otable, ntable;
24333 		uint16_t count;
24334 
24335 		otable = old_map->extra_ranges;
24336 		count  = old_map->extra_ranges_count;
24337 		ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
24338 		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
24339 		memcpy(ntable, otable,
24340 		    count * sizeof(struct vm_map_user_range));
24341 
24342 		new_map->extra_ranges_count = count;
24343 		new_map->extra_ranges = ntable;
24344 	}
24345 
24346 	new_map->uses_user_ranges = true;
24347 }
24348 
24349 /*
24350  * vm_map_get_user_range:
24351  *	copy the VM user range for the given VM map and range ID.
24352  */
24353 kern_return_t
vm_map_get_user_range(vm_map_t map,vm_map_range_id_t range_id,mach_vm_range_t range)24354 vm_map_get_user_range(
24355 	vm_map_t                map,
24356 	vm_map_range_id_t       range_id,
24357 	mach_vm_range_t         range)
24358 {
24359 	if (map == NULL || !map->uses_user_ranges || range == NULL) {
24360 		return KERN_INVALID_ARGUMENT;
24361 	}
24362 
24363 	switch (range_id) {
24364 	case UMEM_RANGE_ID_DEFAULT:
24365 		*range = map->default_range;
24366 		return KERN_SUCCESS;
24367 
24368 	case UMEM_RANGE_ID_HEAP:
24369 		*range = map->data_range;
24370 		return KERN_SUCCESS;
24371 
24372 	case UMEM_RANGE_ID_LARGE_FILE:
24373 		/*
24374 		 * Because this function tells a user-space process about the user
24375 		 * ranges in its VM map, this case communicates whether the large file
24376 		 * range is in use. Note that this is different from how the large file
24377 		 * range ID is handled in `vm_map_get_range()`: there, we "resolve" the
24378 		 * VA policy and return either the large file range or data range,
24379 		 * depending on whether the large file range is enabled.
24380 		 */
24381 		if (map->large_file_range.min_address != map->large_file_range.max_address) {
24382 			/* large file range is configured and should be used */
24383 			*range = map->large_file_range;
24384 		} else {
24385 			return KERN_INVALID_ARGUMENT;
24386 		}
24387 		return KERN_SUCCESS;
24388 
24389 	default:
24390 		return KERN_INVALID_ARGUMENT;
24391 	}
24392 }
24393 
24394 static vm_map_range_id_t
vm_map_user_range_resolve(vm_map_t map,mach_vm_address_t addr,mach_vm_size_t size,mach_vm_range_t range)24395 vm_map_user_range_resolve(
24396 	vm_map_t                map,
24397 	mach_vm_address_t       addr,
24398 	mach_vm_size_t          size,
24399 	mach_vm_range_t         range)
24400 {
24401 	struct mach_vm_range tmp;
24402 
24403 	vm_map_lock_assert_held(map);
24404 
24405 	static_assert((int)UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
24406 	static_assert((int)UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
24407 
24408 	if (mach_vm_range_contains(&map->default_range, addr, size)) {
24409 		if (range) {
24410 			*range = map->default_range;
24411 		}
24412 		return UMEM_RANGE_ID_DEFAULT;
24413 	}
24414 
24415 	if (mach_vm_range_contains(&map->data_range, addr, size)) {
24416 		if (range) {
24417 			*range = map->data_range;
24418 		}
24419 		return UMEM_RANGE_ID_HEAP;
24420 	}
24421 
24422 	if (mach_vm_range_contains(&map->large_file_range, addr, size)) {
24423 		if (range) {
24424 			*range = map->large_file_range;
24425 		}
24426 		return UMEM_RANGE_ID_LARGE_FILE;
24427 	}
24428 
24429 	for (size_t i = 0; i < map->extra_ranges_count; i++) {
24430 		vm_map_user_range_t r = &map->extra_ranges[i];
24431 
24432 		tmp.min_address = r->vmur_min_address;
24433 		tmp.max_address = r->vmur_max_address;
24434 
24435 		if (mach_vm_range_contains(&tmp, addr, size)) {
24436 			if (range) {
24437 				*range = tmp;
24438 			}
24439 			return r->vmur_range_id;
24440 		}
24441 	}
24442 
24443 	if (range) {
24444 		range->min_address = range->max_address = 0;
24445 	}
24446 	return UMEM_RANGE_ID_DEFAULT;
24447 }
24448 #endif /* CONFIG_MAP_RANGES */
24449 
24450 void
vm_map_kernel_flags_update_range_id(vm_map_kernel_flags_t * vmkf,vm_map_t map,__unused vm_map_size_t size)24451 vm_map_kernel_flags_update_range_id(
24452 	vm_map_kernel_flags_t *vmkf,
24453 	vm_map_t map,
24454 	__unused vm_map_size_t size)
24455 {
24456 	if (map == kernel_map) {
24457 		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
24458 			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
24459 		}
24460 #if CONFIG_MAP_RANGES
24461 	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
24462 	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT) {
24463 		if (bitmap_test(vm_map_user_range_large_file_map, vmkf->vm_tag)
24464 		    || size >= VM_LARGE_FILE_THRESHOLD) {
24465 			/*
24466 			 * if the map doesn't have the large file range configured,
24467 			 * the range will get resolved to the heap range in `vm_map_get_range`
24468 			 */
24469 			vmkf->vmkf_range_id = UMEM_RANGE_ID_LARGE_FILE;
24470 		} else if (bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
24471 			vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
24472 		}
24473 #endif /* CONFIG_MAP_RANGES */
24474 	}
24475 }
24476 
24477 /*
24478  * vm_map_entry_has_device_pager:
24479  * Check if the vm map entry specified by the virtual address has a device pager.
24480  * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
24481  */
24482 boolean_t
vm_map_entry_has_device_pager(vm_map_t map,vm_map_offset_t vaddr)24483 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
24484 {
24485 	vm_map_entry_t entry;
24486 	vm_object_t object;
24487 	boolean_t result;
24488 
24489 	if (map == NULL) {
24490 		return FALSE;
24491 	}
24492 
24493 	vm_map_lock(map);
24494 	while (TRUE) {
24495 		if (!vm_map_lookup_entry(map, vaddr, &entry)) {
24496 			result = FALSE;
24497 			break;
24498 		}
24499 		if (entry->is_sub_map) {
24500 			// Check the submap
24501 			vm_map_t submap = VME_SUBMAP(entry);
24502 			assert(submap != NULL);
24503 			vm_map_lock(submap);
24504 			vm_map_unlock(map);
24505 			map = submap;
24506 			continue;
24507 		}
24508 		object = VME_OBJECT(entry);
24509 		if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
24510 			result = TRUE;
24511 			break;
24512 		}
24513 		result = FALSE;
24514 		break;
24515 	}
24516 
24517 	vm_map_unlock(map);
24518 	return result;
24519 }
24520 
24521 
24522 #if MACH_ASSERT
24523 
24524 extern int pmap_ledgers_panic;
24525 extern int pmap_ledgers_panic_leeway;
24526 
24527 #define LEDGER_DRIFT(__LEDGER)                    \
24528 	int             __LEDGER##_over;          \
24529 	ledger_amount_t __LEDGER##_over_total;    \
24530 	ledger_amount_t __LEDGER##_over_max;      \
24531 	int             __LEDGER##_under;         \
24532 	ledger_amount_t __LEDGER##_under_total;   \
24533 	ledger_amount_t __LEDGER##_under_max
24534 
24535 struct {
24536 	uint64_t        num_pmaps_checked;
24537 
24538 	LEDGER_DRIFT(phys_footprint);
24539 	LEDGER_DRIFT(internal);
24540 	LEDGER_DRIFT(internal_compressed);
24541 	LEDGER_DRIFT(external);
24542 	LEDGER_DRIFT(reusable);
24543 	LEDGER_DRIFT(iokit_mapped);
24544 	LEDGER_DRIFT(alternate_accounting);
24545 	LEDGER_DRIFT(alternate_accounting_compressed);
24546 	LEDGER_DRIFT(page_table);
24547 	LEDGER_DRIFT(purgeable_volatile);
24548 	LEDGER_DRIFT(purgeable_nonvolatile);
24549 	LEDGER_DRIFT(purgeable_volatile_compressed);
24550 	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
24551 	LEDGER_DRIFT(tagged_nofootprint);
24552 	LEDGER_DRIFT(tagged_footprint);
24553 	LEDGER_DRIFT(tagged_nofootprint_compressed);
24554 	LEDGER_DRIFT(tagged_footprint_compressed);
24555 	LEDGER_DRIFT(network_volatile);
24556 	LEDGER_DRIFT(network_nonvolatile);
24557 	LEDGER_DRIFT(network_volatile_compressed);
24558 	LEDGER_DRIFT(network_nonvolatile_compressed);
24559 	LEDGER_DRIFT(media_nofootprint);
24560 	LEDGER_DRIFT(media_footprint);
24561 	LEDGER_DRIFT(media_nofootprint_compressed);
24562 	LEDGER_DRIFT(media_footprint_compressed);
24563 	LEDGER_DRIFT(graphics_nofootprint);
24564 	LEDGER_DRIFT(graphics_footprint);
24565 	LEDGER_DRIFT(graphics_nofootprint_compressed);
24566 	LEDGER_DRIFT(graphics_footprint_compressed);
24567 	LEDGER_DRIFT(neural_nofootprint);
24568 	LEDGER_DRIFT(neural_footprint);
24569 	LEDGER_DRIFT(neural_nofootprint_compressed);
24570 	LEDGER_DRIFT(neural_footprint_compressed);
24571 	LEDGER_DRIFT(neural_nofootprint_total);
24572 } pmap_ledgers_drift;
24573 
24574 void
vm_map_pmap_check_ledgers(pmap_t pmap,ledger_t ledger,int pid,char * procname)24575 vm_map_pmap_check_ledgers(
24576 	pmap_t          pmap,
24577 	ledger_t        ledger,
24578 	int             pid,
24579 	char            *procname)
24580 {
24581 	ledger_amount_t bal;
24582 	boolean_t       do_panic;
24583 
24584 	do_panic = FALSE;
24585 
24586 	pmap_ledgers_drift.num_pmaps_checked++;
24587 
24588 #define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
24589 MACRO_BEGIN                                                             \
24590 	int panic_on_negative = TRUE;                                   \
24591 	ledger_get_balance(ledger,                                      \
24592 	                   task_ledgers.__LEDGER,                       \
24593 	                   &bal);                                       \
24594 	ledger_get_panic_on_negative(ledger,                            \
24595 	                             task_ledgers.__LEDGER,             \
24596 	                             &panic_on_negative);               \
24597 	if (bal != 0) {                                                 \
24598 	        if (panic_on_negative ||                                \
24599 	            (pmap_ledgers_panic &&                              \
24600 	             pmap_ledgers_panic_leeway > 0 &&                   \
24601 	             (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
24602 	              bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
24603 	                do_panic = TRUE;                                \
24604 	        }                                                       \
24605 	        printf("LEDGER BALANCE proc %d (%s) "                   \
24606 	               "\"%s\" = %lld\n",                               \
24607 	               pid, procname, #__LEDGER, bal);                  \
24608 	        if (bal > 0) {                                          \
24609 	                pmap_ledgers_drift.__LEDGER##_over++;           \
24610 	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
24611 	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
24612 	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
24613 	                }                                               \
24614 	        } else if (bal < 0) {                                   \
24615 	                pmap_ledgers_drift.__LEDGER##_under++;          \
24616 	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
24617 	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
24618 	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
24619 	                }                                               \
24620 	        }                                                       \
24621 	}                                                               \
24622 MACRO_END
24623 
24624 	LEDGER_CHECK_BALANCE(phys_footprint);
24625 	LEDGER_CHECK_BALANCE(internal);
24626 	LEDGER_CHECK_BALANCE(internal_compressed);
24627 	LEDGER_CHECK_BALANCE(external);
24628 	LEDGER_CHECK_BALANCE(reusable);
24629 	LEDGER_CHECK_BALANCE(iokit_mapped);
24630 	LEDGER_CHECK_BALANCE(alternate_accounting);
24631 	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
24632 	LEDGER_CHECK_BALANCE(page_table);
24633 	LEDGER_CHECK_BALANCE(purgeable_volatile);
24634 	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
24635 	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
24636 	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
24637 	LEDGER_CHECK_BALANCE(tagged_nofootprint);
24638 	LEDGER_CHECK_BALANCE(tagged_footprint);
24639 	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
24640 	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
24641 	LEDGER_CHECK_BALANCE(network_volatile);
24642 	LEDGER_CHECK_BALANCE(network_nonvolatile);
24643 	LEDGER_CHECK_BALANCE(network_volatile_compressed);
24644 	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
24645 	LEDGER_CHECK_BALANCE(media_nofootprint);
24646 	LEDGER_CHECK_BALANCE(media_footprint);
24647 	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
24648 	LEDGER_CHECK_BALANCE(media_footprint_compressed);
24649 	LEDGER_CHECK_BALANCE(graphics_nofootprint);
24650 	LEDGER_CHECK_BALANCE(graphics_footprint);
24651 	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
24652 	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
24653 	LEDGER_CHECK_BALANCE(neural_nofootprint);
24654 	LEDGER_CHECK_BALANCE(neural_footprint);
24655 	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
24656 	LEDGER_CHECK_BALANCE(neural_footprint_compressed);
24657 	LEDGER_CHECK_BALANCE(neural_nofootprint_total);
24658 
24659 	if (do_panic) {
24660 		if (pmap_ledgers_panic) {
24661 			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
24662 			    pmap, pid, procname);
24663 		} else {
24664 			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
24665 			    pmap, pid, procname);
24666 		}
24667 	}
24668 }
24669 
24670 void
vm_map_pmap_set_process(vm_map_t map,int pid,char * procname)24671 vm_map_pmap_set_process(
24672 	vm_map_t map,
24673 	int pid,
24674 	char *procname)
24675 {
24676 	pmap_set_process(vm_map_pmap(map), pid, procname);
24677 }
24678 
24679 #endif /* MACH_ASSERT */
24680 
24681 /**
24682  * Check if a given given map operation size is valid for the given map, taking
24683  * in to account whether or not the map operation has overridden the soft limit.
24684  *
24685  * This function is meant to be inlined wherever possible as it can, in some
24686  * modes, generates telemetry events which capture shallow backtraces. To
24687  * maximize the usefulness of this backtrace, we want to minize the depth at
24688  * which the backtrace is taken.
24689  */
24690 __attribute__((always_inline))
24691 bool
vm_map_is_map_size_valid(vm_map_t target_map,vm_size_t size,bool no_soft_limit)24692 vm_map_is_map_size_valid(
24693 	vm_map_t target_map,
24694 	vm_size_t size,
24695 	bool no_soft_limit)
24696 {
24697 #ifdef __x86_64__
24698 	// Do not enforce any additional limits on x64
24699 	(void)target_map;
24700 	(void)size;
24701 	(void)no_soft_limit;
24702 	return true;
24703 #else
24704 	if (__probable(target_map->pmap != kernel_pmap ||
24705 	    size < VM_KERNEL_SIMPLE_MAX_SIZE || no_soft_limit)) {
24706 		// Allocation size matches policy
24707 		return true;
24708 	}
24709 
24710 	switch (vm_map_kernel_alloc_limit_mode) {
24711 	default:
24712 	case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS:
24713 		return true;
24714 	case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP:
24715 		trap_telemetry_report_kernel_soft_error(
24716 			TRAP_TELEMETRY_KERNEL_SOFT_ERROR_VM_KERNEL_MAX_ALLOC_SIZE,
24717 			/* report_once_per_site */ false);
24718 		return true;
24719 	case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_REJECT:
24720 		return false;
24721 	case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_PANIC:
24722 		panic("1,000,000K ought to be enough for anybody "
24723 		    "(requested %lu bytes)", size);
24724 	}
24725 #endif /* __x86_64__ */
24726 }
24727