xref: /xnu-10002.81.5/osfmk/vm/vm_map.c (revision 5e3eaea39dcf651e66cb99ba7d70e32cc4a99587)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_map.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Virtual memory mapping module.
64  */
65 
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68 
69 #include <vm/vm_options.h>
70 
71 #include <libkern/OSAtomic.h>
72 
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83 
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 
91 #include <vm/cpm.h>
92 #include <vm/vm_compressor.h>
93 #include <vm/vm_compressor_pager.h>
94 #include <vm/vm_init.h>
95 #include <vm/vm_fault.h>
96 #include <vm/vm_map_internal.h>
97 #include <vm/vm_object.h>
98 #include <vm/vm_page.h>
99 #include <vm/vm_pageout.h>
100 #include <vm/pmap.h>
101 #include <vm/vm_kern.h>
102 #include <ipc/ipc_port.h>
103 #include <kern/sched_prim.h>
104 #include <kern/misc_protos.h>
105 
106 #include <mach/vm_map_server.h>
107 #include <mach/mach_host_server.h>
108 #include <vm/vm_memtag.h>
109 #include <vm/vm_protos.h>
110 #include <vm/vm_purgeable_internal.h>
111 #include <vm/vm_reclaim_internal.h>
112 
113 #include <vm/vm_protos.h>
114 #include <vm/vm_shared_region.h>
115 #include <vm/vm_map_store.h>
116 
117 #include <san/kasan.h>
118 
119 #include <sys/resource.h>
120 #include <sys/random.h>
121 #include <sys/codesign.h>
122 #include <sys/code_signing.h>
123 #include <sys/mman.h>
124 #include <sys/reboot.h>
125 #include <sys/kdebug_triage.h>
126 
127 #include <libkern/section_keywords.h>
128 
129 #if DEVELOPMENT || DEBUG
130 extern int proc_selfcsflags(void);
131 int vm_log_xnu_user_debug = 0;
132 int panic_on_unsigned_execute = 0;
133 int panic_on_mlock_failure = 0;
134 #endif /* DEVELOPMENT || DEBUG */
135 
136 #if MACH_ASSERT
137 int debug4k_filter = 0;
138 char debug4k_proc_name[1024] = "";
139 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
140 int debug4k_panic_on_misaligned_sharing = 0;
141 const char *debug4k_category_name[] = {
142 	"error",        /* 0 */
143 	"life",         /* 1 */
144 	"load",         /* 2 */
145 	"fault",        /* 3 */
146 	"copy",         /* 4 */
147 	"share",        /* 5 */
148 	"adjust",       /* 6 */
149 	"pmap",         /* 7 */
150 	"mementry",     /* 8 */
151 	"iokit",        /* 9 */
152 	"upl",          /* 10 */
153 	"exc",          /* 11 */
154 	"vfs"           /* 12 */
155 };
156 #endif /* MACH_ASSERT */
157 int debug4k_no_cow_copyin = 0;
158 
159 
160 #if __arm64__
161 extern const int fourk_binary_compatibility_unsafe;
162 extern const int fourk_binary_compatibility_allow_wx;
163 #endif /* __arm64__ */
164 extern void qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *));
165 extern int proc_selfpid(void);
166 extern char *proc_name_address(void *p);
167 extern char *proc_best_name(struct proc *p);
168 
169 #if VM_MAP_DEBUG_APPLE_PROTECT
170 int vm_map_debug_apple_protect = 0;
171 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
172 #if VM_MAP_DEBUG_FOURK
173 int vm_map_debug_fourk = 0;
174 #endif /* VM_MAP_DEBUG_FOURK */
175 
176 #if DEBUG || DEVELOPMENT
177 static TUNABLE(bool, vm_map_executable_immutable,
178     "vm_map_executable_immutable", true);
179 #else
180 #define vm_map_executable_immutable true
181 #endif
182 
183 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
184 
185 extern u_int32_t random(void);  /* from <libkern/libkern.h> */
186 /* Internal prototypes
187  */
188 
189 typedef struct vm_map_zap {
190 	vm_map_entry_t          vmz_head;
191 	vm_map_entry_t         *vmz_tail;
192 } *vm_map_zap_t;
193 
194 #define VM_MAP_ZAP_DECLARE(zap) \
195 	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
196 
197 static vm_map_entry_t   vm_map_entry_insert(
198 	vm_map_t                map,
199 	vm_map_entry_t          insp_entry,
200 	vm_map_offset_t         start,
201 	vm_map_offset_t         end,
202 	vm_object_t             object,
203 	vm_object_offset_t      offset,
204 	vm_map_kernel_flags_t   vmk_flags,
205 	boolean_t               needs_copy,
206 	vm_prot_t               cur_protection,
207 	vm_prot_t               max_protection,
208 	vm_inherit_t            inheritance,
209 	boolean_t               clear_map_aligned);
210 
211 static void vm_map_simplify_range(
212 	vm_map_t        map,
213 	vm_map_offset_t start,
214 	vm_map_offset_t end);   /* forward */
215 
216 static boolean_t        vm_map_range_check(
217 	vm_map_t        map,
218 	vm_map_offset_t start,
219 	vm_map_offset_t end,
220 	vm_map_entry_t  *entry);
221 
222 static void vm_map_submap_pmap_clean(
223 	vm_map_t        map,
224 	vm_map_offset_t start,
225 	vm_map_offset_t end,
226 	vm_map_t        sub_map,
227 	vm_map_offset_t offset);
228 
229 static void             vm_map_pmap_enter(
230 	vm_map_t                map,
231 	vm_map_offset_t         addr,
232 	vm_map_offset_t         end_addr,
233 	vm_object_t             object,
234 	vm_object_offset_t      offset,
235 	vm_prot_t               protection);
236 
237 static void             _vm_map_clip_end(
238 	struct vm_map_header    *map_header,
239 	vm_map_entry_t          entry,
240 	vm_map_offset_t         end);
241 
242 static void             _vm_map_clip_start(
243 	struct vm_map_header    *map_header,
244 	vm_map_entry_t          entry,
245 	vm_map_offset_t         start);
246 
247 static kmem_return_t vm_map_delete(
248 	vm_map_t        map,
249 	vm_map_offset_t start,
250 	vm_map_offset_t end,
251 	vmr_flags_t     flags,
252 	kmem_guard_t    guard,
253 	vm_map_zap_t    zap);
254 
255 static void             vm_map_copy_insert(
256 	vm_map_t        map,
257 	vm_map_entry_t  after_where,
258 	vm_map_copy_t   copy);
259 
260 static kern_return_t    vm_map_copy_overwrite_unaligned(
261 	vm_map_t        dst_map,
262 	vm_map_entry_t  entry,
263 	vm_map_copy_t   copy,
264 	vm_map_address_t start,
265 	boolean_t       discard_on_success);
266 
267 static kern_return_t    vm_map_copy_overwrite_aligned(
268 	vm_map_t        dst_map,
269 	vm_map_entry_t  tmp_entry,
270 	vm_map_copy_t   copy,
271 	vm_map_offset_t start,
272 	pmap_t          pmap);
273 
274 static kern_return_t    vm_map_copyin_kernel_buffer(
275 	vm_map_t        src_map,
276 	vm_map_address_t src_addr,
277 	vm_map_size_t   len,
278 	boolean_t       src_destroy,
279 	vm_map_copy_t   *copy_result);  /* OUT */
280 
281 static kern_return_t    vm_map_copyout_kernel_buffer(
282 	vm_map_t        map,
283 	vm_map_address_t *addr, /* IN/OUT */
284 	vm_map_copy_t   copy,
285 	vm_map_size_t   copy_size,
286 	boolean_t       overwrite,
287 	boolean_t       consume_on_success);
288 
289 static void             vm_map_fork_share(
290 	vm_map_t        old_map,
291 	vm_map_entry_t  old_entry,
292 	vm_map_t        new_map);
293 
294 static boolean_t        vm_map_fork_copy(
295 	vm_map_t        old_map,
296 	vm_map_entry_t  *old_entry_p,
297 	vm_map_t        new_map,
298 	int             vm_map_copyin_flags);
299 
300 static kern_return_t    vm_map_wire_nested(
301 	vm_map_t                   map,
302 	vm_map_offset_t            start,
303 	vm_map_offset_t            end,
304 	vm_prot_t                  caller_prot,
305 	vm_tag_t                   tag,
306 	boolean_t                  user_wire,
307 	pmap_t                     map_pmap,
308 	vm_map_offset_t            pmap_addr,
309 	ppnum_t                    *physpage_p);
310 
311 static kern_return_t    vm_map_unwire_nested(
312 	vm_map_t                   map,
313 	vm_map_offset_t            start,
314 	vm_map_offset_t            end,
315 	boolean_t                  user_wire,
316 	pmap_t                     map_pmap,
317 	vm_map_offset_t            pmap_addr);
318 
319 static kern_return_t    vm_map_overwrite_submap_recurse(
320 	vm_map_t                   dst_map,
321 	vm_map_offset_t            dst_addr,
322 	vm_map_size_t              dst_size);
323 
324 static kern_return_t    vm_map_copy_overwrite_nested(
325 	vm_map_t                   dst_map,
326 	vm_map_offset_t            dst_addr,
327 	vm_map_copy_t              copy,
328 	boolean_t                  interruptible,
329 	pmap_t                     pmap,
330 	boolean_t                  discard_on_success);
331 
332 static kern_return_t    vm_map_remap_extract(
333 	vm_map_t                map,
334 	vm_map_offset_t         addr,
335 	vm_map_size_t           size,
336 	boolean_t               copy,
337 	vm_map_copy_t           map_copy,
338 	vm_prot_t               *cur_protection,
339 	vm_prot_t               *max_protection,
340 	vm_inherit_t            inheritance,
341 	vm_map_kernel_flags_t   vmk_flags);
342 
343 static kern_return_t    vm_map_remap_range_allocate(
344 	vm_map_t                map,
345 	vm_map_address_t        *address,
346 	vm_map_size_t           size,
347 	vm_map_offset_t         mask,
348 	vm_map_kernel_flags_t   vmk_flags,
349 	vm_map_entry_t          *map_entry,
350 	vm_map_zap_t            zap_list);
351 
352 static void             vm_map_region_look_for_page(
353 	vm_map_t                   map,
354 	vm_map_offset_t            va,
355 	vm_object_t                object,
356 	vm_object_offset_t         offset,
357 	int                        max_refcnt,
358 	unsigned short             depth,
359 	vm_region_extended_info_t  extended,
360 	mach_msg_type_number_t count);
361 
362 static int              vm_map_region_count_obj_refs(
363 	vm_map_entry_t             entry,
364 	vm_object_t                object);
365 
366 
367 static kern_return_t    vm_map_willneed(
368 	vm_map_t        map,
369 	vm_map_offset_t start,
370 	vm_map_offset_t end);
371 
372 static kern_return_t    vm_map_reuse_pages(
373 	vm_map_t        map,
374 	vm_map_offset_t start,
375 	vm_map_offset_t end);
376 
377 static kern_return_t    vm_map_reusable_pages(
378 	vm_map_t        map,
379 	vm_map_offset_t start,
380 	vm_map_offset_t end);
381 
382 static kern_return_t    vm_map_can_reuse(
383 	vm_map_t        map,
384 	vm_map_offset_t start,
385 	vm_map_offset_t end);
386 
387 static kern_return_t    vm_map_random_address_for_size(
388 	vm_map_t                map,
389 	vm_map_offset_t        *address,
390 	vm_map_size_t           size,
391 	vm_map_kernel_flags_t   vmk_flags);
392 
393 
394 #if CONFIG_MAP_RANGES
395 
396 static vm_map_range_id_t vm_map_user_range_resolve(
397 	vm_map_t                map,
398 	mach_vm_address_t       addr,
399 	mach_vm_address_t       size,
400 	mach_vm_range_t         range);
401 
402 #endif /* CONFIG_MAP_RANGES */
403 #if MACH_ASSERT
404 static kern_return_t    vm_map_pageout(
405 	vm_map_t        map,
406 	vm_map_offset_t start,
407 	vm_map_offset_t end);
408 #endif /* MACH_ASSERT */
409 
410 kern_return_t vm_map_corpse_footprint_collect(
411 	vm_map_t        old_map,
412 	vm_map_entry_t  old_entry,
413 	vm_map_t        new_map);
414 void vm_map_corpse_footprint_collect_done(
415 	vm_map_t        new_map);
416 void vm_map_corpse_footprint_destroy(
417 	vm_map_t        map);
418 kern_return_t vm_map_corpse_footprint_query_page_info(
419 	vm_map_t        map,
420 	vm_map_offset_t va,
421 	int             *disposition_p);
422 void vm_map_footprint_query_page_info(
423 	vm_map_t        map,
424 	vm_map_entry_t  map_entry,
425 	vm_map_offset_t curr_s_offset,
426 	int             *disposition_p);
427 
428 #if CONFIG_MAP_RANGES
429 static void vm_map_range_map_init(void);
430 #endif /* CONFIG_MAP_RANGES */
431 
432 pid_t find_largest_process_vm_map_entries(void);
433 
434 extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code,
435     mach_exception_data_type_t subcode);
436 
437 /*
438  * Macros to copy a vm_map_entry. We must be careful to correctly
439  * manage the wired page count. vm_map_entry_copy() creates a new
440  * map entry to the same memory - the wired count in the new entry
441  * must be set to zero. vm_map_entry_copy_full() creates a new
442  * entry that is identical to the old entry.  This preserves the
443  * wire count; it's used for map splitting and zone changing in
444  * vm_map_copyout.
445  */
446 
447 static inline void
vm_map_entry_copy_csm_assoc(vm_map_t map __unused,vm_map_entry_t new __unused,vm_map_entry_t old __unused)448 vm_map_entry_copy_csm_assoc(
449 	vm_map_t map __unused,
450 	vm_map_entry_t new __unused,
451 	vm_map_entry_t old __unused)
452 {
453 #if CODE_SIGNING_MONITOR
454 	/* when code signing monitor is enabled, we want to reset on copy */
455 	new->csm_associated = FALSE;
456 #else
457 	/* when code signing monitor is not enabled, assert as a sanity check */
458 	assert(new->csm_associated == FALSE);
459 #endif
460 #if DEVELOPMENT || DEBUG
461 	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
462 		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] resetting vme_xnu_user_debug\n",
463 		    proc_selfpid(),
464 		    (get_bsdtask_info(current_task())
465 		    ? proc_name_address(get_bsdtask_info(current_task()))
466 		    : "?"),
467 		    __FUNCTION__, __LINE__,
468 		    map, new, new->vme_start, new->vme_end);
469 	}
470 #endif /* DEVELOPMENT || DEBUG */
471 	new->vme_xnu_user_debug = FALSE;
472 }
473 
474 /*
475  * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
476  * But for security reasons on some platforms, we don't want the
477  * new mapping to be "used for jit", so we reset the flag here.
478  */
479 static inline void
vm_map_entry_copy_code_signing(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old __unused)480 vm_map_entry_copy_code_signing(
481 	vm_map_t map,
482 	vm_map_entry_t new,
483 	vm_map_entry_t old __unused)
484 {
485 	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
486 		assert(new->used_for_jit == old->used_for_jit);
487 	} else {
488 		new->used_for_jit = FALSE;
489 	}
490 }
491 
492 static inline void
vm_map_entry_copy_full(vm_map_entry_t new,vm_map_entry_t old)493 vm_map_entry_copy_full(
494 	vm_map_entry_t new,
495 	vm_map_entry_t old)
496 {
497 #if MAP_ENTRY_CREATION_DEBUG
498 	btref_put(new->vme_creation_bt);
499 	btref_retain(old->vme_creation_bt);
500 #endif
501 #if MAP_ENTRY_INSERTION_DEBUG
502 	btref_put(new->vme_insertion_bt);
503 	btref_retain(old->vme_insertion_bt);
504 #endif
505 #if VM_BTLOG_TAGS
506 	/* Discard the btref that might be in the new entry */
507 	if (new->vme_kernel_object) {
508 		btref_put(new->vme_tag_btref);
509 	}
510 	/* Retain the btref in the old entry to account for its copy */
511 	if (old->vme_kernel_object) {
512 		btref_retain(old->vme_tag_btref);
513 	}
514 #endif /* VM_BTLOG_TAGS */
515 	*new = *old;
516 }
517 
518 static inline void
vm_map_entry_copy(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old)519 vm_map_entry_copy(
520 	vm_map_t map,
521 	vm_map_entry_t new,
522 	vm_map_entry_t old)
523 {
524 	vm_map_entry_copy_full(new, old);
525 
526 	new->is_shared = FALSE;
527 	new->needs_wakeup = FALSE;
528 	new->in_transition = FALSE;
529 	new->wired_count = 0;
530 	new->user_wired_count = 0;
531 	new->vme_permanent = FALSE;
532 	vm_map_entry_copy_code_signing(map, new, old);
533 	vm_map_entry_copy_csm_assoc(map, new, old);
534 	if (new->iokit_acct) {
535 		assertf(!new->use_pmap, "old %p new %p\n", old, new);
536 		new->iokit_acct = FALSE;
537 		new->use_pmap = TRUE;
538 	}
539 	new->vme_resilient_codesign = FALSE;
540 	new->vme_resilient_media = FALSE;
541 	new->vme_atomic = FALSE;
542 	new->vme_no_copy_on_read = FALSE;
543 }
544 
545 /*
546  * Normal lock_read_to_write() returns FALSE/0 on failure.
547  * These functions evaluate to zero on success and non-zero value on failure.
548  */
549 __attribute__((always_inline))
550 int
vm_map_lock_read_to_write(vm_map_t map)551 vm_map_lock_read_to_write(vm_map_t map)
552 {
553 	if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
554 		DTRACE_VM(vm_map_lock_upgrade);
555 		return 0;
556 	}
557 	return 1;
558 }
559 
560 __attribute__((always_inline))
561 boolean_t
vm_map_try_lock(vm_map_t map)562 vm_map_try_lock(vm_map_t map)
563 {
564 	if (lck_rw_try_lock_exclusive(&(map)->lock)) {
565 		DTRACE_VM(vm_map_lock_w);
566 		return TRUE;
567 	}
568 	return FALSE;
569 }
570 
571 __attribute__((always_inline))
572 boolean_t
vm_map_try_lock_read(vm_map_t map)573 vm_map_try_lock_read(vm_map_t map)
574 {
575 	if (lck_rw_try_lock_shared(&(map)->lock)) {
576 		DTRACE_VM(vm_map_lock_r);
577 		return TRUE;
578 	}
579 	return FALSE;
580 }
581 
582 /*!
583  * @function kdp_vm_map_is_acquired_exclusive
584  *
585  * @abstract
586  * Checks if vm map is acquired exclusive.
587  *
588  * @discussion
589  * NOT SAFE: To be used only by kernel debugger.
590  *
591  * @param map map to check
592  *
593  * @returns TRUE if the map is acquired exclusively.
594  */
595 boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)596 kdp_vm_map_is_acquired_exclusive(vm_map_t map)
597 {
598 	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
599 }
600 
601 /*
602  * Routines to get the page size the caller should
603  * use while inspecting the target address space.
604  * Use the "_safely" variant if the caller is dealing with a user-provided
605  * array whose size depends on the page size, to avoid any overflow or
606  * underflow of a user-allocated buffer.
607  */
608 int
vm_self_region_page_shift_safely(vm_map_t target_map)609 vm_self_region_page_shift_safely(
610 	vm_map_t target_map)
611 {
612 	int effective_page_shift = 0;
613 
614 	if (PAGE_SIZE == (4096)) {
615 		/* x86_64 and 4k watches: always use 4k */
616 		return PAGE_SHIFT;
617 	}
618 	/* did caller provide an explicit page size for this thread to use? */
619 	effective_page_shift = thread_self_region_page_shift();
620 	if (effective_page_shift) {
621 		/* use the explicitly-provided page size */
622 		return effective_page_shift;
623 	}
624 	/* no explicit page size: use the caller's page size... */
625 	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
626 	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
627 		/* page size match: safe to use */
628 		return effective_page_shift;
629 	}
630 	/* page size mismatch */
631 	return -1;
632 }
633 int
vm_self_region_page_shift(vm_map_t target_map)634 vm_self_region_page_shift(
635 	vm_map_t target_map)
636 {
637 	int effective_page_shift;
638 
639 	effective_page_shift = vm_self_region_page_shift_safely(target_map);
640 	if (effective_page_shift == -1) {
641 		/* no safe value but OK to guess for caller */
642 		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
643 		    VM_MAP_PAGE_SHIFT(target_map));
644 	}
645 	return effective_page_shift;
646 }
647 
648 
649 /*
650  *	Decide if we want to allow processes to execute from their data or stack areas.
651  *	override_nx() returns true if we do.  Data/stack execution can be enabled independently
652  *	for 32 and 64 bit processes.  Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
653  *	or allow_stack_exec to enable data execution for that type of data area for that particular
654  *	ABI (or both by or'ing the flags together).  These are initialized in the architecture
655  *	specific pmap files since the default behavior varies according to architecture.  The
656  *	main reason it varies is because of the need to provide binary compatibility with old
657  *	applications that were written before these restrictions came into being.  In the old
658  *	days, an app could execute anything it could read, but this has slowly been tightened
659  *	up over time.  The default behavior is:
660  *
661  *	32-bit PPC apps		may execute from both stack and data areas
662  *	32-bit Intel apps	may execute from data areas but not stack
663  *	64-bit PPC/Intel apps	may not execute from either data or stack
664  *
665  *	An application on any architecture may override these defaults by explicitly
666  *	adding PROT_EXEC permission to the page in question with the mprotect(2)
667  *	system call.  This code here just determines what happens when an app tries to
668  *      execute from a page that lacks execute permission.
669  *
670  *	Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
671  *	default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
672  *	a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
673  *	execution from data areas for a particular binary even if the arch normally permits it. As
674  *	a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
675  *	to support some complicated use cases, notably browsers with out-of-process plugins that
676  *	are not all NX-safe.
677  */
678 
679 extern int allow_data_exec, allow_stack_exec;
680 
681 int
override_nx(vm_map_t map,uint32_t user_tag)682 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
683 {
684 	int current_abi;
685 
686 	if (map->pmap == kernel_pmap) {
687 		return FALSE;
688 	}
689 
690 	/*
691 	 * Determine if the app is running in 32 or 64 bit mode.
692 	 */
693 
694 	if (vm_map_is_64bit(map)) {
695 		current_abi = VM_ABI_64;
696 	} else {
697 		current_abi = VM_ABI_32;
698 	}
699 
700 	/*
701 	 * Determine if we should allow the execution based on whether it's a
702 	 * stack or data area and the current architecture.
703 	 */
704 
705 	if (user_tag == VM_MEMORY_STACK) {
706 		return allow_stack_exec & current_abi;
707 	}
708 
709 	return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
710 }
711 
712 
713 /*
714  *	Virtual memory maps provide for the mapping, protection,
715  *	and sharing of virtual memory objects.  In addition,
716  *	this module provides for an efficient virtual copy of
717  *	memory from one map to another.
718  *
719  *	Synchronization is required prior to most operations.
720  *
721  *	Maps consist of an ordered doubly-linked list of simple
722  *	entries; a single hint is used to speed up lookups.
723  *
724  *	Sharing maps have been deleted from this version of Mach.
725  *	All shared objects are now mapped directly into the respective
726  *	maps.  This requires a change in the copy on write strategy;
727  *	the asymmetric (delayed) strategy is used for shared temporary
728  *	objects instead of the symmetric (shadow) strategy.  All maps
729  *	are now "top level" maps (either task map, kernel map or submap
730  *	of the kernel map).
731  *
732  *	Since portions of maps are specified by start/end addresses,
733  *	which may not align with existing map entries, all
734  *	routines merely "clip" entries to these start/end values.
735  *	[That is, an entry is split into two, bordering at a
736  *	start or end value.]  Note that these clippings may not
737  *	always be necessary (as the two resulting entries are then
738  *	not changed); however, the clipping is done for convenience.
739  *	No attempt is currently made to "glue back together" two
740  *	abutting entries.
741  *
742  *	The symmetric (shadow) copy strategy implements virtual copy
743  *	by copying VM object references from one map to
744  *	another, and then marking both regions as copy-on-write.
745  *	It is important to note that only one writeable reference
746  *	to a VM object region exists in any map when this strategy
747  *	is used -- this means that shadow object creation can be
748  *	delayed until a write operation occurs.  The symmetric (delayed)
749  *	strategy allows multiple maps to have writeable references to
750  *	the same region of a vm object, and hence cannot delay creating
751  *	its copy objects.  See vm_object_copy_quickly() in vm_object.c.
752  *	Copying of permanent objects is completely different; see
753  *	vm_object_copy_strategically() in vm_object.c.
754  */
755 
756 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
757 
758 #define VM_MAP_ZONE_NAME        "maps"
759 #define VM_MAP_ZFLAGS           (ZC_NOENCRYPT | ZC_VM)
760 
761 #define VM_MAP_ENTRY_ZONE_NAME  "VM map entries"
762 #define VM_MAP_ENTRY_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
763 
764 #define VM_MAP_HOLES_ZONE_NAME  "VM map holes"
765 #define VM_MAP_HOLES_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
766 
767 /*
768  * Asserts that a vm_map_copy object is coming from the
769  * vm_map_copy_zone to ensure that it isn't a fake constructed
770  * anywhere else.
771  */
772 void
vm_map_copy_require(struct vm_map_copy * copy)773 vm_map_copy_require(struct vm_map_copy *copy)
774 {
775 	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
776 }
777 
778 /*
779  *	vm_map_require:
780  *
781  *	Ensures that the argument is memory allocated from the genuine
782  *	vm map zone. (See zone_id_require_allow_foreign).
783  */
784 void
vm_map_require(vm_map_t map)785 vm_map_require(vm_map_t map)
786 {
787 	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
788 }
789 
790 #define VM_MAP_EARLY_COUNT_MAX         16
791 static __startup_data vm_offset_t      map_data;
792 static __startup_data vm_size_t        map_data_size;
793 static __startup_data vm_offset_t      kentry_data;
794 static __startup_data vm_size_t        kentry_data_size;
795 static __startup_data vm_offset_t      map_holes_data;
796 static __startup_data vm_size_t        map_holes_data_size;
797 static __startup_data vm_map_t        *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
798 static __startup_data uint32_t         early_map_count;
799 
800 #if XNU_TARGET_OS_OSX
801 #define         NO_COALESCE_LIMIT  ((1024 * 128) - 1)
802 #else /* XNU_TARGET_OS_OSX */
803 #define         NO_COALESCE_LIMIT  0
804 #endif /* XNU_TARGET_OS_OSX */
805 
806 /* Skip acquiring locks if we're in the midst of a kernel core dump */
807 unsigned int not_in_kdp = 1;
808 
809 unsigned int vm_map_set_cache_attr_count = 0;
810 
811 kern_return_t
vm_map_set_cache_attr(vm_map_t map,vm_map_offset_t va)812 vm_map_set_cache_attr(
813 	vm_map_t        map,
814 	vm_map_offset_t va)
815 {
816 	vm_map_entry_t  map_entry;
817 	vm_object_t     object;
818 	kern_return_t   kr = KERN_SUCCESS;
819 
820 	vm_map_lock_read(map);
821 
822 	if (!vm_map_lookup_entry(map, va, &map_entry) ||
823 	    map_entry->is_sub_map) {
824 		/*
825 		 * that memory is not properly mapped
826 		 */
827 		kr = KERN_INVALID_ARGUMENT;
828 		goto done;
829 	}
830 	object = VME_OBJECT(map_entry);
831 
832 	if (object == VM_OBJECT_NULL) {
833 		/*
834 		 * there should be a VM object here at this point
835 		 */
836 		kr = KERN_INVALID_ARGUMENT;
837 		goto done;
838 	}
839 	vm_object_lock(object);
840 	object->set_cache_attr = TRUE;
841 	vm_object_unlock(object);
842 
843 	vm_map_set_cache_attr_count++;
844 done:
845 	vm_map_unlock_read(map);
846 
847 	return kr;
848 }
849 
850 
851 #if CONFIG_CODE_DECRYPTION
852 /*
853  * vm_map_apple_protected:
854  * This remaps the requested part of the object with an object backed by
855  * the decrypting pager.
856  * crypt_info contains entry points and session data for the crypt module.
857  * The crypt_info block will be copied by vm_map_apple_protected. The data structures
858  * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
859  */
860 kern_return_t
vm_map_apple_protected(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_object_offset_t      crypto_backing_offset,
	struct pager_crypt_info *crypt_info,
	uint32_t                cryptid)
{
	/*
	 * Replace the mappings of [start, end) in "map" with mappings backed
	 * by "apple protect" pagers, so that the pages get decrypted with
	 * "crypt_info" when paged in.  The map lock is taken and dropped
	 * around each map entry being processed.
	 */
	boolean_t       map_locked;
	kern_return_t   kr;
	vm_map_entry_t  map_entry;
	struct vm_map_entry tmp_entry;
	memory_object_t unprotected_mem_obj;
	vm_object_t     protected_object;
	vm_map_offset_t map_addr;
	vm_map_offset_t start_aligned, end_aligned;
	vm_object_offset_t      crypto_start, crypto_end;
	boolean_t       cache_pager;

	map_locked = FALSE;
	unprotected_mem_obj = MEMORY_OBJECT_NULL;

	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}
	/* align to both the kernel page size and the map's own page size */
	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));

#if __arm64__
	/*
	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
	 * so we might have to loop and establish up to 3 mappings:
	 *
	 * + the first 16K-page, which might overlap with the previous
	 *   4K-aligned mapping,
	 * + the center,
	 * + the last 16K-page, which might overlap with the next
	 *   4K-aligned mapping.
	 * Each of these mapping might be backed by a vnode pager (if
	 * properly page-aligned) or a "fourk_pager", itself backed by a
	 * vnode pager (if 4K-aligned but not page-aligned).
	 */
#endif /* __arm64__ */

	map_addr = start_aligned;
	for (map_addr = start_aligned;
	    map_addr < end;
	    map_addr = tmp_entry.vme_end) {
		vm_map_lock(map);
		map_locked = TRUE;

		/* lookup the protected VM object */
		if (!vm_map_lookup_entry(map,
		    map_addr,
		    &map_entry) ||
		    map_entry->is_sub_map ||
		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
			/* that memory is not properly mapped */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/* ensure mapped memory is mapped as executable,
		 * except for the model decryption flow */
		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
		    !(map_entry->protection & VM_PROT_EXECUTE)) {
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/* get the protected object to be decrypted */
		protected_object = VME_OBJECT(map_entry);
		if (protected_object == VM_OBJECT_NULL) {
			/*
			 * there should be a VM object here at this point
			 * (the lookup above already rejected VM_OBJECT_NULL,
			 * so this is a belt-and-suspenders re-check)
			 */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}
		/* ensure protected object stays alive while map is unlocked */
		vm_object_reference(protected_object);

		/* limit the map entry to the area we want to cover */
		vm_map_clip_start(map, map_entry, start_aligned);
		vm_map_clip_end(map, map_entry, end_aligned);

		tmp_entry = *map_entry;
		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
		vm_map_unlock(map);
		map_locked = FALSE;

		/*
		 * This map entry might be only partially encrypted
		 * (if not fully "page-aligned").
		 *
		 * NOTE(review): on misalignment, "kr" is set to
		 * KERN_INVALID_ADDRESS below but there is no "goto done",
		 * so the value is later overwritten by
		 * vm_map_enter_mem_object(); confirm this fall-through is
		 * intentional.
		 */
		crypto_start = 0;
		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
		if (tmp_entry.vme_start < start) {
			if (tmp_entry.vme_start != start_aligned) {
				kr = KERN_INVALID_ADDRESS;
			}
			crypto_start += (start - tmp_entry.vme_start);
		}
		if (tmp_entry.vme_end > end) {
			if (tmp_entry.vme_end != end_aligned) {
				kr = KERN_INVALID_ADDRESS;
			}
			crypto_end -= (tmp_entry.vme_end - end);
		}

		/*
		 * This "extra backing offset" is needed to get the decryption
		 * routine to use the right key.  It adjusts for the possibly
		 * relative offset of an interposed "4K" pager...
		 */
		if (crypto_backing_offset == (vm_object_offset_t) -1) {
			crypto_backing_offset = VME_OFFSET(&tmp_entry);
		}

		cache_pager = TRUE;
#if XNU_TARGET_OS_OSX
		if (vm_map_is_alien(map)) {
			cache_pager = FALSE;
		}
#endif /* XNU_TARGET_OS_OSX */

		/*
		 * Lookup (and create if necessary) the protected memory object
		 * matching that VM object.
		 * If successful, this also grabs a reference on the memory object,
		 * to guarantee that it doesn't go away before we get a chance to map
		 * it.
		 */
		unprotected_mem_obj = apple_protect_pager_setup(
			protected_object,
			VME_OFFSET(&tmp_entry),
			crypto_backing_offset,
			crypt_info,
			crypto_start,
			crypto_end,
			cache_pager);

		/* release extra ref on protected object */
		vm_object_deallocate(protected_object);

		if (unprotected_mem_obj == NULL) {
			kr = KERN_FAILURE;
			goto done;
		}

		/* can overwrite an immutable mapping */
		vm_map_kernel_flags_t vmk_flags = {
			.vmf_fixed = true,
			.vmf_overwrite = true,
			.vmkf_overwrite_immutable = true,
		};
#if __arm64__
		if (tmp_entry.used_for_jit &&
		    (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT ||
		    PAGE_SHIFT != FOURK_PAGE_SHIFT) &&
		    fourk_binary_compatibility_unsafe &&
		    fourk_binary_compatibility_allow_wx) {
			printf("** FOURK_COMPAT [%d]: "
			    "allowing write+execute at 0x%llx\n",
			    proc_selfpid(), tmp_entry.vme_start);
			vmk_flags.vmkf_map_jit = TRUE;
		}
#endif /* __arm64__ */

		/* map this memory object in place of the current one */
		map_addr = tmp_entry.vme_start;
		kr = vm_map_enter_mem_object(map,
		    &map_addr,
		    (tmp_entry.vme_end -
		    tmp_entry.vme_start),
		    (mach_vm_offset_t) 0,
		    vmk_flags,
		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
		    0,
		    TRUE,
		    tmp_entry.protection,
		    tmp_entry.max_protection,
		    tmp_entry.inheritance);
		assertf(kr == KERN_SUCCESS,
		    "kr = 0x%x\n", kr);
		assertf(map_addr == tmp_entry.vme_start,
		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
		    (uint64_t)map_addr,
		    (uint64_t) tmp_entry.vme_start,
		    &tmp_entry);

#if VM_MAP_DEBUG_APPLE_PROTECT
		if (vm_map_debug_apple_protect) {
			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
			    " backing:[object:%p,offset:0x%llx,"
			    "crypto_backing_offset:0x%llx,"
			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
			    map,
			    (uint64_t) map_addr,
			    (uint64_t) (map_addr + (tmp_entry.vme_end -
			    tmp_entry.vme_start)),
			    unprotected_mem_obj,
			    protected_object,
			    VME_OFFSET(&tmp_entry),
			    crypto_backing_offset,
			    crypto_start,
			    crypto_end);
		}
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */

		/*
		 * Release the reference obtained by
		 * apple_protect_pager_setup().
		 * The mapping (if it succeeded) is now holding a reference on
		 * the memory object.
		 */
		memory_object_deallocate(unprotected_mem_obj);
		unprotected_mem_obj = MEMORY_OBJECT_NULL;

		/* continue with next map entry */
		crypto_backing_offset += (tmp_entry.vme_end -
		    tmp_entry.vme_start);
		crypto_backing_offset -= crypto_start;
	}
	kr = KERN_SUCCESS;

done:
	if (map_locked) {
		vm_map_unlock(map);
	}
	return kr;
}
1093 #endif  /* CONFIG_CODE_DECRYPTION */
1094 
1095 
1096 LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
1097 LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
1098 LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);
1099 
1100 #if XNU_TARGET_OS_OSX
1101 #define MALLOC_NO_COW_DEFAULT 1
1102 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
1103 #else /* XNU_TARGET_OS_OSX */
1104 #define MALLOC_NO_COW_DEFAULT 1
1105 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
1106 #endif /* XNU_TARGET_OS_OSX */
1107 TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
1108 TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
1109 uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
1110 #if DEBUG
1111 int vm_check_map_sanity = 0;
1112 #endif
1113 
1114 /*
1115  *	vm_map_init:
1116  *
1117  *	Initialize the vm_map module.  Must be called before
1118  *	any other vm_map routines.
1119  *
1120  *	Map and entry structures are allocated from zones -- we must
1121  *	initialize those zones.
1122  *
1123  *	There are three zones of interest:
1124  *
1125  *	vm_map_zone:		used to allocate maps.
1126  *	vm_map_entry_zone:	used to allocate map entries.
1127  *
1128  *	LP32:
1129  *	vm_map_entry_reserved_zone:     fallback zone for kernel map entries
1130  *
1131  *	The kernel allocates map entries from a special zone that is initially
1132  *	"crammed" with memory.  It would be difficult (perhaps impossible) for
1133  *	the kernel to allocate more memory to a entry zone when it became
1134  *	empty since the very act of allocating memory implies the creation
1135  *	of a new entry.
1136  */
__startup_func
void
vm_map_init(void)
{

#if MACH_ASSERT
	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
	    sizeof(debug4k_filter));
#endif /* MACH_ASSERT */

	/* zone backing struct _vm_map allocations */
	zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);

	/*
	 * Don't quarantine because we always need elements available
	 * Disallow GC on this zone... to aid the GC.
	 */
	zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
		/* keep a per-CPU-scaled reserve of entries available */
		z->z_elems_rsv = (uint16_t)(32 *
		(ml_early_cpu_max_number() + 1));
	});

	zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
	});

	zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
	    ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);

	/*
	 * Add the stolen memory to zones, adjust zone size and stolen counts.
	 */
	zone_cram_early(vm_map_zone, map_data, map_data_size);
	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
	/* NOTE(review): "boostrap" typo preserved — the log string may be
	 * matched by external tooling */
	printf("VM boostrap: %d maps, %d entries and %d holes available\n",
	    zone_count_free(vm_map_zone),
	    zone_count_free(vm_map_entry_zone),
	    zone_count_free(vm_map_holes_zone));

	/*
	 * Since these are covered by zones, remove them from stolen page accounting.
	 */
	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));

#if VM_MAP_DEBUG_APPLE_PROTECT
	PE_parse_boot_argn("vm_map_debug_apple_protect",
	    &vm_map_debug_apple_protect,
	    sizeof(vm_map_debug_apple_protect));
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */
#if VM_MAP_DEBUG_APPLE_FOURK
	PE_parse_boot_argn("vm_map_debug_fourk",
	    &vm_map_debug_fourk,
	    sizeof(vm_map_debug_fourk));
#endif /* VM_MAP_DEBUG_APPLE_FOURK */

	if (malloc_no_cow) {
		/* build the set of VM tags whose mappings must never be COW'd */
		vm_memory_malloc_no_cow_mask = 0ULL;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
#if XNU_TARGET_OS_OSX
		/*
		 * On macOS, keep copy-on-write for MALLOC_LARGE because
		 * realloc() may use vm_copy() to transfer the old contents
		 * to the new location.
		 */
#else /* XNU_TARGET_OS_OSX */
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
#endif /* XNU_TARGET_OS_OSX */
//		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
//		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
//		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
		    &vm_memory_malloc_no_cow_mask,
		    sizeof(vm_memory_malloc_no_cow_mask));
	}

#if CONFIG_MAP_RANGES
	vm_map_range_map_init();
#endif /* CONFIG_MAP_RANGES */

#if DEBUG
	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
	if (vm_check_map_sanity) {
		kprintf("VM sanity checking enabled\n");
	} else {
		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
	}
#endif /* DEBUG */

#if DEVELOPMENT || DEBUG
	PE_parse_boot_argn("panic_on_unsigned_execute",
	    &panic_on_unsigned_execute,
	    sizeof(panic_on_unsigned_execute));
	PE_parse_boot_argn("panic_on_mlock_failure",
	    &panic_on_mlock_failure,
	    sizeof(panic_on_mlock_failure));
#endif /* DEVELOPMENT || DEBUG */
}
1245 
__startup_func
static void
vm_map_steal_memory(void)
{
	/*
	 * We need to reserve enough memory to support bootstrapping VM maps
	 * and the zone subsystem.
	 *
	 * The VM Maps that need to function before zones can support them
	 * are the ones registered with vm_map_will_allocate_early_map(),
	 * which are:
	 * - the kernel map
	 * - the various submaps used by zones (pgz, meta, ...)
	 *
	 * We also need enough entries and holes to support them
	 * until zone_metadata_init() is called, which is when
	 * the zone allocator becomes capable of expanding dynamically.
	 *
	 * We need:
	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
	 * - To allow for 3-4 entries per map, but the kernel map
	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
	 *   to describe the submaps, so double it (and make it 8x too)
	 * - To allow for holes between entries,
	 *   hence needs the same budget as entries
	 */
	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
	    VM_MAP_EARLY_COUNT_MAX);

	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	/*
	 * Steal a contiguous range of memory so that a simple range check
	 * can validate early addresses being freed/crammed to these
	 * zones
	 */
	map_data       = zone_early_mem_init(map_data_size + kentry_data_size +
	    map_holes_data_size);
	/* the three regions are carved back-to-back out of that range */
	kentry_data    = map_data + map_data_size;
	map_holes_data = kentry_data + kentry_data_size;
}
1294 STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1295 
1296 __startup_func
1297 static void
vm_kernel_boostraped(void)1298 vm_kernel_boostraped(void)
1299 {
1300 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1301 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1302 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1303 
1304 	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1305 	    zone_count_free(vm_map_zone),
1306 	    zone_count_free(vm_map_entry_zone),
1307 	    zone_count_free(vm_map_holes_zone));
1308 }
1309 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1310 
1311 void
vm_map_disable_hole_optimization(vm_map_t map)1312 vm_map_disable_hole_optimization(vm_map_t map)
1313 {
1314 	vm_map_entry_t  head_entry, hole_entry, next_hole_entry;
1315 
1316 	if (map->holelistenabled) {
1317 		head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1318 
1319 		while (hole_entry != NULL) {
1320 			next_hole_entry = hole_entry->vme_next;
1321 
1322 			hole_entry->vme_next = NULL;
1323 			hole_entry->vme_prev = NULL;
1324 			zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1325 
1326 			if (next_hole_entry == head_entry) {
1327 				hole_entry = NULL;
1328 			} else {
1329 				hole_entry = next_hole_entry;
1330 			}
1331 		}
1332 
1333 		map->holes_list = NULL;
1334 		map->holelistenabled = FALSE;
1335 
1336 		map->first_free = vm_map_first_entry(map);
1337 		SAVE_HINT_HOLE_WRITE(map, NULL);
1338 	}
1339 }
1340 
1341 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1342 vm_kernel_map_is_kernel(vm_map_t map)
1343 {
1344 	return map->pmap == kernel_pmap;
1345 }
1346 
1347 /*
1348  *	vm_map_create:
1349  *
1350  *	Creates and returns a new empty VM map with
1351  *	the given physical map structure, and having
1352  *	the given lower and upper address bounds.
1353  */
1354 
1355 extern vm_map_t vm_map_create_external(
1356 	pmap_t                  pmap,
1357 	vm_map_offset_t         min_off,
1358 	vm_map_offset_t         max_off,
1359 	boolean_t               pageable);
1360 
1361 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1362 vm_map_create_external(
1363 	pmap_t                  pmap,
1364 	vm_map_offset_t         min,
1365 	vm_map_offset_t         max,
1366 	boolean_t               pageable)
1367 {
1368 	vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1369 
1370 	if (pageable) {
1371 		options |= VM_MAP_CREATE_PAGEABLE;
1372 	}
1373 	return vm_map_create_options(pmap, min, max, options);
1374 }
1375 
1376 __startup_func
1377 void
vm_map_will_allocate_early_map(vm_map_t * owner)1378 vm_map_will_allocate_early_map(vm_map_t *owner)
1379 {
1380 	if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1381 		panic("VM_MAP_EARLY_COUNT_MAX is too low");
1382 	}
1383 
1384 	early_map_owners[early_map_count++] = owner;
1385 }
1386 
1387 __startup_func
1388 void
vm_map_relocate_early_maps(vm_offset_t delta)1389 vm_map_relocate_early_maps(vm_offset_t delta)
1390 {
1391 	for (uint32_t i = 0; i < early_map_count; i++) {
1392 		vm_address_t addr = (vm_address_t)*early_map_owners[i];
1393 
1394 		*early_map_owners[i] = (vm_map_t)(addr + delta);
1395 	}
1396 
1397 	early_map_count = ~0u;
1398 }
1399 
1400 /*
1401  *	Routine:	vm_map_relocate_early_elem
1402  *
1403  *	Purpose:
1404  *		Early zone elements are allocated in a temporary part
1405  *		of the address space.
1406  *
1407  *		Once the zones live in their final place, the early
1408  *		VM maps, map entries and map holes need to be relocated.
1409  *
1410  *		It involves rewriting any vm_map_t, vm_map_entry_t or
1411  *		pointers to vm_map_links. Other pointers to other types
1412  *		are fine.
1413  *
1414  *		Fortunately, pointers to those types are self-contained
1415  *		in those zones, _except_ for pointers to VM maps,
1416  *		which are tracked during early boot and fixed with
1417  *		vm_map_relocate_early_maps().
1418  */
__startup_func
void
vm_map_relocate_early_elem(
	uint32_t                zone_id,
	vm_offset_t             new_addr,
	vm_offset_t             delta)
{
	/*
	 * Slide the pointer stored in "field" of the element now living
	 * at "new_addr" by "delta"; NULL pointers are left alone.
	 */
#define relocate(type_t, field)  ({ \
	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field;   \
	if (*__field) {                                                        \
	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta);  \
	}                                                                      \
})

	/* only the three early VM zones are ever relocated */
	switch (zone_id) {
	case ZONE_ID_VM_MAP:
	case ZONE_ID_VM_MAP_ENTRY:
	case ZONE_ID_VM_MAP_HOLES:
		break;

	default:
		panic("Unexpected zone ID %d", zone_id);
	}

	if (zone_id == ZONE_ID_VM_MAP) {
		relocate(vm_map_t, hdr.links.prev);
		relocate(vm_map_t, hdr.links.next);
		/* early maps all used the kernel pmap */
		((vm_map_t)new_addr)->pmap = kernel_pmap;
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
#endif /* VM_MAP_STORE_USE_RB */
		relocate(vm_map_t, hint);
		relocate(vm_map_t, hole_hint);
		relocate(vm_map_t, first_free);
		return;
	}

	/* entries and holes both start with a vm_map_links header */
	relocate(struct vm_map_links *, prev);
	relocate(struct vm_map_links *, next);

	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_entry_t, store.entry.rbe_left);
		relocate(vm_map_entry_t, store.entry.rbe_right);
		relocate(vm_map_entry_t, store.entry.rbe_parent);
#endif /* VM_MAP_STORE_USE_RB */
		if (((vm_map_entry_t)new_addr)->is_sub_map) {
			/* no object to relocate because we haven't made any */
			/* vme_submap is stored shifted, so shift the delta too */
			((vm_map_entry_t)new_addr)->vme_submap +=
			    delta >> VME_SUBMAP_SHIFT;
		}
#if MAP_ENTRY_CREATION_DEBUG
		relocate(vm_map_entry_t, vme_creation_maphdr);
#endif /* MAP_ENTRY_CREATION_DEBUG */
	}

#undef relocate
}
1477 
vm_map_t
vm_map_create_options(
	pmap_t                  pmap,
	vm_map_offset_t         min,
	vm_map_offset_t         max,
	vm_map_create_options_t options)
{
	/*
	 * Allocate and initialize a new VM map over [min, max) backed by
	 * "pmap", configured per "options".  Returns the map with one
	 * reference held; cannot fail (Z_NOFAIL).
	 */
	vm_map_t result;

#if DEBUG || DEVELOPMENT
	/*
	 * Before zalloc is up, every map must have been announced via
	 * vm_map_will_allocate_early_map() so it can be relocated later.
	 */
	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
		if (early_map_count != ~0u && early_map_count !=
		    zone_count_allocated(vm_map_zone) + 1) {
			panic("allocating %dth early map, owner not known",
			    zone_count_allocated(vm_map_zone) + 1);
		}
		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
			panic("allocating %dth early map for non kernel pmap",
			    early_map_count);
		}
	}
#endif /* DEBUG || DEVELOPMENT */

	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);

	vm_map_store_init(&result->hdr);
	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
	vm_map_set_page_shift(result, PAGE_SHIFT);

	result->size_limit      = RLIM_INFINITY;        /* default unlimited */
	result->data_limit      = RLIM_INFINITY;        /* default unlimited */
	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
	result->pmap = pmap;
	result->min_offset = min;
	result->max_offset = max;
	/* an empty map: hints point back at the map header itself */
	result->first_free = vm_map_to_entry(result);
	result->hint = vm_map_to_entry(result);

	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
		assert(pmap == kernel_pmap);
		result->never_faults = true;
	}

	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
		result->has_corpse_footprint = true;
	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
		struct vm_map_links *hole_entry;

		/* seed the hole list with one hole spanning the whole map */
		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
		hole_entry->start = min;
#if defined(__arm64__)
		hole_entry->end = result->max_offset;
#else
		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
#endif
		result->holes_list = result->hole_hint = hole_entry;
		/* circular list of one */
		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
		result->holelistenabled = true;
	}

	vm_map_lock_init(result);

	return result;
}
1544 
1545 /*
1546  * Adjusts a submap that was made by kmem_suballoc()
1547  * before it knew where it would be mapped,
1548  * so that it has the right min/max offsets.
1549  *
1550  * We do not need to hold any locks:
1551  * only the caller knows about this map,
1552  * and it is not published on any entry yet.
1553  */
static void
vm_map_adjust_offsets(
	vm_map_t                map,
	vm_map_offset_t         min_off,
	vm_map_offset_t         max_off)
{
	/* the map must still be pristine: empty, zero-based, privately held */
	assert(map->min_offset == 0);
	assert(map->max_offset == max_off - min_off);
	assert(map->hdr.nentries == 0);
	assert(os_ref_get_count_raw(&map->map_refcnt) == 2);

	map->min_offset = min_off;
	map->max_offset = max_off;

	if (map->holelistenabled) {
		/* keep the single initial hole in sync with the new bounds */
		struct vm_map_links *hole = map->holes_list;

		hole->start = min_off;
#if defined(__arm64__)
		hole->end = max_off;
#else
		hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
#endif
	}
}
1579 
1580 
1581 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1582 vm_map_adjusted_size(vm_map_t map)
1583 {
1584 	const struct vm_reserved_region *regions = NULL;
1585 	size_t num_regions = 0;
1586 	mach_vm_size_t  reserved_size = 0, map_size = 0;
1587 
1588 	if (map == NULL || (map->size == 0)) {
1589 		return 0;
1590 	}
1591 
1592 	map_size = map->size;
1593 
1594 	if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1595 		/*
1596 		 * No special reserved regions or not an exotic map or the task
1597 		 * is terminating and these special regions might have already
1598 		 * been deallocated.
1599 		 */
1600 		return map_size;
1601 	}
1602 
1603 	num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1604 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1605 
1606 	while (num_regions) {
1607 		reserved_size += regions[--num_regions].vmrr_size;
1608 	}
1609 
1610 	/*
1611 	 * There are a few places where the map is being switched out due to
1612 	 * 'termination' without that bit being set (e.g. exec and corpse purging).
1613 	 * In those cases, we could have the map's regions being deallocated on
1614 	 * a core while some accounting process is trying to get the map's size.
1615 	 * So this assert can't be enabled till all those places are uniform in
1616 	 * their use of the 'map->terminated' bit.
1617 	 *
1618 	 * assert(map_size >= reserved_size);
1619 	 */
1620 
1621 	return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1622 }
1623 
1624 /*
1625  *	vm_map_entry_create:	[ internal use only ]
1626  *
1627  *	Allocates a VM map entry for insertion in the
1628  *	given map (or map copy).  No fields are filled.
1629  *
1630  *	The VM entry will be zero initialized, except for:
1631  *	- behavior set to VM_BEHAVIOR_DEFAULT
1632  *	- inheritance set to VM_INHERIT_DEFAULT
1633  */
1634 #define vm_map_entry_create(map)    _vm_map_entry_create(&(map)->hdr)
1635 
1636 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1637 
static vm_map_entry_t
_vm_map_entry_create(
	struct vm_map_header    *map_header __unused)
{
	vm_map_entry_t entry = NULL;

	/* zeroed allocation; may block until an element is available */
	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);

	/*
	 * Help the compiler with what we know to be true,
	 * so that the further bitfields inits have good codegen.
	 *
	 * See rdar://87041299
	 */
	__builtin_assume(entry->vme_object_value == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);

	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
	    "VME_ALIAS_MASK covers tags");

	/* Z_ZERO already cleared everything; only non-zero defaults remain */
	static_assert(VM_BEHAVIOR_DEFAULT == 0,
	    "can skip zeroing of the behavior field");
	entry->inheritance = VM_INHERIT_DEFAULT;

#if MAP_ENTRY_CREATION_DEBUG
	/* record the creating header and a backtrace for debugging */
	entry->vme_creation_maphdr = map_header;
	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
	    BTREF_GET_NOWAIT);
#endif
	return entry;
}
1670 
1671 /*
1672  *	vm_map_entry_dispose:	[ internal use only ]
1673  *
1674  *	Inverse of vm_map_entry_create.
1675  *
1676  *      write map lock held so no need to
1677  *	do anything special to insure correctness
1678  *      of the stores
1679  */
static void
vm_map_entry_dispose(
	vm_map_entry_t          entry)
{
	/* drop any debug backtrace references before freeing the entry */
#if VM_BTLOG_TAGS
	if (entry->vme_kernel_object) {
		btref_put(entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */
#if MAP_ENTRY_CREATION_DEBUG
	btref_put(entry->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(entry->vme_insertion_bt);
#endif
	zfree(vm_map_entry_zone, entry);
}
1697 
1698 #define vm_map_copy_entry_dispose(copy_entry) \
1699 	vm_map_entry_dispose(copy_entry)
1700 
static vm_map_entry_t
vm_map_zap_first_entry(
	vm_map_zap_t            list)
{
	/* head of the zap (deferred-free) list; NULL when empty */
	return list->vmz_head;
}
1707 
static vm_map_entry_t
vm_map_zap_last_entry(
	vm_map_zap_t            list)
{
	/* only valid on a non-empty list */
	assert(vm_map_zap_first_entry(list));
	/* vmz_tail points at the last entry's vme_next field */
	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
}
1715 
static void
vm_map_zap_append(
	vm_map_zap_t            list,
	vm_map_entry_t          entry)
{
	/* append "entry" at the tail of the singly-linked zap list */
	entry->vme_next = VM_MAP_ENTRY_NULL;
	*list->vmz_tail = entry;
	list->vmz_tail = &entry->vme_next;
}
1725 
static vm_map_entry_t
vm_map_zap_pop(
	vm_map_zap_t            list)
{
	vm_map_entry_t head = list->vmz_head;

	/*
	 * Advance the head; if the list just became empty, reset the
	 * tail pointer back to the head field so appends keep working.
	 */
	if (head != VM_MAP_ENTRY_NULL &&
	    (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
		list->vmz_tail = &list->vmz_head;
	}

	return head;
}
1739 
1740 static void
vm_map_zap_dispose(vm_map_zap_t list)1741 vm_map_zap_dispose(
1742 	vm_map_zap_t            list)
1743 {
1744 	vm_map_entry_t          entry;
1745 
1746 	while ((entry = vm_map_zap_pop(list))) {
1747 		if (entry->is_sub_map) {
1748 			vm_map_deallocate(VME_SUBMAP(entry));
1749 		} else {
1750 			vm_object_deallocate(VME_OBJECT(entry));
1751 		}
1752 
1753 		vm_map_entry_dispose(entry);
1754 	}
1755 }
1756 
1757 #if MACH_ASSERT
1758 static boolean_t first_free_check = FALSE;
1759 boolean_t
first_free_is_valid(vm_map_t map)1760 first_free_is_valid(
1761 	vm_map_t        map)
1762 {
1763 	if (!first_free_check) {
1764 		return TRUE;
1765 	}
1766 
1767 	return first_free_is_valid_store( map );
1768 }
1769 #endif /* MACH_ASSERT */
1770 
1771 
1772 #define vm_map_copy_entry_link(copy, after_where, entry)                \
1773 	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))
1774 
1775 #define vm_map_copy_entry_unlink(copy, entry)                           \
1776 	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1777 
1778 /*
1779  *	vm_map_destroy:
1780  *
1781  *	Actually destroy a map.
1782  */
void
vm_map_destroy(
	vm_map_t        map)
{
	/* final cleanup: this is not allowed to fail */
	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;

	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);

	/* flag the map so other code knows its regions are going away */
	map->terminated = true;
	/* clean up regular map entries */
	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
	    KMEM_GUARD_NONE, &zap);
	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
	    KMEM_GUARD_NONE, &zap);

	vm_map_disable_hole_optimization(map);
	vm_map_corpse_footprint_destroy(map);

	vm_map_unlock(map);

	/* free the zapped entries outside the map lock */
	vm_map_zap_dispose(&zap);

	assert(map->hdr.nentries == 0);

	if (map->pmap) {
		pmap_destroy(map->pmap);
	}

	lck_rw_destroy(&map->lock, &vm_map_lck_grp);

#if CONFIG_MAP_RANGES
	kfree_data(map->extra_ranges,
	    map->extra_ranges_count * sizeof(struct vm_map_user_range));
#endif

	zfree_id(ZONE_ID_VM_MAP, map);
}
1824 
1825 /*
1826  * Returns pid of the task with the largest number of VM map entries.
1827  * Used in the zone-map-exhaustion jetsam path.
1828  */
1829 pid_t
find_largest_process_vm_map_entries(void)1830 find_largest_process_vm_map_entries(void)
1831 {
1832 	pid_t victim_pid = -1;
1833 	int max_vm_map_entries = 0;
1834 	task_t task = TASK_NULL;
1835 	queue_head_t *task_list = &tasks;
1836 
1837 	lck_mtx_lock(&tasks_threads_lock);
1838 	queue_iterate(task_list, task, task_t, tasks) {
1839 		if (task == kernel_task || !task->active) {
1840 			continue;
1841 		}
1842 
1843 		vm_map_t task_map = task->map;
1844 		if (task_map != VM_MAP_NULL) {
1845 			int task_vm_map_entries = task_map->hdr.nentries;
1846 			if (task_vm_map_entries > max_vm_map_entries) {
1847 				max_vm_map_entries = task_vm_map_entries;
1848 				victim_pid = pid_from_task(task);
1849 			}
1850 		}
1851 	}
1852 	lck_mtx_unlock(&tasks_threads_lock);
1853 
1854 	printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1855 	return victim_pid;
1856 }
1857 
1858 
1859 /*
1860  *	vm_map_lookup_entry:	[ internal use only ]
1861  *
1862  *	Calls into the vm map store layer to find the map
1863  *	entry containing (or immediately preceding) the
1864  *	specified address in the given map; the entry is returned
1865  *	in the "entry" parameter.  The boolean
1866  *	result indicates whether the address is
1867  *	actually contained in the map.
1868  */
boolean_t
vm_map_lookup_entry(
	vm_map_t        map,
	vm_map_offset_t address,
	vm_map_entry_t  *entry)         /* OUT */
{
	/* normalize tagged/signed kernel pointers to their raw VA */
	if (VM_KERNEL_ADDRESS(address)) {
		address = VM_KERNEL_STRIP_UPTR(address);
	}
#if CONFIG_PROB_GZALLOC
	/* PGZ-guarded addresses must be translated by the caller first */
	if (map->pmap == kernel_pmap) {
		assertf(!pgz_owned(address),
		    "it is the responsibility of callers to unguard PGZ addresses");
	}
#endif /* CONFIG_PROB_GZALLOC */
	/* defer to the map store (RB tree / list) for the actual lookup */
	return vm_map_store_lookup_entry( map, address, entry );
}
1886 
1887 boolean_t
vm_map_lookup_entry_or_next(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)1888 vm_map_lookup_entry_or_next(
1889 	vm_map_t        map,
1890 	vm_map_offset_t address,
1891 	vm_map_entry_t  *entry)         /* OUT */
1892 {
1893 	if (vm_map_lookup_entry(map, address, entry)) {
1894 		return true;
1895 	}
1896 
1897 	*entry = (*entry)->vme_next;
1898 	return false;
1899 }
1900 
#if CONFIG_PROB_GZALLOC
/*
 * Variant of vm_map_lookup_entry() that omits the PGZ-ownership
 * assertion, for callers that may legitimately pass a PGZ-guarded
 * kernel address.
 */
boolean_t
vm_map_lookup_entry_allow_pgz(
	vm_map_t        map,
	vm_map_offset_t address,
	vm_map_entry_t  *entry)         /* OUT */
{
	/* normalize tagged/signed kernel pointers to their raw VA */
	if (VM_KERNEL_ADDRESS(address)) {
		address = VM_KERNEL_STRIP_UPTR(address);
	}
	return vm_map_store_lookup_entry( map, address, entry );
}
#endif /* CONFIG_PROB_GZALLOC */
1914 
1915 /*
1916  *	Routine:	vm_map_range_invalid_panic
1917  *	Purpose:
1918  *			Panic on detection of an invalid range id.
1919  */
__abortlike
static void
vm_map_range_invalid_panic(
	vm_map_t                map,
	vm_map_range_id_t       range_id)
{
	/* __abortlike: this function never returns */
	panic("invalid range ID (%u) for map %p", range_id, map);
}
1928 
1929 /*
1930  *	Routine:	vm_map_get_range
1931  *	Purpose:
1932  *			Adjust bounds based on security policy.
1933  */
static struct mach_vm_range
vm_map_get_range(
	vm_map_t                map,
	vm_map_address_t       *address,
	vm_map_kernel_flags_t  *vmk_flags,
	vm_map_size_t           size,
	bool                   *is_ptr)
{
	/* returned empty (all zeroes) when no range applies (e.g. FIXED) */
	struct mach_vm_range effective_range = {};
	vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;

	if (map == kernel_map) {
		/* kernel_map: range bounds come from the kmem range table */
		effective_range = kmem_ranges[range_id];

		if (startup_phase >= STARTUP_SUB_KMEM) {
			/*
			 * Hint provided by caller is zeroed as the range is restricted to a
			 * subset of the entire kernel_map VA, which could put the hint outside
			 * the range, causing vm_map_store_find_space to fail.
			 */
			*address = 0ull;
			/*
			 * Ensure that range_id passed in by the caller is within meaningful
			 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
			 * to fail as the corresponding range is invalid. Range id larger than
			 * KMEM_RANGE_ID_MAX will lead to an OOB access.
			 */
			if ((range_id == KMEM_RANGE_ID_NONE) ||
			    (range_id > KMEM_RANGE_ID_MAX)) {
				vm_map_range_invalid_panic(map, range_id);
			}

			/*
			 * Pointer ranges use kmem_locate_space to do allocations.
			 *
			 * Non pointer fronts look like [ Small | Large | Permanent ]
			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
			 * use the entire range.
			 */
			if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
				*is_ptr = true;
			} else if (size >= KMEM_SMALLMAP_THRESHOLD) {
				effective_range = kmem_large_ranges[range_id];
			}
		}
#if CONFIG_MAP_RANGES
	} else if (map->uses_user_ranges) {
		/* user maps with ranges enabled: pick the per-map range by id */
		switch (range_id) {
		case UMEM_RANGE_ID_DEFAULT:
			effective_range = map->default_range;
			break;
		case UMEM_RANGE_ID_HEAP:
			effective_range = map->data_range;
			break;
		case UMEM_RANGE_ID_FIXED:
			/*
			 * anywhere allocations with an address in "FIXED"
			 * makes no sense, leave the range empty
			 */
			break;

		default:
			vm_map_range_invalid_panic(map, range_id);
		}
#endif /* CONFIG_MAP_RANGES */
	} else {
		/*
		 * If minimum is 0, bump it up by PAGE_SIZE.  We want to limit
		 * allocations of PAGEZERO to explicit requests since its
		 * normal use is to catch dereferences of NULL and many
		 * applications also treat pointers with a value of 0 as
		 * special and suddenly having address 0 contain useable
		 * memory would tend to confuse those applications.
		 */
		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
		effective_range.max_address = map->max_offset;
	}

	return effective_range;
}
2015 
2016 /*
2017  *	Routine:	vm_map_locate_space
2018  *	Purpose:
2019  *		Finds a range in the specified virtual address map,
2020  *		returning the start of that range,
2021  *		as well as the entry right before it.
2022  */
kern_return_t
vm_map_locate_space(
	vm_map_t                map,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_offset_t        *start_inout,
	vm_map_entry_t         *entry_out)
{
	struct mach_vm_range effective_range = {};
	vm_map_size_t   guard_offset;
	vm_map_offset_t hint, limit;
	vm_map_entry_t  entry;
	bool            is_kmem_ptr_range = false;

	/*
	 * Only supported by vm_map_enter() with a fixed address.
	 */
	assert(!vmk_flags.vmkf_beyond_max);

	if (__improbable(map->wait_for_space)) {
		/*
		 * support for "wait_for_space" is minimal,
		 * its only consumer is the ipc_kernel_copy_map.
		 */
		assert(!map->holelistenabled &&
		    !vmk_flags.vmkf_last_free &&
		    !vmk_flags.vmkf_keep_map_locked &&
		    !vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmf_random_addr &&
		    *start_inout <= map->min_offset);
	} else if (vmk_flags.vmkf_last_free) {
		assert(!vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmf_random_addr);
	}

	if (vmk_flags.vmkf_guard_before) {
		/* carve one leading guard page out of the requested size */
		guard_offset = VM_MAP_PAGE_SIZE(map);
		assert(size > guard_offset);
		size -= guard_offset;
	} else {
		assert(size != 0);
		guard_offset = 0;
	}

	/*
	 * Validate range_id from flags and get associated range
	 */
	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
	    &is_kmem_ptr_range);

	/* kmem pointer ranges have their own allocator */
	if (is_kmem_ptr_range) {
		return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
		           vmk_flags.vmkf_last_free, start_inout, entry_out);
	}

#if XNU_TARGET_OS_OSX
	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
		/* constrain user allocations to the low 4GB of VA */
		assert(map != kernel_map);
		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
	}
#endif /* XNU_TARGET_OS_OSX */

again:
	if (vmk_flags.vmkf_last_free) {
		/* top-down search: scan from the hint down to the range minimum */
		hint = *start_inout;

		if (hint == 0 || hint > effective_range.max_address) {
			hint = effective_range.max_address;
		}
		if (hint <= effective_range.min_address) {
			return KERN_NO_SPACE;
		}
		limit = effective_range.min_address;
	} else {
		/* bottom-up search: scan from the hint up to the range maximum */
		hint = *start_inout;

		if (vmk_flags.vmkf_map_jit) {
			if (map->jit_entry_exists &&
			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
				return KERN_INVALID_ARGUMENT;
			}
			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
				vmk_flags.vmf_random_addr = true;
			}
		}

		if (vmk_flags.vmf_random_addr) {
			kern_return_t kr;

			/* replaces the hint with a randomized candidate address */
			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
			if (kr != KERN_SUCCESS) {
				return kr;
			}
		}
#if __x86_64__
		else if ((hint == 0 || hint == vm_map_min(map)) &&
		    !map->disable_vmentry_reuse &&
		    map->vmmap_high_start != 0) {
			hint = map->vmmap_high_start;
		}
#endif /* __x86_64__ */

		/* clamp the hint into the effective range */
		if (hint < effective_range.min_address) {
			hint = effective_range.min_address;
		}
		if (effective_range.max_address <= hint) {
			return KERN_NO_SPACE;
		}

		limit = effective_range.max_address;
	}
	entry = vm_map_store_find_space(map,
	    hint, limit, vmk_flags.vmkf_last_free,
	    guard_offset, size, mask,
	    start_inout);

	if (__improbable(entry == NULL)) {
		if (map->wait_for_space &&
		    guard_offset + size <=
		    effective_range.max_address - effective_range.min_address) {
			/*
			 * Request could fit in principle: drop the map lock,
			 * block until someone frees space, and retry.
			 * Callers of wait_for_space maps expect this (see the
			 * asserts at the top of this function).
			 */
			assert_wait((event_t)map, THREAD_ABORTSAFE);
			vm_map_unlock(map);
			thread_block(THREAD_CONTINUE_NULL);
			vm_map_lock(map);
			goto again;
		}
		return KERN_NO_SPACE;
	}

	if (entry_out) {
		*entry_out = entry;
	}
	return KERN_SUCCESS;
}
2158 
2159 
2160 /*
2161  *	Routine:	vm_map_find_space
2162  *	Purpose:
2163  *		Allocate a range in the specified virtual address map,
2164  *		returning the entry allocated for that range.
2165  *		Used by kmem_alloc, etc.
2166  *
2167  *		The map must be NOT be locked. It will be returned locked
2168  *		on KERN_SUCCESS, unlocked on failure.
2169  *
2170  *		If an entry is allocated, the object/offset fields
2171  *		are initialized to zero.
2172  */
kern_return_t
vm_map_find_space(
	vm_map_t                map,
	vm_map_offset_t         hint_address,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_entry_t          *o_entry)       /* OUT */
{
	vm_map_entry_t          new_entry, entry;
	kern_return_t           kr;

	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	/* allocate and pre-initialize the entry before taking the map lock */
	new_entry = vm_map_entry_create(map);
	new_entry->use_pmap = true;
	new_entry->protection = VM_PROT_DEFAULT;
	new_entry->max_protection = VM_PROT_ALL;

	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
		new_entry->map_aligned = true;
	}
	if (vmk_flags.vmf_permanent) {
		new_entry->vme_permanent = true;
	}

	vm_map_lock(map);

	/* on success, hint_address holds the chosen start and "entry" precedes it */
	kr = vm_map_locate_space(map, size, mask, vmk_flags,
	    &hint_address, &entry);
	if (kr != KERN_SUCCESS) {
		/* per contract: unlocked on failure */
		vm_map_unlock(map);
		vm_map_entry_dispose(new_entry);
		return kr;
	}
	new_entry->vme_start = hint_address;
	new_entry->vme_end = hint_address + size;

	/*
	 *	At this point,
	 *
	 *	- new_entry's "vme_start" and "vme_end" should define
	 *	  the endpoints of the available new range,
	 *
	 *	- and "entry" should refer to the region before
	 *	  the new range,
	 *
	 *	- and the map should still be locked.
	 */

	assert(page_aligned(new_entry->vme_start));
	assert(page_aligned(new_entry->vme_end));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));

	/*
	 *	Insert the new entry into the list
	 */

	vm_map_store_entry_link(map, entry, new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);
	map->size += size;

	/*
	 *	Update the lookup hint
	 */
	SAVE_HINT_MAP_WRITE(map, new_entry);

	/* per contract: map returned locked on KERN_SUCCESS */
	*o_entry = new_entry;
	return KERN_SUCCESS;
}
2246 
/* debug: when set, vm_map_pmap_enter() logs each page it enters */
int vm_map_pmap_enter_print = FALSE;
/* debug tunable; consulted outside this chunk — not referenced here */
int vm_map_pmap_enter_enable = FALSE;
2249 
2250 /*
2251  *	Routine:	vm_map_pmap_enter [internal only]
2252  *
2253  *	Description:
2254  *		Force pages from the specified object to be entered into
2255  *		the pmap at the specified address if they are present.
2256  *		As soon as a page not found in the object the scan ends.
2257  *
2258  *	Returns:
2259  *		Nothing.
2260  *
2261  *	In/out conditions:
2262  *		The source map should not be locked on entry.
2263  */
__unused static void
vm_map_pmap_enter(
	vm_map_t                map,
	vm_map_offset_t         addr,
	vm_map_offset_t         end_addr,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_prot_t               protection)
{
	int                     type_of_fault;
	kern_return_t           kr;
	uint8_t                 object_lock_type = 0;
	struct vm_object_fault_info fault_info = {};

	/* nothing to enter into if the map has no pmap */
	if (map->pmap == 0) {
		return;
	}

	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);

	/* walk [addr, end_addr) one page at a time */
	while (addr < end_addr) {
		vm_page_t       m;


		/*
		 * TODO:
		 * From vm_map_enter(), we come into this function without the map
		 * lock held or the object lock held.
		 * We haven't taken a reference on the object either.
		 * We should do a proper lookup on the map to make sure
		 * that things are sane before we go locking objects that
		 * could have been deallocated from under us.
		 */

		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
		vm_object_lock(object);

		m = vm_page_lookup(object, offset);

		/* stop at the first page that is missing or not safely mappable */
		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
			vm_object_unlock(object);
			return;
		}

		if (vm_map_pmap_enter_print) {
			printf("vm_map_pmap_enter:");
			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
			    map, (unsigned long long)addr, object, (unsigned long long)offset);
		}
		type_of_fault = DBG_CACHE_HIT_FAULT;
		/* kr intentionally unchecked: this is a best-effort pre-fault */
		kr = vm_fault_enter(m, map->pmap,
		    addr,
		    PAGE_SIZE, 0,
		    protection, protection,
		    VM_PAGE_WIRED(m),
		    FALSE,                 /* change_wiring */
		    VM_KERN_MEMORY_NONE,                 /* tag - not wiring */
		    &fault_info,
		    NULL,                  /* need_retry */
		    &type_of_fault,
		    &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/

		vm_object_unlock(object);

		offset += PAGE_SIZE_64;
		addr += PAGE_SIZE;
	}
}
2333 
#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
/*
 * Pick a random, map-page-aligned address inside the map's effective
 * range that fronts a hole big enough for "size" bytes.  Gives up with
 * KERN_NO_SPACE after MAX_TRIES_TO_GET_RANDOM_ADDRESS attempts.
 */
static kern_return_t
vm_map_random_address_for_size(
	vm_map_t                map,
	vm_map_offset_t        *address,
	vm_map_size_t           size,
	vm_map_kernel_flags_t   vmk_flags)
{
	kern_return_t   kr = KERN_SUCCESS;
	int             tries = 0;
	vm_map_offset_t random_addr = 0;
	vm_map_offset_t hole_end;

	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
	vm_map_size_t   vm_hole_size = 0;
	vm_map_size_t   addr_space_size;
	bool            is_kmem_ptr;
	struct mach_vm_range effective_range;

	effective_range = vm_map_get_range(map, address, &vmk_flags, size,
	    &is_kmem_ptr);

	/* shrink the draw space so that addr + size always fits in range */
	addr_space_size = effective_range.max_address - effective_range.min_address;
	if (size >= addr_space_size) {
		return KERN_NO_SPACE;
	}
	addr_space_size -= size;

	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));

	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		/* early_random() until the zalloc subsystem is up */
		if (startup_phase < STARTUP_SUB_ZALLOC) {
			random_addr = (vm_map_offset_t)early_random();
		} else {
			random_addr = (vm_map_offset_t)random();
		}
		/* scale to page granularity, then fold into the range */
		random_addr <<= VM_MAP_PAGE_SHIFT(map);
		random_addr = vm_map_trunc_page(
			effective_range.min_address + (random_addr % addr_space_size),
			VM_MAP_PAGE_MASK(map));

#if CONFIG_PROB_GZALLOC
		/* re-draw rather than land inside a PGZ-guarded region */
		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
			continue;
		}
#endif /* CONFIG_PROB_GZALLOC */

		/* FALSE means random_addr falls in a hole; measure that hole */
		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
			if (prev_entry == vm_map_to_entry(map)) {
				next_entry = vm_map_first_entry(map);
			} else {
				next_entry = prev_entry->vme_next;
			}
			if (next_entry == vm_map_to_entry(map)) {
				hole_end = vm_map_max(map);
			} else {
				hole_end = next_entry->vme_start;
			}
			vm_hole_size = hole_end - random_addr;
			if (vm_hole_size >= size) {
				/* hole is big enough: accept this address */
				*address = random_addr;
				break;
			}
		}
		tries++;
	}

	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		kr = KERN_NO_SPACE;
	}
	return kr;
}
2407 
2408 static boolean_t
vm_memory_malloc_no_cow(int alias)2409 vm_memory_malloc_no_cow(
2410 	int alias)
2411 {
2412 	uint64_t alias_mask;
2413 
2414 	if (!malloc_no_cow) {
2415 		return FALSE;
2416 	}
2417 	if (alias > 63) {
2418 		return FALSE;
2419 	}
2420 	alias_mask = 1ULL << alias;
2421 	if (alias_mask & vm_memory_malloc_no_cow_mask) {
2422 		return TRUE;
2423 	}
2424 	return FALSE;
2425 }
2426 
/* counters; names suggest RLIMIT_AS/RLIMIT_DATA rejections — incremented outside this chunk */
uint64_t vm_map_enter_RLIMIT_AS_count = 0;
uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
/*
 *	Routine:	vm_map_enter
 *
 *	Description:
 *		Allocate a range in the specified virtual address map.
 *		The resulting range will refer to memory defined by
 *		the given memory object and offset into that object.
 *
 *		Arguments are as defined in the vm_map call.
 */
/* stats for the overwrite path: restoring zapped mappings after a failure */
static unsigned int vm_map_enter_restore_successes = 0;
static unsigned int vm_map_enter_restore_failures = 0;
2441 kern_return_t
vm_map_enter(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)2442 vm_map_enter(
2443 	vm_map_t                map,
2444 	vm_map_offset_t         *address,       /* IN/OUT */
2445 	vm_map_size_t           size,
2446 	vm_map_offset_t         mask,
2447 	vm_map_kernel_flags_t   vmk_flags,
2448 	vm_object_t             object,
2449 	vm_object_offset_t      offset,
2450 	boolean_t               needs_copy,
2451 	vm_prot_t               cur_protection,
2452 	vm_prot_t               max_protection,
2453 	vm_inherit_t            inheritance)
2454 {
2455 	vm_map_entry_t          entry, new_entry;
2456 	vm_map_offset_t         start, tmp_start, tmp_offset;
2457 	vm_map_offset_t         end, tmp_end;
2458 	vm_map_offset_t         tmp2_start, tmp2_end;
2459 	vm_map_offset_t         step;
2460 	kern_return_t           result = KERN_SUCCESS;
2461 	bool                    map_locked = FALSE;
2462 	bool                    pmap_empty = TRUE;
2463 	bool                    new_mapping_established = FALSE;
2464 	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2465 	const bool              anywhere = !vmk_flags.vmf_fixed;
2466 	const bool              purgable = vmk_flags.vmf_purgeable;
2467 	const bool              overwrite = vmk_flags.vmf_overwrite;
2468 	const bool              no_cache = vmk_flags.vmf_no_cache;
2469 	const bool              is_submap = vmk_flags.vmkf_submap;
2470 	const bool              permanent = vmk_flags.vmf_permanent;
2471 	const bool              no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2472 	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
2473 	const bool              iokit_acct = vmk_flags.vmkf_iokit_acct;
2474 	const bool              resilient_codesign = vmk_flags.vmf_resilient_codesign;
2475 	const bool              resilient_media = vmk_flags.vmf_resilient_media;
2476 	const bool              entry_for_tpro = vmk_flags.vmf_tpro;
2477 	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
2478 	const vm_tag_t          alias = vmk_flags.vm_tag;
2479 	vm_tag_t                user_alias;
2480 	kern_return_t           kr;
2481 	bool                    clear_map_aligned = FALSE;
2482 	vm_map_size_t           chunk_size = 0;
2483 	vm_object_t             caller_object;
2484 	VM_MAP_ZAP_DECLARE(zap_old_list);
2485 	VM_MAP_ZAP_DECLARE(zap_new_list);
2486 
2487 	caller_object = object;
2488 
2489 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2490 
2491 	if (vmk_flags.vmf_4gb_chunk) {
2492 #if defined(__LP64__)
2493 		chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2494 #else /* __LP64__ */
2495 		chunk_size = ANON_CHUNK_SIZE;
2496 #endif /* __LP64__ */
2497 	} else {
2498 		chunk_size = ANON_CHUNK_SIZE;
2499 	}
2500 
2501 
2502 
2503 	if (superpage_size) {
2504 		switch (superpage_size) {
2505 			/*
2506 			 * Note that the current implementation only supports
2507 			 * a single size for superpages, SUPERPAGE_SIZE, per
2508 			 * architecture. As soon as more sizes are supposed
2509 			 * to be supported, SUPERPAGE_SIZE has to be replaced
2510 			 * with a lookup of the size depending on superpage_size.
2511 			 */
2512 #ifdef __x86_64__
2513 		case SUPERPAGE_SIZE_ANY:
2514 			/* handle it like 2 MB and round up to page size */
2515 			size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2516 			OS_FALLTHROUGH;
2517 		case SUPERPAGE_SIZE_2MB:
2518 			break;
2519 #endif
2520 		default:
2521 			return KERN_INVALID_ARGUMENT;
2522 		}
2523 		mask = SUPERPAGE_SIZE - 1;
2524 		if (size & (SUPERPAGE_SIZE - 1)) {
2525 			return KERN_INVALID_ARGUMENT;
2526 		}
2527 		inheritance = VM_INHERIT_NONE;  /* fork() children won't inherit superpages */
2528 	}
2529 
2530 
2531 	if ((cur_protection & VM_PROT_WRITE) &&
2532 	    (cur_protection & VM_PROT_EXECUTE) &&
2533 #if XNU_TARGET_OS_OSX
2534 	    map->pmap != kernel_pmap &&
2535 	    (cs_process_global_enforcement() ||
2536 	    (vmk_flags.vmkf_cs_enforcement_override
2537 	    ? vmk_flags.vmkf_cs_enforcement
2538 	    : (vm_map_cs_enforcement(map)
2539 #if __arm64__
2540 	    || !VM_MAP_IS_EXOTIC(map)
2541 #endif /* __arm64__ */
2542 	    ))) &&
2543 #endif /* XNU_TARGET_OS_OSX */
2544 #if CODE_SIGNING_MONITOR
2545 	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2546 #endif
2547 	    (VM_MAP_POLICY_WX_FAIL(map) ||
2548 	    VM_MAP_POLICY_WX_STRIP_X(map)) &&
2549 	    !entry_for_jit) {
2550 		boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2551 
2552 		DTRACE_VM3(cs_wx,
2553 		    uint64_t, 0,
2554 		    uint64_t, 0,
2555 		    vm_prot_t, cur_protection);
2556 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2557 		    proc_selfpid(),
2558 		    (get_bsdtask_info(current_task())
2559 		    ? proc_name_address(get_bsdtask_info(current_task()))
2560 		    : "?"),
2561 		    __FUNCTION__,
2562 		    (vm_protect_wx_fail ? "failing" : "turning off execute"));
2563 		cur_protection &= ~VM_PROT_EXECUTE;
2564 		if (vm_protect_wx_fail) {
2565 			return KERN_PROTECTION_FAILURE;
2566 		}
2567 	}
2568 
2569 	/*
2570 	 * If the task has requested executable lockdown,
2571 	 * deny any new executable mapping.
2572 	 */
2573 	if (map->map_disallow_new_exec == TRUE) {
2574 		if (cur_protection & VM_PROT_EXECUTE) {
2575 			return KERN_PROTECTION_FAILURE;
2576 		}
2577 	}
2578 
2579 	if (resilient_codesign) {
2580 		assert(!is_submap);
2581 		int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2582 		if ((cur_protection | max_protection) & reject_prot) {
2583 			return KERN_PROTECTION_FAILURE;
2584 		}
2585 	}
2586 
2587 	if (resilient_media) {
2588 		assert(!is_submap);
2589 //		assert(!needs_copy);
2590 		if (object != VM_OBJECT_NULL &&
2591 		    !object->internal) {
2592 			/*
2593 			 * This mapping is directly backed by an external
2594 			 * memory manager (e.g. a vnode pager for a file):
2595 			 * we would not have any safe place to inject
2596 			 * a zero-filled page if an actual page is not
2597 			 * available, without possibly impacting the actual
2598 			 * contents of the mapped object (e.g. the file),
2599 			 * so we can't provide any media resiliency here.
2600 			 */
2601 			return KERN_INVALID_ARGUMENT;
2602 		}
2603 	}
2604 
2605 	if (entry_for_tpro) {
2606 		/*
2607 		 * TPRO overrides the effective permissions of the region
2608 		 * and explicitly maps as RW. Ensure we have been passed
2609 		 * the expected permissions. We accept `cur_protections`
2610 		 * RO as that will be handled on fault.
2611 		 */
2612 		if (!(max_protection & VM_PROT_READ) ||
2613 		    !(max_protection & VM_PROT_WRITE) ||
2614 		    !(cur_protection & VM_PROT_READ)) {
2615 			return KERN_PROTECTION_FAILURE;
2616 		}
2617 
2618 		/*
2619 		 * We can now downgrade the cur_protection to RO. This is a mild lie
2620 		 * to the VM layer. But TPRO will be responsible for toggling the
2621 		 * protections between RO/RW
2622 		 */
2623 		cur_protection = VM_PROT_READ;
2624 	}
2625 
2626 	if (is_submap) {
2627 		vm_map_t submap;
2628 		if (purgable) {
2629 			/* submaps can not be purgeable */
2630 			return KERN_INVALID_ARGUMENT;
2631 		}
2632 		if (object == VM_OBJECT_NULL) {
2633 			/* submaps can not be created lazily */
2634 			return KERN_INVALID_ARGUMENT;
2635 		}
2636 		submap = (vm_map_t) object;
2637 		if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
2638 			/* page size mismatch */
2639 			return KERN_INVALID_ARGUMENT;
2640 		}
2641 	}
2642 	if (vmk_flags.vmkf_already) {
2643 		/*
2644 		 * VM_FLAGS_ALREADY says that it's OK if the same mapping
2645 		 * is already present.  For it to be meaningul, the requested
2646 		 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
2647 		 * we shouldn't try and remove what was mapped there first
2648 		 * (!VM_FLAGS_OVERWRITE).
2649 		 */
2650 		if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
2651 			return KERN_INVALID_ARGUMENT;
2652 		}
2653 	}
2654 
2655 	if (size == 0 ||
2656 	    (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
2657 		*address = 0;
2658 		return KERN_INVALID_ARGUMENT;
2659 	}
2660 
2661 	if (map->pmap == kernel_pmap) {
2662 		user_alias = VM_KERN_MEMORY_NONE;
2663 	} else {
2664 		user_alias = alias;
2665 	}
2666 
2667 	if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
2668 		chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
2669 	}
2670 
2671 #define RETURN(value)   { result = value; goto BailOut; }
2672 
2673 	assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
2674 	assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
2675 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
2676 		assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
2677 		assertf(page_aligned(size), "0x%llx", (uint64_t)size);
2678 	}
2679 
2680 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2681 	    !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
2682 		/*
2683 		 * In most cases, the caller rounds the size up to the
2684 		 * map's page size.
2685 		 * If we get a size that is explicitly not map-aligned here,
2686 		 * we'll have to respect the caller's wish and mark the
2687 		 * mapping as "not map-aligned" to avoid tripping the
2688 		 * map alignment checks later.
2689 		 */
2690 		clear_map_aligned = TRUE;
2691 	}
2692 	if (!anywhere &&
2693 	    VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2694 	    !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
2695 		/*
2696 		 * We've been asked to map at a fixed address and that
2697 		 * address is not aligned to the map's specific alignment.
2698 		 * The caller should know what it's doing (i.e. most likely
2699 		 * mapping some fragmented copy map, transferring memory from
2700 		 * a VM map with a different alignment), so clear map_aligned
2701 		 * for this new VM map entry and proceed.
2702 		 */
2703 		clear_map_aligned = TRUE;
2704 	}
2705 
2706 	/*
2707 	 * Only zero-fill objects are allowed to be purgable.
2708 	 * LP64todo - limit purgable objects to 32-bits for now
2709 	 */
2710 	if (purgable &&
2711 	    (offset != 0 ||
2712 	    (object != VM_OBJECT_NULL &&
2713 	    (object->vo_size != size ||
2714 	    object->purgable == VM_PURGABLE_DENY))
2715 #if __LP64__
2716 	    || size > ANON_MAX_SIZE
2717 #endif
2718 	    )) {
2719 		return KERN_INVALID_ARGUMENT;
2720 	}
2721 
2722 	start = *address;
2723 
2724 	if (anywhere) {
2725 		vm_map_lock(map);
2726 		map_locked = TRUE;
2727 
2728 		result = vm_map_locate_space(map, size, mask, vmk_flags,
2729 		    &start, &entry);
2730 		if (result != KERN_SUCCESS) {
2731 			goto BailOut;
2732 		}
2733 
2734 		*address = start;
2735 		end = start + size;
2736 		assert(VM_MAP_PAGE_ALIGNED(*address,
2737 		    VM_MAP_PAGE_MASK(map)));
2738 	} else {
2739 		vm_map_offset_t effective_min_offset, effective_max_offset;
2740 
2741 		effective_min_offset = map->min_offset;
2742 		effective_max_offset = map->max_offset;
2743 
2744 		if (vmk_flags.vmkf_beyond_max) {
2745 			/*
2746 			 * Allow an insertion beyond the map's max offset.
2747 			 */
2748 			effective_max_offset = 0x00000000FFFFF000ULL;
2749 			if (vm_map_is_64bit(map)) {
2750 				effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2751 			}
2752 #if XNU_TARGET_OS_OSX
2753 		} else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2754 			effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2755 #endif /* XNU_TARGET_OS_OSX */
2756 		}
2757 
2758 		if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2759 		    !overwrite &&
2760 		    user_alias == VM_MEMORY_REALLOC) {
2761 			/*
2762 			 * Force realloc() to switch to a new allocation,
2763 			 * to prevent 4k-fragmented virtual ranges.
2764 			 */
2765 //			DEBUG4K_ERROR("no realloc in place");
2766 			return KERN_NO_SPACE;
2767 		}
2768 
2769 		/*
2770 		 *	Verify that:
2771 		 *		the address doesn't itself violate
2772 		 *		the mask requirement.
2773 		 */
2774 
2775 		vm_map_lock(map);
2776 		map_locked = TRUE;
2777 		if ((start & mask) != 0) {
2778 			RETURN(KERN_NO_SPACE);
2779 		}
2780 
2781 #if CONFIG_MAP_RANGES
2782 		if (map->uses_user_ranges) {
2783 			struct mach_vm_range r;
2784 
2785 			vm_map_user_range_resolve(map, start, 1, &r);
2786 			if (r.max_address == 0) {
2787 				RETURN(KERN_INVALID_ADDRESS);
2788 			}
2789 			effective_min_offset = r.min_address;
2790 			effective_max_offset = r.max_address;
2791 		}
2792 #endif /* CONFIG_MAP_RANGES */
2793 
2794 		if ((startup_phase >= STARTUP_SUB_KMEM) && !is_submap &&
2795 		    (map == kernel_map)) {
2796 			mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2797 			effective_min_offset = r->min_address;
2798 			effective_max_offset = r->max_address;
2799 		}
2800 
2801 		/*
2802 		 *	...	the address is within bounds
2803 		 */
2804 
2805 		end = start + size;
2806 
2807 		if ((start < effective_min_offset) ||
2808 		    (end > effective_max_offset) ||
2809 		    (start >= end)) {
2810 			RETURN(KERN_INVALID_ADDRESS);
2811 		}
2812 
2813 		if (overwrite) {
2814 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
2815 			kern_return_t remove_kr;
2816 
2817 			/*
2818 			 * Fixed mapping and "overwrite" flag: attempt to
2819 			 * remove all existing mappings in the specified
2820 			 * address range, saving them in our "zap_old_list".
2821 			 *
2822 			 * This avoids releasing the VM map lock in
2823 			 * vm_map_entry_delete() and allows atomicity
2824 			 * when we want to replace some mappings with a new one.
2825 			 * It also allows us to restore the old VM mappings if the
2826 			 * new mapping fails.
2827 			 */
2828 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2829 
2830 			if (vmk_flags.vmkf_overwrite_immutable) {
2831 				/* we can overwrite immutable mappings */
2832 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2833 			}
2834 			if (vmk_flags.vmkf_remap_prot_copy) {
2835 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2836 			}
2837 			remove_kr = vm_map_delete(map, start, end, remove_flags,
2838 			    KMEM_GUARD_NONE, &zap_old_list).kmr_return;
2839 			if (remove_kr) {
2840 				/* XXX FBDP restore zap_old_list? */
2841 				RETURN(remove_kr);
2842 			}
2843 		}
2844 
2845 		/*
2846 		 *	...	the starting address isn't allocated
2847 		 */
2848 
2849 		if (vm_map_lookup_entry(map, start, &entry)) {
2850 			if (!(vmk_flags.vmkf_already)) {
2851 				RETURN(KERN_NO_SPACE);
2852 			}
2853 			/*
2854 			 * Check if what's already there is what we want.
2855 			 */
2856 			tmp_start = start;
2857 			tmp_offset = offset;
2858 			if (entry->vme_start < start) {
2859 				tmp_start -= start - entry->vme_start;
2860 				tmp_offset -= start - entry->vme_start;
2861 			}
2862 			for (; entry->vme_start < end;
2863 			    entry = entry->vme_next) {
2864 				/*
2865 				 * Check if the mapping's attributes
2866 				 * match the existing map entry.
2867 				 */
2868 				if (entry == vm_map_to_entry(map) ||
2869 				    entry->vme_start != tmp_start ||
2870 				    entry->is_sub_map != is_submap ||
2871 				    VME_OFFSET(entry) != tmp_offset ||
2872 				    entry->needs_copy != needs_copy ||
2873 				    entry->protection != cur_protection ||
2874 				    entry->max_protection != max_protection ||
2875 				    entry->inheritance != inheritance ||
2876 				    entry->iokit_acct != iokit_acct ||
2877 				    VME_ALIAS(entry) != alias) {
2878 					/* not the same mapping ! */
2879 					RETURN(KERN_NO_SPACE);
2880 				}
2881 				/*
2882 				 * Check if the same object is being mapped.
2883 				 */
2884 				if (is_submap) {
2885 					if (VME_SUBMAP(entry) !=
2886 					    (vm_map_t) object) {
2887 						/* not the same submap */
2888 						RETURN(KERN_NO_SPACE);
2889 					}
2890 				} else {
2891 					if (VME_OBJECT(entry) != object) {
2892 						/* not the same VM object... */
2893 						vm_object_t obj2;
2894 
2895 						obj2 = VME_OBJECT(entry);
2896 						if ((obj2 == VM_OBJECT_NULL ||
2897 						    obj2->internal) &&
2898 						    (object == VM_OBJECT_NULL ||
2899 						    object->internal)) {
2900 							/*
2901 							 * ... but both are
2902 							 * anonymous memory,
2903 							 * so equivalent.
2904 							 */
2905 						} else {
2906 							RETURN(KERN_NO_SPACE);
2907 						}
2908 					}
2909 				}
2910 
2911 				tmp_offset += entry->vme_end - entry->vme_start;
2912 				tmp_start += entry->vme_end - entry->vme_start;
2913 				if (entry->vme_end >= end) {
2914 					/* reached the end of our mapping */
2915 					break;
2916 				}
2917 			}
2918 			/* it all matches:  let's use what's already there ! */
2919 			RETURN(KERN_MEMORY_PRESENT);
2920 		}
2921 
2922 		/*
2923 		 *	...	the next region doesn't overlap the
2924 		 *		end point.
2925 		 */
2926 
2927 		if ((entry->vme_next != vm_map_to_entry(map)) &&
2928 		    (entry->vme_next->vme_start < end)) {
2929 			RETURN(KERN_NO_SPACE);
2930 		}
2931 	}
2932 
2933 	/*
2934 	 *	At this point,
2935 	 *		"start" and "end" should define the endpoints of the
2936 	 *			available new range, and
2937 	 *		"entry" should refer to the region before the new
2938 	 *			range, and
2939 	 *
2940 	 *		the map should be locked.
2941 	 */
2942 
2943 	/*
2944 	 *	See whether we can avoid creating a new entry (and object) by
2945 	 *	extending one of our neighbors.  [So far, we only attempt to
2946 	 *	extend from below.]  Note that we can never extend/join
2947 	 *	purgable objects because they need to remain distinct
2948 	 *	entities in order to implement their "volatile object"
2949 	 *	semantics.
2950 	 */
2951 
2952 	if (purgable ||
2953 	    entry_for_jit ||
2954 	    entry_for_tpro ||
2955 	    vm_memory_malloc_no_cow(user_alias)) {
2956 		if (object == VM_OBJECT_NULL) {
2957 			object = vm_object_allocate(size);
2958 			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2959 			object->true_share = FALSE;
2960 			if (malloc_no_cow_except_fork &&
2961 			    !purgable &&
2962 			    !entry_for_jit &&
2963 			    !entry_for_tpro &&
2964 			    vm_memory_malloc_no_cow(user_alias)) {
2965 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
2966 				object->true_share = TRUE;
2967 			}
2968 			if (purgable) {
2969 				task_t owner;
2970 				object->purgable = VM_PURGABLE_NONVOLATILE;
2971 				if (map->pmap == kernel_pmap) {
2972 					/*
2973 					 * Purgeable mappings made in a kernel
2974 					 * map are "owned" by the kernel itself
2975 					 * rather than the current user task
2976 					 * because they're likely to be used by
2977 					 * more than this user task (see
2978 					 * execargs_purgeable_allocate(), for
2979 					 * example).
2980 					 */
2981 					owner = kernel_task;
2982 				} else {
2983 					owner = current_task();
2984 				}
2985 				assert(object->vo_owner == NULL);
2986 				assert(object->resident_page_count == 0);
2987 				assert(object->wired_page_count == 0);
2988 				vm_object_lock(object);
2989 				vm_purgeable_nonvolatile_enqueue(object, owner);
2990 				vm_object_unlock(object);
2991 			}
2992 			offset = (vm_object_offset_t)0;
2993 		}
2994 	} else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
2995 		/* no coalescing if address space uses sub-pages */
2996 	} else if ((is_submap == FALSE) &&
2997 	    (object == VM_OBJECT_NULL) &&
2998 	    (entry != vm_map_to_entry(map)) &&
2999 	    (entry->vme_end == start) &&
3000 	    (!entry->is_shared) &&
3001 	    (!entry->is_sub_map) &&
3002 	    (!entry->in_transition) &&
3003 	    (!entry->needs_wakeup) &&
3004 	    (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3005 	    (entry->protection == cur_protection) &&
3006 	    (entry->max_protection == max_protection) &&
3007 	    (entry->inheritance == inheritance) &&
3008 	    ((user_alias == VM_MEMORY_REALLOC) ||
3009 	    (VME_ALIAS(entry) == alias)) &&
3010 	    (entry->no_cache == no_cache) &&
3011 	    (entry->vme_permanent == permanent) &&
3012 	    /* no coalescing for immutable executable mappings */
3013 	    !((entry->protection & VM_PROT_EXECUTE) &&
3014 	    entry->vme_permanent) &&
3015 	    (!entry->superpage_size && !superpage_size) &&
3016 	    /*
3017 	     * No coalescing if not map-aligned, to avoid propagating
3018 	     * that condition any further than needed:
3019 	     */
3020 	    (!entry->map_aligned || !clear_map_aligned) &&
3021 	    (!entry->zero_wired_pages) &&
3022 	    (!entry->used_for_jit && !entry_for_jit) &&
3023 #if __arm64e__
3024 	    (!entry->used_for_tpro && !entry_for_tpro) &&
3025 #endif
3026 	    (!entry->csm_associated) &&
3027 	    (entry->iokit_acct == iokit_acct) &&
3028 	    (!entry->vme_resilient_codesign) &&
3029 	    (!entry->vme_resilient_media) &&
3030 	    (!entry->vme_atomic) &&
3031 	    (entry->vme_no_copy_on_read == no_copy_on_read) &&
3032 
3033 	    ((entry->vme_end - entry->vme_start) + size <=
3034 	    (user_alias == VM_MEMORY_REALLOC ?
3035 	    ANON_CHUNK_SIZE :
3036 	    NO_COALESCE_LIMIT)) &&
3037 
3038 	    (entry->wired_count == 0)) {        /* implies user_wired_count == 0 */
3039 		if (vm_object_coalesce(VME_OBJECT(entry),
3040 		    VM_OBJECT_NULL,
3041 		    VME_OFFSET(entry),
3042 		    (vm_object_offset_t) 0,
3043 		    (vm_map_size_t)(entry->vme_end - entry->vme_start),
3044 		    (vm_map_size_t)(end - entry->vme_end))) {
3045 			/*
3046 			 *	Coalesced the two objects - can extend
3047 			 *	the previous map entry to include the
3048 			 *	new range.
3049 			 */
3050 			map->size += (end - entry->vme_end);
3051 			assert(entry->vme_start < end);
3052 			assert(VM_MAP_PAGE_ALIGNED(end,
3053 			    VM_MAP_PAGE_MASK(map)));
3054 			if (__improbable(vm_debug_events)) {
3055 				DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3056 			}
3057 			entry->vme_end = end;
3058 			if (map->holelistenabled) {
3059 				vm_map_store_update_first_free(map, entry, TRUE);
3060 			} else {
3061 				vm_map_store_update_first_free(map, map->first_free, TRUE);
3062 			}
3063 			new_mapping_established = TRUE;
3064 			RETURN(KERN_SUCCESS);
3065 		}
3066 	}
3067 
3068 	step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3069 	new_entry = NULL;
3070 
3071 	if (vmk_flags.vmkf_submap_adjust) {
3072 		vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3073 		offset = start;
3074 	}
3075 
3076 	for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3077 		tmp2_end = tmp2_start + step;
3078 		/*
3079 		 *	Create a new entry
3080 		 *
3081 		 * XXX FBDP
3082 		 * The reserved "page zero" in each process's address space can
3083 		 * be arbitrarily large.  Splitting it into separate objects and
3084 		 * therefore different VM map entries serves no purpose and just
3085 		 * slows down operations on the VM map, so let's not split the
3086 		 * allocation into chunks if the max protection is NONE.  That
3087 		 * memory should never be accessible, so it will never get to the
3088 		 * default pager.
3089 		 */
3090 		tmp_start = tmp2_start;
3091 		if (!is_submap &&
3092 		    object == VM_OBJECT_NULL &&
3093 		    size > chunk_size &&
3094 		    max_protection != VM_PROT_NONE &&
3095 		    superpage_size == 0) {
3096 			tmp_end = tmp_start + chunk_size;
3097 		} else {
3098 			tmp_end = tmp2_end;
3099 		}
3100 		do {
3101 			if (!is_submap &&
3102 			    object != VM_OBJECT_NULL &&
3103 			    object->internal &&
3104 			    offset + (tmp_end - tmp_start) > object->vo_size) {
3105 //				printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3106 				DTRACE_VM5(vm_map_enter_overmap,
3107 				    vm_map_t, map,
3108 				    vm_map_address_t, tmp_start,
3109 				    vm_map_address_t, tmp_end,
3110 				    vm_object_offset_t, offset,
3111 				    vm_object_size_t, object->vo_size);
3112 			}
3113 			new_entry = vm_map_entry_insert(map,
3114 			    entry, tmp_start, tmp_end,
3115 			    object, offset, vmk_flags,
3116 			    needs_copy,
3117 			    cur_protection, max_protection,
3118 			    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3119 			    VM_INHERIT_NONE : inheritance),
3120 			    clear_map_aligned);
3121 
3122 			assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3123 
3124 			if (resilient_codesign) {
3125 				int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3126 				if (!((cur_protection | max_protection) & reject_prot)) {
3127 					new_entry->vme_resilient_codesign = TRUE;
3128 				}
3129 			}
3130 
3131 			if (resilient_media &&
3132 			    (object == VM_OBJECT_NULL ||
3133 			    object->internal)) {
3134 				new_entry->vme_resilient_media = TRUE;
3135 			}
3136 
3137 			assert(!new_entry->iokit_acct);
3138 			if (!is_submap &&
3139 			    object != VM_OBJECT_NULL &&
3140 			    (object->purgable != VM_PURGABLE_DENY ||
3141 			    object->vo_ledger_tag)) {
3142 				assert(new_entry->use_pmap);
3143 				assert(!new_entry->iokit_acct);
3144 				/*
3145 				 * Turn off pmap accounting since
3146 				 * purgeable (or tagged) objects have their
3147 				 * own ledgers.
3148 				 */
3149 				new_entry->use_pmap = FALSE;
3150 			} else if (!is_submap &&
3151 			    iokit_acct &&
3152 			    object != VM_OBJECT_NULL &&
3153 			    object->internal) {
3154 				/* alternate accounting */
3155 				assert(!new_entry->iokit_acct);
3156 				assert(new_entry->use_pmap);
3157 				new_entry->iokit_acct = TRUE;
3158 				new_entry->use_pmap = FALSE;
3159 				DTRACE_VM4(
3160 					vm_map_iokit_mapped_region,
3161 					vm_map_t, map,
3162 					vm_map_offset_t, new_entry->vme_start,
3163 					vm_map_offset_t, new_entry->vme_end,
3164 					int, VME_ALIAS(new_entry));
3165 				vm_map_iokit_mapped_region(
3166 					map,
3167 					(new_entry->vme_end -
3168 					new_entry->vme_start));
3169 			} else if (!is_submap) {
3170 				assert(!new_entry->iokit_acct);
3171 				assert(new_entry->use_pmap);
3172 			}
3173 
3174 			if (is_submap) {
3175 				vm_map_t        submap;
3176 				boolean_t       submap_is_64bit;
3177 				boolean_t       use_pmap;
3178 
3179 				assert(new_entry->is_sub_map);
3180 				assert(!new_entry->use_pmap);
3181 				assert(!new_entry->iokit_acct);
3182 				submap = (vm_map_t) object;
3183 				submap_is_64bit = vm_map_is_64bit(submap);
3184 				use_pmap = vmk_flags.vmkf_nested_pmap;
3185 #ifndef NO_NESTED_PMAP
3186 				if (use_pmap && submap->pmap == NULL) {
3187 					ledger_t ledger = map->pmap->ledger;
3188 					/* we need a sub pmap to nest... */
3189 					submap->pmap = pmap_create_options(ledger, 0,
3190 					    submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3191 					if (submap->pmap == NULL) {
3192 						/* let's proceed without nesting... */
3193 					}
3194 #if defined(__arm64__)
3195 					else {
3196 						pmap_set_nested(submap->pmap);
3197 					}
3198 #endif
3199 				}
3200 				if (use_pmap && submap->pmap != NULL) {
3201 					if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3202 						DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3203 						kr = KERN_FAILURE;
3204 					} else {
3205 						kr = pmap_nest(map->pmap,
3206 						    submap->pmap,
3207 						    tmp_start,
3208 						    tmp_end - tmp_start);
3209 					}
3210 					if (kr != KERN_SUCCESS) {
3211 						printf("vm_map_enter: "
3212 						    "pmap_nest(0x%llx,0x%llx) "
3213 						    "error 0x%x\n",
3214 						    (long long)tmp_start,
3215 						    (long long)tmp_end,
3216 						    kr);
3217 					} else {
3218 						/* we're now nested ! */
3219 						new_entry->use_pmap = TRUE;
3220 						pmap_empty = FALSE;
3221 					}
3222 				}
3223 #endif /* NO_NESTED_PMAP */
3224 			}
3225 			entry = new_entry;
3226 
3227 			if (superpage_size) {
3228 				vm_page_t pages, m;
3229 				vm_object_t sp_object;
3230 				vm_object_offset_t sp_offset;
3231 
3232 				VME_OFFSET_SET(entry, 0);
3233 
3234 				/* allocate one superpage */
3235 				kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3236 				if (kr != KERN_SUCCESS) {
3237 					/* deallocate whole range... */
3238 					new_mapping_established = TRUE;
3239 					/* ... but only up to "tmp_end" */
3240 					size -= end - tmp_end;
3241 					RETURN(kr);
3242 				}
3243 
3244 				/* create one vm_object per superpage */
3245 				sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3246 				sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3247 				sp_object->phys_contiguous = TRUE;
3248 				sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3249 				VME_OBJECT_SET(entry, sp_object, false, 0);
3250 				assert(entry->use_pmap);
3251 
3252 				/* enter the base pages into the object */
3253 				vm_object_lock(sp_object);
3254 				for (sp_offset = 0;
3255 				    sp_offset < SUPERPAGE_SIZE;
3256 				    sp_offset += PAGE_SIZE) {
3257 					m = pages;
3258 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3259 					pages = NEXT_PAGE(m);
3260 					*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3261 					vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3262 				}
3263 				vm_object_unlock(sp_object);
3264 			}
3265 		} while (tmp_end != tmp2_end &&
3266 		    (tmp_start = tmp_end) &&
3267 		    (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3268 		    tmp_end + chunk_size : tmp2_end));
3269 	}
3270 
3271 	new_mapping_established = TRUE;
3272 
3273 BailOut:
3274 	assert(map_locked == TRUE);
3275 
3276 	/*
3277 	 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3278 	 * If we have identified and possibly established the new mapping(s),
3279 	 * make sure we did not go beyond the address space limit.
3280 	 */
3281 	if (result == KERN_SUCCESS) {
3282 		if (map->size_limit != RLIM_INFINITY &&
3283 		    map->size > map->size_limit) {
3284 			/*
3285 			 * Establishing the requested mappings would exceed
3286 			 * the process's RLIMIT_AS limit: fail with
3287 			 * KERN_NO_SPACE.
3288 			 */
3289 			result = KERN_NO_SPACE;
3290 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3291 			    proc_selfpid(),
3292 			    (get_bsdtask_info(current_task())
3293 			    ? proc_name_address(get_bsdtask_info(current_task()))
3294 			    : "?"),
3295 			    __FUNCTION__,
3296 			    (uint64_t) map->size,
3297 			    (uint64_t) map->size_limit);
3298 			DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3299 			    vm_map_size_t, map->size,
3300 			    uint64_t, map->size_limit);
3301 			vm_map_enter_RLIMIT_AS_count++;
3302 		} else if (map->data_limit != RLIM_INFINITY &&
3303 		    map->size > map->data_limit) {
3304 			/*
3305 			 * Establishing the requested mappings would exceed
3306 			 * the process's RLIMIT_DATA limit: fail with
3307 			 * KERN_NO_SPACE.
3308 			 */
3309 			result = KERN_NO_SPACE;
3310 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3311 			    proc_selfpid(),
3312 			    (get_bsdtask_info(current_task())
3313 			    ? proc_name_address(get_bsdtask_info(current_task()))
3314 			    : "?"),
3315 			    __FUNCTION__,
3316 			    (uint64_t) map->size,
3317 			    (uint64_t) map->data_limit);
3318 			DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3319 			    vm_map_size_t, map->size,
3320 			    uint64_t, map->data_limit);
3321 			vm_map_enter_RLIMIT_DATA_count++;
3322 		}
3323 	}
3324 
3325 	if (result == KERN_SUCCESS) {
3326 		vm_prot_t pager_prot;
3327 		memory_object_t pager;
3328 
3329 #if DEBUG
3330 		if (pmap_empty &&
3331 		    !(vmk_flags.vmkf_no_pmap_check)) {
3332 			assert(pmap_is_empty(map->pmap,
3333 			    *address,
3334 			    *address + size));
3335 		}
3336 #endif /* DEBUG */
3337 
3338 		/*
3339 		 * For "named" VM objects, let the pager know that the
3340 		 * memory object is being mapped.  Some pagers need to keep
3341 		 * track of this, to know when they can reclaim the memory
3342 		 * object, for example.
3343 		 * VM calls memory_object_map() for each mapping (specifying
3344 		 * the protection of each mapping) and calls
3345 		 * memory_object_last_unmap() when all the mappings are gone.
3346 		 */
3347 		pager_prot = max_protection;
3348 		if (needs_copy) {
3349 			/*
3350 			 * Copy-On-Write mapping: won't modify
3351 			 * the memory object.
3352 			 */
3353 			pager_prot &= ~VM_PROT_WRITE;
3354 		}
3355 		if (!is_submap &&
3356 		    object != VM_OBJECT_NULL &&
3357 		    object->named &&
3358 		    object->pager != MEMORY_OBJECT_NULL) {
3359 			vm_object_lock(object);
3360 			pager = object->pager;
3361 			if (object->named &&
3362 			    pager != MEMORY_OBJECT_NULL) {
3363 				assert(object->pager_ready);
3364 				vm_object_mapping_wait(object, THREAD_UNINT);
3365 				vm_object_mapping_begin(object);
3366 				vm_object_unlock(object);
3367 
3368 				kr = memory_object_map(pager, pager_prot);
3369 				assert(kr == KERN_SUCCESS);
3370 
3371 				vm_object_lock(object);
3372 				vm_object_mapping_end(object);
3373 			}
3374 			vm_object_unlock(object);
3375 		}
3376 	}
3377 
3378 	assert(map_locked == TRUE);
3379 
3380 	if (new_mapping_established) {
3381 		/*
3382 		 * If we release the map lock for any reason below,
3383 		 * another thread could deallocate our new mapping,
3384 		 * releasing the caller's reference on "caller_object",
3385 		 * which was transferred to the mapping.
3386 		 * If this was the only reference, the object could be
3387 		 * destroyed.
3388 		 *
3389 		 * We need to take an extra reference on "caller_object"
3390 		 * to keep it alive if we need to return the caller's
3391 		 * reference to the caller in case of failure.
3392 		 */
3393 		if (is_submap) {
3394 			vm_map_reference((vm_map_t)caller_object);
3395 		} else {
3396 			vm_object_reference(caller_object);
3397 		}
3398 	}
3399 
3400 	if (!keep_map_locked) {
3401 		vm_map_unlock(map);
3402 		map_locked = FALSE;
3403 		entry = VM_MAP_ENTRY_NULL;
3404 		new_entry = VM_MAP_ENTRY_NULL;
3405 	}
3406 
3407 	/*
3408 	 * We can't hold the map lock if we enter this block.
3409 	 */
3410 
3411 	if (result == KERN_SUCCESS) {
3412 		/*	Wire down the new entry if the user
3413 		 *	requested all new map entries be wired.
3414 		 */
3415 		if ((map->wiring_required) || (superpage_size)) {
3416 			assert(!keep_map_locked);
3417 			pmap_empty = FALSE; /* pmap won't be empty */
3418 			kr = vm_map_wire_kernel(map, start, end,
3419 			    cur_protection, VM_KERN_MEMORY_MLOCK,
3420 			    TRUE);
3421 			result = kr;
3422 		}
3423 
3424 	}
3425 
3426 	if (result != KERN_SUCCESS) {
3427 		if (new_mapping_established) {
3428 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3429 
3430 			/*
3431 			 * We have to get rid of the new mappings since we
3432 			 * won't make them available to the user.
			 * Try to do that atomically, to minimize the risk
			 * that someone else creates new mappings in that range.
3435 			 */
3436 			if (!map_locked) {
3437 				vm_map_lock(map);
3438 				map_locked = TRUE;
3439 			}
3440 			remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3441 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3442 			if (permanent) {
3443 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3444 			}
3445 			(void) vm_map_delete(map,
3446 			    *address, *address + size,
3447 			    remove_flags,
3448 			    KMEM_GUARD_NONE, &zap_new_list);
3449 		}
3450 
3451 		if (vm_map_zap_first_entry(&zap_old_list)) {
3452 			vm_map_entry_t entry1, entry2;
3453 
3454 			/*
3455 			 * The new mapping failed.  Attempt to restore
			 * the old mappings, saved in "zap_old_list".
3457 			 */
3458 			if (!map_locked) {
3459 				vm_map_lock(map);
3460 				map_locked = TRUE;
3461 			}
3462 
3463 			/* first check if the coast is still clear */
3464 			start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3465 			end   = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3466 
3467 			if (vm_map_lookup_entry(map, start, &entry1) ||
3468 			    vm_map_lookup_entry(map, end, &entry2) ||
3469 			    entry1 != entry2) {
3470 				/*
3471 				 * Part of that range has already been
3472 				 * re-mapped:  we can't restore the old
3473 				 * mappings...
3474 				 */
3475 				vm_map_enter_restore_failures++;
3476 			} else {
3477 				/*
3478 				 * Transfer the saved map entries from
				 * "zap_old_list" to the original "map",
3480 				 * inserting them all after "entry1".
3481 				 */
3482 				while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3483 					vm_map_size_t entry_size;
3484 
3485 					entry_size = (entry2->vme_end -
3486 					    entry2->vme_start);
3487 					vm_map_store_entry_link(map, entry1, entry2,
3488 					    VM_MAP_KERNEL_FLAGS_NONE);
3489 					map->size += entry_size;
3490 					entry1 = entry2;
3491 				}
3492 				if (map->wiring_required) {
3493 					/*
3494 					 * XXX TODO: we should rewire the
3495 					 * old pages here...
3496 					 */
3497 				}
3498 				vm_map_enter_restore_successes++;
3499 			}
3500 		}
3501 	}
3502 
3503 	/*
3504 	 * The caller is responsible for releasing the lock if it requested to
3505 	 * keep the map locked.
3506 	 */
3507 	if (map_locked && !keep_map_locked) {
3508 		vm_map_unlock(map);
3509 	}
3510 
3511 	vm_map_zap_dispose(&zap_old_list);
3512 	vm_map_zap_dispose(&zap_new_list);
3513 
3514 	if (new_mapping_established) {
3515 		/*
3516 		 * The caller had a reference on "caller_object" and we
3517 		 * transferred that reference to the mapping.
3518 		 * We also took an extra reference on "caller_object" to keep
3519 		 * it alive while the map was unlocked.
3520 		 */
3521 		if (result == KERN_SUCCESS) {
3522 			/*
3523 			 * On success, the caller's reference on the object gets
			 * transferred to the mapping.
3525 			 * Release our extra reference.
3526 			 */
3527 			if (is_submap) {
3528 				vm_map_deallocate((vm_map_t)caller_object);
3529 			} else {
3530 				vm_object_deallocate(caller_object);
3531 			}
3532 		} else {
3533 			/*
3534 			 * On error, the caller expects to still have a
3535 			 * reference on the object it gave us.
3536 			 * Let's use our extra reference for that.
3537 			 */
3538 		}
3539 	}
3540 
3541 	return result;
3542 
3543 #undef  RETURN
3544 }
3545 
3546 #if __arm64__
3547 extern const struct memory_object_pager_ops fourk_pager_ops;
3548 kern_return_t
vm_map_enter_fourk(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)3549 vm_map_enter_fourk(
3550 	vm_map_t                map,
3551 	vm_map_offset_t         *address,       /* IN/OUT */
3552 	vm_map_size_t           size,
3553 	vm_map_offset_t         mask,
3554 	vm_map_kernel_flags_t   vmk_flags,
3555 	vm_object_t             object,
3556 	vm_object_offset_t      offset,
3557 	boolean_t               needs_copy,
3558 	vm_prot_t               cur_protection,
3559 	vm_prot_t               max_protection,
3560 	vm_inherit_t            inheritance)
3561 {
3562 	vm_map_entry_t          entry, new_entry;
3563 	vm_map_offset_t         start, fourk_start;
3564 	vm_map_offset_t         end, fourk_end;
3565 	vm_map_size_t           fourk_size;
3566 	kern_return_t           result = KERN_SUCCESS;
3567 	boolean_t               map_locked = FALSE;
3568 	boolean_t               pmap_empty = TRUE;
3569 	boolean_t               new_mapping_established = FALSE;
3570 	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
3571 	const bool              anywhere = !vmk_flags.vmf_fixed;
3572 	const bool              purgable = vmk_flags.vmf_purgeable;
3573 	const bool              overwrite = vmk_flags.vmf_overwrite;
3574 	const bool              is_submap = vmk_flags.vmkf_submap;
3575 	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
3576 	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
3577 	vm_map_offset_t         effective_min_offset, effective_max_offset;
3578 	kern_return_t           kr;
3579 	boolean_t               clear_map_aligned = FALSE;
3580 	memory_object_t         fourk_mem_obj;
3581 	vm_object_t             fourk_object;
3582 	vm_map_offset_t         fourk_pager_offset;
3583 	int                     fourk_pager_index_start, fourk_pager_index_num;
3584 	int                     cur_idx;
3585 	boolean_t               fourk_copy;
3586 	vm_object_t             copy_object;
3587 	vm_object_offset_t      copy_offset;
3588 	VM_MAP_ZAP_DECLARE(zap_list);
3589 
3590 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
3591 		panic("%s:%d", __FUNCTION__, __LINE__);
3592 	}
3593 	fourk_mem_obj = MEMORY_OBJECT_NULL;
3594 	fourk_object = VM_OBJECT_NULL;
3595 
3596 	if (superpage_size) {
3597 		return KERN_NOT_SUPPORTED;
3598 	}
3599 
3600 	if ((cur_protection & VM_PROT_WRITE) &&
3601 	    (cur_protection & VM_PROT_EXECUTE) &&
3602 #if XNU_TARGET_OS_OSX
3603 	    map->pmap != kernel_pmap &&
3604 	    (vm_map_cs_enforcement(map)
3605 #if __arm64__
3606 	    || !VM_MAP_IS_EXOTIC(map)
3607 #endif /* __arm64__ */
3608 	    ) &&
3609 #endif /* XNU_TARGET_OS_OSX */
3610 #if CODE_SIGNING_MONITOR
3611 	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
3612 #endif
3613 	    !entry_for_jit) {
3614 		DTRACE_VM3(cs_wx,
3615 		    uint64_t, 0,
3616 		    uint64_t, 0,
3617 		    vm_prot_t, cur_protection);
3618 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
3619 		    "turning off execute\n",
3620 		    proc_selfpid(),
3621 		    (get_bsdtask_info(current_task())
3622 		    ? proc_name_address(get_bsdtask_info(current_task()))
3623 		    : "?"),
3624 		    __FUNCTION__);
3625 		cur_protection &= ~VM_PROT_EXECUTE;
3626 	}
3627 
3628 	/*
3629 	 * If the task has requested executable lockdown,
3630 	 * deny any new executable mapping.
3631 	 */
3632 	if (map->map_disallow_new_exec == TRUE) {
3633 		if (cur_protection & VM_PROT_EXECUTE) {
3634 			return KERN_PROTECTION_FAILURE;
3635 		}
3636 	}
3637 
3638 	if (is_submap) {
3639 		return KERN_NOT_SUPPORTED;
3640 	}
3641 	if (vmk_flags.vmkf_already) {
3642 		return KERN_NOT_SUPPORTED;
3643 	}
3644 	if (purgable || entry_for_jit) {
3645 		return KERN_NOT_SUPPORTED;
3646 	}
3647 
3648 	effective_min_offset = map->min_offset;
3649 
3650 	if (vmk_flags.vmkf_beyond_max) {
3651 		return KERN_NOT_SUPPORTED;
3652 	} else {
3653 		effective_max_offset = map->max_offset;
3654 	}
3655 
3656 	if (size == 0 ||
3657 	    (offset & FOURK_PAGE_MASK) != 0) {
3658 		*address = 0;
3659 		return KERN_INVALID_ARGUMENT;
3660 	}
3661 
3662 #define RETURN(value)   { result = value; goto BailOut; }
3663 
3664 	assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK));
3665 	assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK));
3666 
3667 	if (!anywhere && overwrite) {
3668 		return KERN_NOT_SUPPORTED;
3669 	}
3670 
3671 	fourk_start = *address;
3672 	fourk_size = size;
3673 	fourk_end = fourk_start + fourk_size;
3674 
3675 	start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map));
3676 	end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map));
3677 	size = end - start;
3678 
3679 	if (anywhere) {
3680 		return KERN_NOT_SUPPORTED;
3681 	} else {
3682 		/*
3683 		 *	Verify that:
3684 		 *		the address doesn't itself violate
3685 		 *		the mask requirement.
3686 		 */
3687 
3688 		vm_map_lock(map);
3689 		map_locked = TRUE;
3690 		if ((start & mask) != 0) {
3691 			RETURN(KERN_NO_SPACE);
3692 		}
3693 
3694 		/*
3695 		 *	...	the address is within bounds
3696 		 */
3697 
3698 		end = start + size;
3699 
3700 		if ((start < effective_min_offset) ||
3701 		    (end > effective_max_offset) ||
3702 		    (start >= end)) {
3703 			RETURN(KERN_INVALID_ADDRESS);
3704 		}
3705 
3706 		/*
3707 		 *	...	the starting address isn't allocated
3708 		 */
3709 		if (vm_map_lookup_entry(map, start, &entry)) {
3710 			vm_object_t cur_object, shadow_object;
3711 
3712 			/*
			 * We might already have some 4K mappings
3714 			 * in a 16K page here.
3715 			 */
3716 
3717 			if (entry->vme_end - entry->vme_start
3718 			    != SIXTEENK_PAGE_SIZE) {
3719 				RETURN(KERN_NO_SPACE);
3720 			}
3721 			if (entry->is_sub_map) {
3722 				RETURN(KERN_NO_SPACE);
3723 			}
3724 			if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
3725 				RETURN(KERN_NO_SPACE);
3726 			}
3727 
3728 			/* go all the way down the shadow chain */
3729 			cur_object = VME_OBJECT(entry);
3730 			vm_object_lock(cur_object);
3731 			while (cur_object->shadow != VM_OBJECT_NULL) {
3732 				shadow_object = cur_object->shadow;
3733 				vm_object_lock(shadow_object);
3734 				vm_object_unlock(cur_object);
3735 				cur_object = shadow_object;
3736 				shadow_object = VM_OBJECT_NULL;
3737 			}
3738 			if (cur_object->internal ||
3739 			    cur_object->pager == NULL) {
3740 				vm_object_unlock(cur_object);
3741 				RETURN(KERN_NO_SPACE);
3742 			}
3743 			if (cur_object->pager->mo_pager_ops
3744 			    != &fourk_pager_ops) {
3745 				vm_object_unlock(cur_object);
3746 				RETURN(KERN_NO_SPACE);
3747 			}
3748 			fourk_object = cur_object;
3749 			fourk_mem_obj = fourk_object->pager;
3750 
3751 			/* keep the "4K" object alive */
3752 			vm_object_reference_locked(fourk_object);
3753 			memory_object_reference(fourk_mem_obj);
3754 			vm_object_unlock(fourk_object);
3755 
3756 			/* merge permissions */
3757 			entry->protection |= cur_protection;
3758 			entry->max_protection |= max_protection;
3759 
3760 			if ((entry->protection & VM_PROT_WRITE) &&
3761 			    (entry->protection & VM_PROT_ALLEXEC) &&
3762 			    fourk_binary_compatibility_unsafe &&
3763 			    fourk_binary_compatibility_allow_wx) {
3764 				/* write+execute: need to be "jit" */
3765 				entry->used_for_jit = TRUE;
3766 			}
3767 			goto map_in_fourk_pager;
3768 		}
3769 
3770 		/*
3771 		 *	...	the next region doesn't overlap the
3772 		 *		end point.
3773 		 */
3774 
3775 		if ((entry->vme_next != vm_map_to_entry(map)) &&
3776 		    (entry->vme_next->vme_start < end)) {
3777 			RETURN(KERN_NO_SPACE);
3778 		}
3779 	}
3780 
3781 	/*
3782 	 *	At this point,
3783 	 *		"start" and "end" should define the endpoints of the
3784 	 *			available new range, and
3785 	 *		"entry" should refer to the region before the new
3786 	 *			range, and
3787 	 *
3788 	 *		the map should be locked.
3789 	 */
3790 
3791 	/* create a new "4K" pager */
3792 	fourk_mem_obj = fourk_pager_create();
3793 	fourk_object = fourk_pager_to_vm_object(fourk_mem_obj);
3794 	assert(fourk_object);
3795 
3796 	/* keep the "4" object alive */
3797 	vm_object_reference(fourk_object);
3798 
3799 	/* create a "copy" object, to map the "4K" object copy-on-write */
3800 	fourk_copy = TRUE;
3801 	result = vm_object_copy_strategically(fourk_object,
3802 	    0,
3803 	    end - start,
3804 	    false,                                   /* forking */
3805 	    &copy_object,
3806 	    &copy_offset,
3807 	    &fourk_copy);
3808 	assert(result == KERN_SUCCESS);
3809 	assert(copy_object != VM_OBJECT_NULL);
3810 	assert(copy_offset == 0);
3811 
3812 	/* map the "4K" pager's copy object */
3813 	new_entry = vm_map_entry_insert(map,
3814 	    entry,
3815 	    vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)),
3816 	    vm_map_round_page(end, VM_MAP_PAGE_MASK(map)),
3817 	    copy_object,
3818 	    0,                      /* offset */
3819 	    vmk_flags,
3820 	    FALSE,                  /* needs_copy */
3821 	    cur_protection, max_protection,
3822 	    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3823 	    VM_INHERIT_NONE : inheritance),
3824 	    clear_map_aligned);
3825 	entry = new_entry;
3826 
3827 #if VM_MAP_DEBUG_FOURK
3828 	if (vm_map_debug_fourk) {
3829 		printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n",
3830 		    map,
3831 		    (uint64_t) entry->vme_start,
3832 		    (uint64_t) entry->vme_end,
3833 		    fourk_mem_obj);
3834 	}
3835 #endif /* VM_MAP_DEBUG_FOURK */
3836 
3837 	new_mapping_established = TRUE;
3838 
3839 map_in_fourk_pager:
3840 	/* "map" the original "object" where it belongs in the "4K" pager */
3841 	fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK);
3842 	fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE);
3843 	if (fourk_size > SIXTEENK_PAGE_SIZE) {
3844 		fourk_pager_index_num = 4;
3845 	} else {
3846 		fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE);
3847 	}
3848 	if (fourk_pager_index_start + fourk_pager_index_num > 4) {
3849 		fourk_pager_index_num = 4 - fourk_pager_index_start;
3850 	}
3851 	for (cur_idx = 0;
3852 	    cur_idx < fourk_pager_index_num;
3853 	    cur_idx++) {
3854 		vm_object_t             old_object;
3855 		vm_object_offset_t      old_offset;
3856 
3857 		kr = fourk_pager_populate(fourk_mem_obj,
3858 		    TRUE,                       /* overwrite */
3859 		    fourk_pager_index_start + cur_idx,
3860 		    object,
3861 		    (object
3862 		    ? (offset +
3863 		    (cur_idx * FOURK_PAGE_SIZE))
3864 		    : 0),
3865 		    &old_object,
3866 		    &old_offset);
3867 #if VM_MAP_DEBUG_FOURK
3868 		if (vm_map_debug_fourk) {
3869 			if (old_object == (vm_object_t) -1 &&
3870 			    old_offset == (vm_object_offset_t) -1) {
3871 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3872 				    "pager [%p:0x%llx] "
3873 				    "populate[%d] "
3874 				    "[object:%p,offset:0x%llx]\n",
3875 				    map,
3876 				    (uint64_t) entry->vme_start,
3877 				    (uint64_t) entry->vme_end,
3878 				    fourk_mem_obj,
3879 				    VME_OFFSET(entry),
3880 				    fourk_pager_index_start + cur_idx,
3881 				    object,
3882 				    (object
3883 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3884 				    : 0));
3885 			} else {
3886 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3887 				    "pager [%p:0x%llx] "
3888 				    "populate[%d] [object:%p,offset:0x%llx] "
3889 				    "old [%p:0x%llx]\n",
3890 				    map,
3891 				    (uint64_t) entry->vme_start,
3892 				    (uint64_t) entry->vme_end,
3893 				    fourk_mem_obj,
3894 				    VME_OFFSET(entry),
3895 				    fourk_pager_index_start + cur_idx,
3896 				    object,
3897 				    (object
3898 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3899 				    : 0),
3900 				    old_object,
3901 				    old_offset);
3902 			}
3903 		}
3904 #endif /* VM_MAP_DEBUG_FOURK */
3905 
3906 		assert(kr == KERN_SUCCESS);
3907 		if (object != old_object &&
3908 		    object != VM_OBJECT_NULL &&
3909 		    object != (vm_object_t) -1) {
3910 			vm_object_reference(object);
3911 		}
3912 		if (object != old_object &&
3913 		    old_object != VM_OBJECT_NULL &&
3914 		    old_object != (vm_object_t) -1) {
3915 			vm_object_deallocate(old_object);
3916 		}
3917 	}
3918 
3919 BailOut:
3920 	assert(map_locked == TRUE);
3921 
3922 	if (result == KERN_SUCCESS) {
3923 		vm_prot_t pager_prot;
3924 		memory_object_t pager;
3925 
3926 #if DEBUG
3927 		if (pmap_empty &&
3928 		    !(vmk_flags.vmkf_no_pmap_check)) {
3929 			assert(pmap_is_empty(map->pmap,
3930 			    *address,
3931 			    *address + size));
3932 		}
3933 #endif /* DEBUG */
3934 
3935 		/*
3936 		 * For "named" VM objects, let the pager know that the
3937 		 * memory object is being mapped.  Some pagers need to keep
3938 		 * track of this, to know when they can reclaim the memory
3939 		 * object, for example.
3940 		 * VM calls memory_object_map() for each mapping (specifying
3941 		 * the protection of each mapping) and calls
3942 		 * memory_object_last_unmap() when all the mappings are gone.
3943 		 */
3944 		pager_prot = max_protection;
3945 		if (needs_copy) {
3946 			/*
3947 			 * Copy-On-Write mapping: won't modify
3948 			 * the memory object.
3949 			 */
3950 			pager_prot &= ~VM_PROT_WRITE;
3951 		}
3952 		if (!is_submap &&
3953 		    object != VM_OBJECT_NULL &&
3954 		    object->named &&
3955 		    object->pager != MEMORY_OBJECT_NULL) {
3956 			vm_object_lock(object);
3957 			pager = object->pager;
3958 			if (object->named &&
3959 			    pager != MEMORY_OBJECT_NULL) {
3960 				assert(object->pager_ready);
3961 				vm_object_mapping_wait(object, THREAD_UNINT);
3962 				vm_object_mapping_begin(object);
3963 				vm_object_unlock(object);
3964 
3965 				kr = memory_object_map(pager, pager_prot);
3966 				assert(kr == KERN_SUCCESS);
3967 
3968 				vm_object_lock(object);
3969 				vm_object_mapping_end(object);
3970 			}
3971 			vm_object_unlock(object);
3972 		}
3973 		if (!is_submap &&
3974 		    fourk_object != VM_OBJECT_NULL &&
3975 		    fourk_object->named &&
3976 		    fourk_object->pager != MEMORY_OBJECT_NULL) {
3977 			vm_object_lock(fourk_object);
3978 			pager = fourk_object->pager;
3979 			if (fourk_object->named &&
3980 			    pager != MEMORY_OBJECT_NULL) {
3981 				assert(fourk_object->pager_ready);
3982 				vm_object_mapping_wait(fourk_object,
3983 				    THREAD_UNINT);
3984 				vm_object_mapping_begin(fourk_object);
3985 				vm_object_unlock(fourk_object);
3986 
3987 				kr = memory_object_map(pager, VM_PROT_READ);
3988 				assert(kr == KERN_SUCCESS);
3989 
3990 				vm_object_lock(fourk_object);
3991 				vm_object_mapping_end(fourk_object);
3992 			}
3993 			vm_object_unlock(fourk_object);
3994 		}
3995 	}
3996 
3997 	if (fourk_object != VM_OBJECT_NULL) {
3998 		vm_object_deallocate(fourk_object);
3999 		fourk_object = VM_OBJECT_NULL;
4000 		memory_object_deallocate(fourk_mem_obj);
4001 		fourk_mem_obj = MEMORY_OBJECT_NULL;
4002 	}
4003 
4004 	assert(map_locked == TRUE);
4005 
4006 	if (!keep_map_locked) {
4007 		vm_map_unlock(map);
4008 		map_locked = FALSE;
4009 	}
4010 
4011 	/*
4012 	 * We can't hold the map lock if we enter this block.
4013 	 */
4014 
4015 	if (result == KERN_SUCCESS) {
4016 		/*	Wire down the new entry if the user
4017 		 *	requested all new map entries be wired.
4018 		 */
4019 		if ((map->wiring_required) || (superpage_size)) {
4020 			assert(!keep_map_locked);
4021 			pmap_empty = FALSE; /* pmap won't be empty */
4022 			kr = vm_map_wire_kernel(map, start, end,
4023 			    new_entry->protection, VM_KERN_MEMORY_MLOCK,
4024 			    TRUE);
4025 			result = kr;
4026 		}
4027 
4028 	}
4029 
4030 	if (result != KERN_SUCCESS) {
4031 		if (new_mapping_established) {
4032 			/*
4033 			 * We have to get rid of the new mappings since we
4034 			 * won't make them available to the user.
4035 			 * Try and do that atomically, to minimize the risk
4036 			 * that someone else create new mappings that range.
4037 			 */
4038 
4039 			if (!map_locked) {
4040 				vm_map_lock(map);
4041 				map_locked = TRUE;
4042 			}
4043 			(void)vm_map_delete(map, *address, *address + size,
4044 			    VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
4045 			    KMEM_GUARD_NONE, &zap_list);
4046 		}
4047 	}
4048 
4049 	/*
4050 	 * The caller is responsible for releasing the lock if it requested to
4051 	 * keep the map locked.
4052 	 */
4053 	if (map_locked && !keep_map_locked) {
4054 		vm_map_unlock(map);
4055 	}
4056 
4057 	vm_map_zap_dispose(&zap_list);
4058 
4059 	return result;
4060 
4061 #undef  RETURN
4062 }
4063 #endif /* __arm64__ */
4064 
4065 /*
4066  * Counters for the prefault optimization.
4067  */
4068 int64_t vm_prefault_nb_pages = 0;
4069 int64_t vm_prefault_nb_bailout = 0;
4070 
4071 static kern_return_t
vm_map_enter_mem_object_helper(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,upl_page_list_ptr_t page_list,unsigned int page_list_count)4072 vm_map_enter_mem_object_helper(
4073 	vm_map_t                target_map,
4074 	vm_map_offset_t         *address,
4075 	vm_map_size_t           initial_size,
4076 	vm_map_offset_t         mask,
4077 	vm_map_kernel_flags_t   vmk_flags,
4078 	ipc_port_t              port,
4079 	vm_object_offset_t      offset,
4080 	boolean_t               copy,
4081 	vm_prot_t               cur_protection,
4082 	vm_prot_t               max_protection,
4083 	vm_inherit_t            inheritance,
4084 	upl_page_list_ptr_t     page_list,
4085 	unsigned int            page_list_count)
4086 {
4087 	vm_map_address_t        map_addr;
4088 	vm_map_size_t           map_size;
4089 	vm_object_t             object;
4090 	vm_object_size_t        size;
4091 	kern_return_t           result;
4092 	boolean_t               mask_cur_protection, mask_max_protection;
4093 	boolean_t               kernel_prefault, try_prefault = (page_list_count != 0);
4094 	vm_map_offset_t         offset_in_mapping = 0;
4095 #if __arm64__
4096 	boolean_t               fourk = vmk_flags.vmkf_fourk;
4097 #endif /* __arm64__ */
4098 
4099 	if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4100 		/* XXX TODO4K prefaulting depends on page size... */
4101 		try_prefault = FALSE;
4102 	}
4103 
4104 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
4105 	vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
4106 
4107 	mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4108 	mask_max_protection = max_protection & VM_PROT_IS_MASK;
4109 	cur_protection &= ~VM_PROT_IS_MASK;
4110 	max_protection &= ~VM_PROT_IS_MASK;
4111 
4112 	/*
4113 	 * Check arguments for validity
4114 	 */
4115 	if ((target_map == VM_MAP_NULL) ||
4116 	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4117 	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4118 	    (inheritance > VM_INHERIT_LAST_VALID) ||
4119 	    (try_prefault && (copy || !page_list)) ||
4120 	    initial_size == 0) {
4121 		return KERN_INVALID_ARGUMENT;
4122 	}
4123 
4124 #if __arm64__
4125 	if (cur_protection & VM_PROT_EXECUTE) {
4126 		cur_protection |= VM_PROT_READ;
4127 	}
4128 
4129 	if (fourk && VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4130 		/* no "fourk" if map is using a sub-page page size */
4131 		fourk = FALSE;
4132 	}
4133 	if (fourk) {
4134 		map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK);
4135 		map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK);
4136 	} else
4137 #endif /* __arm64__ */
4138 	{
4139 		map_addr = vm_map_trunc_page(*address,
4140 		    VM_MAP_PAGE_MASK(target_map));
4141 		map_size = vm_map_round_page(initial_size,
4142 		    VM_MAP_PAGE_MASK(target_map));
4143 	}
4144 	if (map_size == 0) {
4145 		return KERN_INVALID_ARGUMENT;
4146 	}
4147 	size = vm_object_round_page(initial_size);
4148 
4149 	/*
4150 	 * Find the vm object (if any) corresponding to this port.
4151 	 */
4152 	if (!IP_VALID(port)) {
4153 		object = VM_OBJECT_NULL;
4154 		offset = 0;
4155 		copy = FALSE;
4156 	} else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4157 		vm_named_entry_t        named_entry;
4158 		vm_object_offset_t      data_offset;
4159 
4160 		named_entry = mach_memory_entry_from_port(port);
4161 
4162 		if (vmk_flags.vmf_return_data_addr ||
4163 		    vmk_flags.vmf_return_4k_data_addr) {
4164 			data_offset = named_entry->data_offset;
4165 			offset += named_entry->data_offset;
4166 		} else {
4167 			data_offset = 0;
4168 		}
4169 
4170 		/* a few checks to make sure user is obeying rules */
4171 		if (mask_max_protection) {
4172 			max_protection &= named_entry->protection;
4173 		}
4174 		if (mask_cur_protection) {
4175 			cur_protection &= named_entry->protection;
4176 		}
4177 		if ((named_entry->protection & max_protection) !=
4178 		    max_protection) {
4179 			return KERN_INVALID_RIGHT;
4180 		}
4181 		if ((named_entry->protection & cur_protection) !=
4182 		    cur_protection) {
4183 			return KERN_INVALID_RIGHT;
4184 		}
4185 		if (offset + size <= offset) {
4186 			/* overflow */
4187 			return KERN_INVALID_ARGUMENT;
4188 		}
4189 		if (named_entry->size < (offset + initial_size)) {
4190 			return KERN_INVALID_ARGUMENT;
4191 		}
4192 
4193 		if (named_entry->is_copy) {
4194 			/* for a vm_map_copy, we can only map it whole */
4195 			if ((size != named_entry->size) &&
4196 			    (vm_map_round_page(size,
4197 			    VM_MAP_PAGE_MASK(target_map)) ==
4198 			    named_entry->size)) {
4199 				/* XXX FBDP use the rounded size... */
4200 				size = vm_map_round_page(
4201 					size,
4202 					VM_MAP_PAGE_MASK(target_map));
4203 			}
4204 		}
4205 
4206 		/* the callers parameter offset is defined to be the */
4207 		/* offset from beginning of named entry offset in object */
4208 		offset = offset + named_entry->offset;
4209 
4210 		if (!VM_MAP_PAGE_ALIGNED(size,
4211 		    VM_MAP_PAGE_MASK(target_map))) {
4212 			/*
4213 			 * Let's not map more than requested;
4214 			 * vm_map_enter() will handle this "not map-aligned"
4215 			 * case.
4216 			 */
4217 			map_size = size;
4218 		}
4219 
4220 		named_entry_lock(named_entry);
4221 		if (named_entry->is_sub_map) {
4222 			vm_map_t                submap;
4223 
4224 			if (vmk_flags.vmf_return_data_addr ||
4225 			    vmk_flags.vmf_return_4k_data_addr) {
4226 				panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4227 			}
4228 
4229 			submap = named_entry->backing.map;
4230 			vm_map_reference(submap);
4231 			named_entry_unlock(named_entry);
4232 
4233 			vmk_flags.vmkf_submap = TRUE;
4234 
4235 			result = vm_map_enter(target_map,
4236 			    &map_addr,
4237 			    map_size,
4238 			    mask,
4239 			    vmk_flags,
4240 			    (vm_object_t)(uintptr_t) submap,
4241 			    offset,
4242 			    copy,
4243 			    cur_protection,
4244 			    max_protection,
4245 			    inheritance);
4246 			if (result != KERN_SUCCESS) {
4247 				vm_map_deallocate(submap);
4248 			} else {
4249 				/*
4250 				 * No need to lock "submap" just to check its
4251 				 * "mapped" flag: that flag is never reset
4252 				 * once it's been set and if we race, we'll
4253 				 * just end up setting it twice, which is OK.
4254 				 */
4255 				if (submap->mapped_in_other_pmaps == FALSE &&
4256 				    vm_map_pmap(submap) != PMAP_NULL &&
4257 				    vm_map_pmap(submap) !=
4258 				    vm_map_pmap(target_map)) {
4259 					/*
4260 					 * This submap is being mapped in a map
4261 					 * that uses a different pmap.
4262 					 * Set its "mapped_in_other_pmaps" flag
4263 					 * to indicate that we now need to
4264 					 * remove mappings from all pmaps rather
4265 					 * than just the submap's pmap.
4266 					 */
4267 					vm_map_lock(submap);
4268 					submap->mapped_in_other_pmaps = TRUE;
4269 					vm_map_unlock(submap);
4270 				}
4271 				*address = map_addr;
4272 			}
4273 			return result;
4274 		} else if (named_entry->is_copy) {
4275 			kern_return_t   kr;
4276 			vm_map_copy_t   copy_map;
4277 			vm_map_entry_t  copy_entry;
4278 			vm_map_offset_t copy_addr;
4279 			vm_map_copy_t   target_copy_map;
4280 			vm_map_offset_t overmap_start, overmap_end;
4281 			vm_map_offset_t trimmed_start;
4282 			vm_map_size_t   target_size;
4283 
4284 			if (!vm_map_kernel_flags_check_vmflags(vmk_flags,
4285 			    (VM_FLAGS_FIXED |
4286 			    VM_FLAGS_ANYWHERE |
4287 			    VM_FLAGS_OVERWRITE |
4288 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4289 			    VM_FLAGS_RETURN_DATA_ADDR))) {
4290 				named_entry_unlock(named_entry);
4291 				return KERN_INVALID_ARGUMENT;
4292 			}
4293 
4294 			copy_map = named_entry->backing.copy;
4295 			assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4296 			if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4297 				/* unsupported type; should not happen */
4298 				printf("vm_map_enter_mem_object: "
4299 				    "memory_entry->backing.copy "
4300 				    "unsupported type 0x%x\n",
4301 				    copy_map->type);
4302 				named_entry_unlock(named_entry);
4303 				return KERN_INVALID_ARGUMENT;
4304 			}
4305 
4306 			if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4307 				DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4308 			}
4309 
4310 			if (vmk_flags.vmf_return_data_addr ||
4311 			    vmk_flags.vmf_return_4k_data_addr) {
4312 				offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map);
4313 				if (vmk_flags.vmf_return_4k_data_addr) {
4314 					offset_in_mapping &= ~((signed)(0xFFF));
4315 				}
4316 			}
4317 
4318 			target_copy_map = VM_MAP_COPY_NULL;
4319 			target_size = copy_map->size;
4320 			overmap_start = 0;
4321 			overmap_end = 0;
4322 			trimmed_start = 0;
4323 			if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4324 				DEBUG4K_ADJUST("adjusting...\n");
4325 				kr = vm_map_copy_adjust_to_target(
4326 					copy_map,
4327 					offset /* includes data_offset */,
4328 					initial_size,
4329 					target_map,
4330 					copy,
4331 					&target_copy_map,
4332 					&overmap_start,
4333 					&overmap_end,
4334 					&trimmed_start);
4335 				if (kr != KERN_SUCCESS) {
4336 					named_entry_unlock(named_entry);
4337 					return kr;
4338 				}
4339 				target_size = target_copy_map->size;
4340 				if (trimmed_start >= data_offset) {
4341 					data_offset = offset & VM_MAP_PAGE_MASK(target_map);
4342 				} else {
4343 					data_offset -= trimmed_start;
4344 				}
4345 			} else {
4346 				/*
4347 				 * Assert that the vm_map_copy is coming from the right
4348 				 * zone and hasn't been forged
4349 				 */
4350 				vm_map_copy_require(copy_map);
4351 				target_copy_map = copy_map;
4352 			}
4353 
4354 			vm_map_kernel_flags_t rsv_flags = vmk_flags;
4355 
4356 			vm_map_kernel_flags_and_vmflags(&rsv_flags,
4357 			    (VM_FLAGS_FIXED |
4358 			    VM_FLAGS_ANYWHERE |
4359 			    VM_FLAGS_OVERWRITE |
4360 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4361 			    VM_FLAGS_RETURN_DATA_ADDR));
4362 
4363 			/* reserve a contiguous range */
4364 			kr = vm_map_enter(target_map,
4365 			    &map_addr,
4366 			    vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)),
4367 			    mask,
4368 			    rsv_flags,
4369 			    VM_OBJECT_NULL,
4370 			    0,
4371 			    FALSE,               /* copy */
4372 			    cur_protection,
4373 			    max_protection,
4374 			    inheritance);
4375 			if (kr != KERN_SUCCESS) {
4376 				DEBUG4K_ERROR("kr 0x%x\n", kr);
4377 				if (target_copy_map != copy_map) {
4378 					vm_map_copy_discard(target_copy_map);
4379 					target_copy_map = VM_MAP_COPY_NULL;
4380 				}
4381 				named_entry_unlock(named_entry);
4382 				return kr;
4383 			}
4384 
4385 			copy_addr = map_addr;
4386 
4387 			for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4388 			    copy_entry != vm_map_copy_to_entry(target_copy_map);
4389 			    copy_entry = copy_entry->vme_next) {
4390 				vm_map_t                copy_submap = VM_MAP_NULL;
4391 				vm_object_t             copy_object = VM_OBJECT_NULL;
4392 				vm_map_size_t           copy_size;
4393 				vm_object_offset_t      copy_offset;
4394 				boolean_t               do_copy = false;
4395 
4396 				if (copy_entry->is_sub_map) {
4397 					copy_submap = VME_SUBMAP(copy_entry);
4398 					copy_object = (vm_object_t)copy_submap;
4399 				} else {
4400 					copy_object = VME_OBJECT(copy_entry);
4401 				}
4402 				copy_offset = VME_OFFSET(copy_entry);
4403 				copy_size = (copy_entry->vme_end -
4404 				    copy_entry->vme_start);
4405 
4406 				/* sanity check */
4407 				if ((copy_addr + copy_size) >
4408 				    (map_addr +
4409 				    overmap_start + overmap_end +
4410 				    named_entry->size /* XXX full size */)) {
4411 					/* over-mapping too much !? */
4412 					kr = KERN_INVALID_ARGUMENT;
4413 					DEBUG4K_ERROR("kr 0x%x\n", kr);
4414 					/* abort */
4415 					break;
4416 				}
4417 
4418 				/* take a reference on the object */
4419 				if (copy_entry->is_sub_map) {
4420 					vm_map_reference(copy_submap);
4421 				} else {
4422 					if (!copy &&
4423 					    copy_object != VM_OBJECT_NULL &&
4424 					    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4425 						bool is_writable;
4426 
4427 						/*
4428 						 * We need to resolve our side of this
4429 						 * "symmetric" copy-on-write now; we
4430 						 * need a new object to map and share,
4431 						 * instead of the current one which
4432 						 * might still be shared with the
4433 						 * original mapping.
4434 						 *
4435 						 * Note: A "vm_map_copy_t" does not
4436 						 * have a lock but we're protected by
4437 						 * the named entry's lock here.
4438 						 */
4439 						// assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4440 						VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4441 						assert(copy_object != VME_OBJECT(copy_entry));
4442 						is_writable = false;
4443 						if (copy_entry->protection & VM_PROT_WRITE) {
4444 							is_writable = true;
4445 #if __arm64e__
4446 						} else if (copy_entry->used_for_tpro) {
4447 							is_writable = true;
4448 #endif /* __arm64e__ */
4449 						}
4450 						if (!copy_entry->needs_copy && is_writable) {
4451 							vm_prot_t prot;
4452 
4453 							prot = copy_entry->protection & ~VM_PROT_WRITE;
4454 							vm_object_pmap_protect(copy_object,
4455 							    copy_offset,
4456 							    copy_size,
4457 							    PMAP_NULL,
4458 							    PAGE_SIZE,
4459 							    0,
4460 							    prot);
4461 						}
4462 						copy_entry->needs_copy = FALSE;
4463 						copy_entry->is_shared = TRUE;
4464 						copy_object = VME_OBJECT(copy_entry);
4465 						copy_offset = VME_OFFSET(copy_entry);
4466 						vm_object_lock(copy_object);
4467 						/* we're about to make a shared mapping of this object */
4468 						copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4469 						copy_object->true_share = TRUE;
4470 						vm_object_unlock(copy_object);
4471 					}
4472 
4473 					if (copy_object != VM_OBJECT_NULL &&
4474 					    copy_object->named &&
4475 					    copy_object->pager != MEMORY_OBJECT_NULL &&
4476 					    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4477 						memory_object_t pager;
4478 						vm_prot_t       pager_prot;
4479 
4480 						/*
4481 						 * For "named" VM objects, let the pager know that the
4482 						 * memory object is being mapped.  Some pagers need to keep
4483 						 * track of this, to know when they can reclaim the memory
4484 						 * object, for example.
4485 						 * VM calls memory_object_map() for each mapping (specifying
4486 						 * the protection of each mapping) and calls
4487 						 * memory_object_last_unmap() when all the mappings are gone.
4488 						 */
4489 						pager_prot = max_protection;
4490 						if (copy) {
4491 							/*
4492 							 * Copy-On-Write mapping: won't modify the
4493 							 * memory object.
4494 							 */
4495 							pager_prot &= ~VM_PROT_WRITE;
4496 						}
4497 						vm_object_lock(copy_object);
4498 						pager = copy_object->pager;
4499 						if (copy_object->named &&
4500 						    pager != MEMORY_OBJECT_NULL &&
4501 						    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4502 							assert(copy_object->pager_ready);
4503 							vm_object_mapping_wait(copy_object, THREAD_UNINT);
4504 							vm_object_mapping_begin(copy_object);
4505 							vm_object_unlock(copy_object);
4506 
4507 							kr = memory_object_map(pager, pager_prot);
4508 							assert(kr == KERN_SUCCESS);
4509 
4510 							vm_object_lock(copy_object);
4511 							vm_object_mapping_end(copy_object);
4512 						}
4513 						vm_object_unlock(copy_object);
4514 					}
4515 
4516 					/*
4517 					 *	Perform the copy if requested
4518 					 */
4519 
4520 					if (copy && copy_object != VM_OBJECT_NULL) {
4521 						vm_object_t             new_object;
4522 						vm_object_offset_t      new_offset;
4523 
4524 						result = vm_object_copy_strategically(copy_object, copy_offset,
4525 						    copy_size,
4526 						    false,                                   /* forking */
4527 						    &new_object, &new_offset,
4528 						    &do_copy);
4529 
4530 
4531 						if (result == KERN_MEMORY_RESTART_COPY) {
4532 							boolean_t success;
4533 							boolean_t src_needs_copy;
4534 
4535 							/*
4536 							 * XXX
4537 							 * We currently ignore src_needs_copy.
4538 							 * This really is the issue of how to make
4539 							 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4540 							 * non-kernel users to use. Solution forthcoming.
4541 							 * In the meantime, since we don't allow non-kernel
4542 							 * memory managers to specify symmetric copy,
4543 							 * we won't run into problems here.
4544 							 */
4545 							new_object = copy_object;
4546 							new_offset = copy_offset;
4547 							success = vm_object_copy_quickly(new_object,
4548 							    new_offset,
4549 							    copy_size,
4550 							    &src_needs_copy,
4551 							    &do_copy);
4552 							assert(success);
4553 							result = KERN_SUCCESS;
4554 						}
4555 						if (result != KERN_SUCCESS) {
4556 							kr = result;
4557 							break;
4558 						}
4559 
4560 						copy_object = new_object;
4561 						copy_offset = new_offset;
4562 						/*
4563 						 * No extra object reference for the mapping:
4564 						 * the mapping should be the only thing keeping
4565 						 * this new object alive.
4566 						 */
4567 					} else {
4568 						/*
4569 						 * We already have the right object
4570 						 * to map.
4571 						 */
4572 						copy_object = VME_OBJECT(copy_entry);
4573 						/* take an extra ref for the mapping below */
4574 						vm_object_reference(copy_object);
4575 					}
4576 				}
4577 
4578 				/*
4579 				 * If the caller does not want a specific
4580 				 * tag for this new mapping:  use
4581 				 * the tag of the original mapping.
4582 				 */
4583 				vm_map_kernel_flags_t vmk_remap_flags = {
4584 					.vmkf_submap = copy_entry->is_sub_map,
4585 				};
4586 
4587 				vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4588 				    vm_map_kernel_flags_vmflags(vmk_flags),
4589 				    vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4590 
4591 				/* over-map the object into destination */
4592 				vmk_remap_flags.vmf_fixed = true;
4593 				vmk_remap_flags.vmf_overwrite = true;
4594 
4595 				if (!copy && !copy_entry->is_sub_map) {
4596 					/*
4597 					 * copy-on-write should have been
4598 					 * resolved at this point, or we would
4599 					 * end up sharing instead of copying.
4600 					 */
4601 					assert(!copy_entry->needs_copy);
4602 				}
4603 #if XNU_TARGET_OS_OSX
4604 				if (copy_entry->used_for_jit) {
4605 					vmk_remap_flags.vmkf_map_jit = TRUE;
4606 				}
4607 #endif /* XNU_TARGET_OS_OSX */
4608 
4609 				kr = vm_map_enter(target_map,
4610 				    &copy_addr,
4611 				    copy_size,
4612 				    (vm_map_offset_t) 0,
4613 				    vmk_remap_flags,
4614 				    copy_object,
4615 				    copy_offset,
4616 				    ((copy_object == NULL)
4617 				    ? FALSE
4618 				    : (copy || copy_entry->needs_copy)),
4619 				    cur_protection,
4620 				    max_protection,
4621 				    inheritance);
4622 				if (kr != KERN_SUCCESS) {
4623 					DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4624 					if (copy_entry->is_sub_map) {
4625 						vm_map_deallocate(copy_submap);
4626 					} else {
4627 						vm_object_deallocate(copy_object);
4628 					}
4629 					/* abort */
4630 					break;
4631 				}
4632 
4633 				/* next mapping */
4634 				copy_addr += copy_size;
4635 			}
4636 
4637 			if (kr == KERN_SUCCESS) {
4638 				if (vmk_flags.vmf_return_data_addr ||
4639 				    vmk_flags.vmf_return_4k_data_addr) {
4640 					*address = map_addr + offset_in_mapping;
4641 				} else {
4642 					*address = map_addr;
4643 				}
4644 				if (overmap_start) {
4645 					*address += overmap_start;
4646 					DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address);
4647 				}
4648 			}
4649 			named_entry_unlock(named_entry);
4650 			if (target_copy_map != copy_map) {
4651 				vm_map_copy_discard(target_copy_map);
4652 				target_copy_map = VM_MAP_COPY_NULL;
4653 			}
4654 
4655 			if (kr != KERN_SUCCESS && !vmk_flags.vmf_overwrite) {
4656 				/* deallocate the contiguous range */
4657 				(void) vm_deallocate(target_map,
4658 				    map_addr,
4659 				    map_size);
4660 			}
4661 
4662 			return kr;
4663 		}
4664 
4665 		if (named_entry->is_object) {
4666 			unsigned int    access;
4667 			unsigned int    wimg_mode;
4668 
4669 			/* we are mapping a VM object */
4670 
4671 			access = named_entry->access;
4672 
4673 			if (vmk_flags.vmf_return_data_addr ||
4674 			    vmk_flags.vmf_return_4k_data_addr) {
4675 				offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4676 				if (vmk_flags.vmf_return_4k_data_addr) {
4677 					offset_in_mapping &= ~((signed)(0xFFF));
4678 				}
4679 				offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4680 				map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map));
4681 			}
4682 
4683 			object = vm_named_entry_to_vm_object(named_entry);
4684 			assert(object != VM_OBJECT_NULL);
4685 			vm_object_lock(object);
4686 			named_entry_unlock(named_entry);
4687 
4688 			vm_object_reference_locked(object);
4689 
4690 			wimg_mode = object->wimg_bits;
4691 			vm_prot_to_wimg(access, &wimg_mode);
4692 			if (object->wimg_bits != wimg_mode) {
4693 				vm_object_change_wimg_mode(object, wimg_mode);
4694 			}
4695 
4696 			vm_object_unlock(object);
4697 		} else {
4698 			panic("invalid VM named entry %p", named_entry);
4699 		}
4700 	} else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4701 		/*
4702 		 * JMM - This is temporary until we unify named entries
4703 		 * and raw memory objects.
4704 		 *
4705 		 * Detected fake ip_kotype for a memory object.  In
4706 		 * this case, the port isn't really a port at all, but
4707 		 * instead is just a raw memory object.
4708 		 */
4709 		if (vmk_flags.vmf_return_data_addr ||
4710 		    vmk_flags.vmf_return_4k_data_addr) {
4711 			panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4712 		}
4713 
4714 		object = memory_object_to_vm_object((memory_object_t)port);
4715 		if (object == VM_OBJECT_NULL) {
4716 			return KERN_INVALID_OBJECT;
4717 		}
4718 		vm_object_reference(object);
4719 
4720 		/* wait for object (if any) to be ready */
4721 		if (object != VM_OBJECT_NULL) {
4722 			if (is_kernel_object(object)) {
4723 				printf("Warning: Attempt to map kernel object"
4724 				    " by a non-private kernel entity\n");
4725 				return KERN_INVALID_OBJECT;
4726 			}
4727 			if (!object->pager_ready) {
4728 				vm_object_lock(object);
4729 
4730 				while (!object->pager_ready) {
4731 					vm_object_wait(object,
4732 					    VM_OBJECT_EVENT_PAGER_READY,
4733 					    THREAD_UNINT);
4734 					vm_object_lock(object);
4735 				}
4736 				vm_object_unlock(object);
4737 			}
4738 		}
4739 	} else {
4740 		return KERN_INVALID_OBJECT;
4741 	}
4742 
4743 	if (object != VM_OBJECT_NULL &&
4744 	    object->named &&
4745 	    object->pager != MEMORY_OBJECT_NULL &&
4746 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4747 		memory_object_t pager;
4748 		vm_prot_t       pager_prot;
4749 		kern_return_t   kr;
4750 
4751 		/*
4752 		 * For "named" VM objects, let the pager know that the
4753 		 * memory object is being mapped.  Some pagers need to keep
4754 		 * track of this, to know when they can reclaim the memory
4755 		 * object, for example.
4756 		 * VM calls memory_object_map() for each mapping (specifying
4757 		 * the protection of each mapping) and calls
4758 		 * memory_object_last_unmap() when all the mappings are gone.
4759 		 */
4760 		pager_prot = max_protection;
4761 		if (copy) {
4762 			/*
4763 			 * Copy-On-Write mapping: won't modify the
4764 			 * memory object.
4765 			 */
4766 			pager_prot &= ~VM_PROT_WRITE;
4767 		}
4768 		vm_object_lock(object);
4769 		pager = object->pager;
4770 		if (object->named &&
4771 		    pager != MEMORY_OBJECT_NULL &&
4772 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4773 			assert(object->pager_ready);
4774 			vm_object_mapping_wait(object, THREAD_UNINT);
4775 			vm_object_mapping_begin(object);
4776 			vm_object_unlock(object);
4777 
4778 			kr = memory_object_map(pager, pager_prot);
4779 			assert(kr == KERN_SUCCESS);
4780 
4781 			vm_object_lock(object);
4782 			vm_object_mapping_end(object);
4783 		}
4784 		vm_object_unlock(object);
4785 	}
4786 
4787 	/*
4788 	 *	Perform the copy if requested
4789 	 */
4790 
4791 	if (copy) {
4792 		vm_object_t             new_object;
4793 		vm_object_offset_t      new_offset;
4794 
4795 		result = vm_object_copy_strategically(object, offset,
4796 		    map_size,
4797 		    false,                                   /* forking */
4798 		    &new_object, &new_offset,
4799 		    &copy);
4800 
4801 
4802 		if (result == KERN_MEMORY_RESTART_COPY) {
4803 			boolean_t success;
4804 			boolean_t src_needs_copy;
4805 
4806 			/*
4807 			 * XXX
4808 			 * We currently ignore src_needs_copy.
4809 			 * This really is the issue of how to make
4810 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4811 			 * non-kernel users to use. Solution forthcoming.
4812 			 * In the meantime, since we don't allow non-kernel
4813 			 * memory managers to specify symmetric copy,
4814 			 * we won't run into problems here.
4815 			 */
4816 			new_object = object;
4817 			new_offset = offset;
4818 			success = vm_object_copy_quickly(new_object,
4819 			    new_offset,
4820 			    map_size,
4821 			    &src_needs_copy,
4822 			    &copy);
4823 			assert(success);
4824 			result = KERN_SUCCESS;
4825 		}
4826 		/*
4827 		 *	Throw away the reference to the
4828 		 *	original object, as it won't be mapped.
4829 		 */
4830 
4831 		vm_object_deallocate(object);
4832 
4833 		if (result != KERN_SUCCESS) {
4834 			return result;
4835 		}
4836 
4837 		object = new_object;
4838 		offset = new_offset;
4839 	}
4840 
4841 	/*
4842 	 * If non-kernel users want to try to prefault pages, the mapping and prefault
4843 	 * needs to be atomic.
4844 	 */
4845 	kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4846 	vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4847 
4848 #if __arm64__
4849 	if (fourk) {
4850 		/* map this object in a "4K" pager */
4851 		result = vm_map_enter_fourk(target_map,
4852 		    &map_addr,
4853 		    map_size,
4854 		    (vm_map_offset_t) mask,
4855 		    vmk_flags,
4856 		    object,
4857 		    offset,
4858 		    copy,
4859 		    cur_protection,
4860 		    max_protection,
4861 		    inheritance);
4862 	} else
4863 #endif /* __arm64__ */
4864 	{
4865 		result = vm_map_enter(target_map,
4866 		    &map_addr, map_size,
4867 		    (vm_map_offset_t)mask,
4868 		    vmk_flags,
4869 		    object, offset,
4870 		    copy,
4871 		    cur_protection, max_protection,
4872 		    inheritance);
4873 	}
4874 	if (result != KERN_SUCCESS) {
4875 		vm_object_deallocate(object);
4876 	}
4877 
4878 	/*
4879 	 * Try to prefault, and do not forget to release the vm map lock.
4880 	 */
4881 	if (result == KERN_SUCCESS && try_prefault) {
4882 		mach_vm_address_t va = map_addr;
4883 		kern_return_t kr = KERN_SUCCESS;
4884 		unsigned int i = 0;
4885 		int pmap_options;
4886 
4887 		pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4888 		if (object->internal) {
4889 			pmap_options |= PMAP_OPTIONS_INTERNAL;
4890 		}
4891 
4892 		for (i = 0; i < page_list_count; ++i) {
4893 			if (!UPL_VALID_PAGE(page_list, i)) {
4894 				if (kernel_prefault) {
4895 					assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4896 					result = KERN_MEMORY_ERROR;
4897 					break;
4898 				}
4899 			} else {
4900 				/*
4901 				 * If this function call failed, we should stop
4902 				 * trying to optimize, other calls are likely
4903 				 * going to fail too.
4904 				 *
4905 				 * We are not gonna report an error for such
4906 				 * failure though. That's an optimization, not
4907 				 * something critical.
4908 				 */
4909 				kr = pmap_enter_options(target_map->pmap,
4910 				    va, UPL_PHYS_PAGE(page_list, i),
4911 				    cur_protection, VM_PROT_NONE,
4912 				    0, TRUE, pmap_options, NULL, PMAP_MAPPING_TYPE_INFER);
4913 				if (kr != KERN_SUCCESS) {
4914 					OSIncrementAtomic64(&vm_prefault_nb_bailout);
4915 					if (kernel_prefault) {
4916 						result = kr;
4917 					}
4918 					break;
4919 				}
4920 				OSIncrementAtomic64(&vm_prefault_nb_pages);
4921 			}
4922 
4923 			/* Next virtual address */
4924 			va += PAGE_SIZE;
4925 		}
4926 		if (vmk_flags.vmkf_keep_map_locked) {
4927 			vm_map_unlock(target_map);
4928 		}
4929 	}
4930 
4931 	if (vmk_flags.vmf_return_data_addr ||
4932 	    vmk_flags.vmf_return_4k_data_addr) {
4933 		*address = map_addr + offset_in_mapping;
4934 	} else {
4935 		*address = map_addr;
4936 	}
4937 	return result;
4938 }
4939 
4940 kern_return_t
vm_map_enter_mem_object(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)4941 vm_map_enter_mem_object(
4942 	vm_map_t                target_map,
4943 	vm_map_offset_t         *address,
4944 	vm_map_size_t           initial_size,
4945 	vm_map_offset_t         mask,
4946 	vm_map_kernel_flags_t   vmk_flags,
4947 	ipc_port_t              port,
4948 	vm_object_offset_t      offset,
4949 	boolean_t               copy,
4950 	vm_prot_t               cur_protection,
4951 	vm_prot_t               max_protection,
4952 	vm_inherit_t            inheritance)
4953 {
4954 	kern_return_t ret;
4955 
4956 	/* range_id is set by vm_map_enter_mem_object_helper */
4957 	ret = vm_map_enter_mem_object_helper(target_map,
4958 	    address,
4959 	    initial_size,
4960 	    mask,
4961 	    vmk_flags,
4962 	    port,
4963 	    offset,
4964 	    copy,
4965 	    cur_protection,
4966 	    max_protection,
4967 	    inheritance,
4968 	    NULL,
4969 	    0);
4970 
4971 #if KASAN
4972 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4973 		kasan_notify_address(*address, initial_size);
4974 	}
4975 #endif
4976 
4977 	return ret;
4978 }
4979 
4980 kern_return_t
vm_map_enter_mem_object_prefault(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_t offset,vm_prot_t cur_protection,vm_prot_t max_protection,upl_page_list_ptr_t page_list,unsigned int page_list_count)4981 vm_map_enter_mem_object_prefault(
4982 	vm_map_t                target_map,
4983 	vm_map_offset_t         *address,
4984 	vm_map_size_t           initial_size,
4985 	vm_map_offset_t         mask,
4986 	vm_map_kernel_flags_t   vmk_flags,
4987 	ipc_port_t              port,
4988 	vm_object_offset_t      offset,
4989 	vm_prot_t               cur_protection,
4990 	vm_prot_t               max_protection,
4991 	upl_page_list_ptr_t     page_list,
4992 	unsigned int            page_list_count)
4993 {
4994 	kern_return_t ret;
4995 
4996 	/* range_id is set by vm_map_enter_mem_object_helper */
4997 	ret = vm_map_enter_mem_object_helper(target_map,
4998 	    address,
4999 	    initial_size,
5000 	    mask,
5001 	    vmk_flags,
5002 	    port,
5003 	    offset,
5004 	    FALSE,
5005 	    cur_protection,
5006 	    max_protection,
5007 	    VM_INHERIT_DEFAULT,
5008 	    page_list,
5009 	    page_list_count);
5010 
5011 #if KASAN
5012 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
5013 		kasan_notify_address(*address, initial_size);
5014 	}
5015 #endif
5016 
5017 	return ret;
5018 }
5019 
5020 
/*
 * vm_map_enter_mem_object_control:
 *
 * Map the VM object designated by the memory object control "control"
 * into "target_map".  Unlike vm_map_enter_mem_object(), the object is
 * named directly via its pager control rather than through a named
 * entry or naked memory object port.
 *
 * "copy" requests that the mapping be backed by a copy of the object
 * (copy-on-write where the copy strategy allows it).
 *
 * On success, *address is updated with the actual mapping address.
 * Returns KERN_INVALID_ARGUMENT / KERN_INVALID_OBJECT on bad input,
 * otherwise the result of vm_map_enter()/vm_map_enter_fourk().
 */
kern_return_t
vm_map_enter_mem_object_control(
	vm_map_t                target_map,
	vm_map_offset_t         *address,
	vm_map_size_t           initial_size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	memory_object_control_t control,
	vm_object_offset_t      offset,
	boolean_t               copy,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance)
{
	vm_map_address_t        map_addr;
	vm_map_size_t           map_size;
	vm_object_t             object;
	vm_object_size_t        size;
	kern_return_t           result;
	memory_object_t         pager;
	vm_prot_t               pager_prot;
	kern_return_t           kr;
#if __arm64__
	boolean_t               fourk = vmk_flags.vmkf_fourk;
#endif /* __arm64__ */

	/*
	 * Check arguments for validity
	 */
	if ((target_map == VM_MAP_NULL) ||
	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
	    (inheritance > VM_INHERIT_LAST_VALID) ||
	    initial_size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

#if __arm64__
	/* "4K" mappings only make sense in a map with larger pages. */
	if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) {
		fourk = FALSE;
	}

	if (fourk) {
		map_addr = vm_map_trunc_page(*address,
		    FOURK_PAGE_MASK);
		map_size = vm_map_round_page(initial_size,
		    FOURK_PAGE_MASK);
	} else
#endif /* __arm64__ */
	{
		/* align the requested range to the target map's page size */
		map_addr = vm_map_trunc_page(*address,
		    VM_MAP_PAGE_MASK(target_map));
		map_size = vm_map_round_page(initial_size,
		    VM_MAP_PAGE_MASK(target_map));
	}
	size = vm_object_round_page(initial_size);

	object = memory_object_control_to_vm_object(control);

	if (object == VM_OBJECT_NULL) {
		return KERN_INVALID_OBJECT;
	}

	if (is_kernel_object(object)) {
		printf("Warning: Attempt to map kernel object"
		    " by a non-private kernel entity\n");
		return KERN_INVALID_OBJECT;
	}

	/* take an extra reference on the object while we map it */
	vm_object_lock(object);
	object->ref_count++;

	/*
	 * For "named" VM objects, let the pager know that the
	 * memory object is being mapped.  Some pagers need to keep
	 * track of this, to know when they can reclaim the memory
	 * object, for example.
	 * VM calls memory_object_map() for each mapping (specifying
	 * the protection of each mapping) and calls
	 * memory_object_last_unmap() when all the mappings are gone.
	 */
	pager_prot = max_protection;
	if (copy) {
		/* a COW mapping will never modify the memory object */
		pager_prot &= ~VM_PROT_WRITE;
	}
	pager = object->pager;
	if (object->named &&
	    pager != MEMORY_OBJECT_NULL &&
	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
		assert(object->pager_ready);
		/* serialize with other mappers, then drop the object lock
		 * around the (possibly blocking) upcall to the pager */
		vm_object_mapping_wait(object, THREAD_UNINT);
		vm_object_mapping_begin(object);
		vm_object_unlock(object);

		kr = memory_object_map(pager, pager_prot);
		assert(kr == KERN_SUCCESS);

		vm_object_lock(object);
		vm_object_mapping_end(object);
	}
	vm_object_unlock(object);

	/*
	 *	Perform the copy if requested
	 */

	if (copy) {
		vm_object_t             new_object;
		vm_object_offset_t      new_offset;

		result = vm_object_copy_strategically(object, offset, size,
		    false,                                   /* forking */
		    &new_object, &new_offset,
		    &copy);


		if (result == KERN_MEMORY_RESTART_COPY) {
			boolean_t success;
			boolean_t src_needs_copy;

			/*
			 * XXX
			 * We currently ignore src_needs_copy.
			 * This really is the issue of how to make
			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
			 * non-kernel users to use. Solution forthcoming.
			 * In the meantime, since we don't allow non-kernel
			 * memory managers to specify symmetric copy,
			 * we won't run into problems here.
			 */
			new_object = object;
			new_offset = offset;
			success = vm_object_copy_quickly(new_object,
			    new_offset, size,
			    &src_needs_copy,
			    &copy);
			assert(success);
			result = KERN_SUCCESS;
		}
		/*
		 *	Throw away the reference to the
		 *	original object, as it won't be mapped.
		 */

		vm_object_deallocate(object);

		if (result != KERN_SUCCESS) {
			return result;
		}

		/* map the copy instead of the original */
		object = new_object;
		offset = new_offset;
	}

#if __arm64__
	if (fourk) {
		/* map this object through a "4K" pager */
		result = vm_map_enter_fourk(target_map,
		    &map_addr,
		    map_size,
		    (vm_map_offset_t)mask,
		    vmk_flags,
		    object, offset,
		    copy,
		    cur_protection, max_protection,
		    inheritance);
	} else
#endif /* __arm64__ */
	{
		result = vm_map_enter(target_map,
		    &map_addr, map_size,
		    (vm_map_offset_t)mask,
		    vmk_flags,
		    object, offset,
		    copy,
		    cur_protection, max_protection,
		    inheritance);
	}
	if (result != KERN_SUCCESS) {
		/* drop the reference taken above: the object wasn't mapped */
		vm_object_deallocate(object);
	}
	*address = map_addr;

	return result;
}
5205 
5206 
5207 #if     VM_CPM
5208 
5209 #ifdef MACH_ASSERT
5210 extern pmap_paddr_t     avail_start, avail_end;
5211 #endif
5212 
5213 /*
5214  *	Allocate memory in the specified map, with the caveat that
5215  *	the memory is physically contiguous.  This call may fail
5216  *	if the system can't find sufficient contiguous memory.
5217  *	This call may cause or lead to heart-stopping amounts of
5218  *	paging activity.
5219  *
5220  *	Memory obtained from this call should be freed in the
5221  *	normal way, viz., via vm_deallocate.
5222  */
5223 kern_return_t
vm_map_enter_cpm(vm_map_t map,vm_map_offset_t * addr,vm_map_size_t size,vm_map_kernel_flags_t vmk_flags)5224 vm_map_enter_cpm(
5225 	vm_map_t                map,
5226 	vm_map_offset_t        *addr,
5227 	vm_map_size_t           size,
5228 	vm_map_kernel_flags_t   vmk_flags)
5229 {
5230 	vm_object_t             cpm_obj;
5231 	pmap_t                  pmap;
5232 	vm_page_t               m, pages;
5233 	kern_return_t           kr;
5234 	vm_map_offset_t         va, start, end, offset;
5235 #if     MACH_ASSERT
5236 	vm_map_offset_t         prev_addr = 0;
5237 #endif  /* MACH_ASSERT */
5238 	uint8_t                 object_lock_type = 0;
5239 
5240 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
5241 		/* XXX TODO4K do we need to support this? */
5242 		*addr = 0;
5243 		return KERN_NOT_SUPPORTED;
5244 	}
5245 
5246 	if (size == 0) {
5247 		*addr = 0;
5248 		return KERN_SUCCESS;
5249 	}
5250 	if (vmk_flags.vmf_fixed) {
5251 		*addr = vm_map_trunc_page(*addr,
5252 		    VM_MAP_PAGE_MASK(map));
5253 	} else {
5254 		*addr = vm_map_min(map);
5255 	}
5256 	size = vm_map_round_page(size,
5257 	    VM_MAP_PAGE_MASK(map));
5258 
5259 	/*
5260 	 * LP64todo - cpm_allocate should probably allow
5261 	 * allocations of >4GB, but not with the current
5262 	 * algorithm, so just cast down the size for now.
5263 	 */
5264 	if (size > VM_MAX_ADDRESS) {
5265 		return KERN_RESOURCE_SHORTAGE;
5266 	}
5267 	if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size),
5268 	    &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) {
5269 		return kr;
5270 	}
5271 
5272 	cpm_obj = vm_object_allocate((vm_object_size_t)size);
5273 	assert(cpm_obj != VM_OBJECT_NULL);
5274 	assert(cpm_obj->internal);
5275 	assert(cpm_obj->vo_size == (vm_object_size_t)size);
5276 	assert(cpm_obj->can_persist == FALSE);
5277 	assert(cpm_obj->pager_created == FALSE);
5278 	assert(cpm_obj->pageout == FALSE);
5279 	assert(cpm_obj->shadow == VM_OBJECT_NULL);
5280 
5281 	/*
5282 	 *	Insert pages into object.
5283 	 */
5284 	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5285 	vm_object_lock(cpm_obj);
5286 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
5287 		m = pages;
5288 		pages = NEXT_PAGE(m);
5289 		*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
5290 
5291 		assert(!m->vmp_gobbled);
5292 		assert(!m->vmp_wanted);
5293 		assert(!m->vmp_pageout);
5294 		assert(!m->vmp_tabled);
5295 		assert(VM_PAGE_WIRED(m));
5296 		assert(m->vmp_busy);
5297 		assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT));
5298 
5299 		m->vmp_busy = FALSE;
5300 		vm_page_insert(m, cpm_obj, offset);
5301 	}
5302 	assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
5303 	vm_object_unlock(cpm_obj);
5304 
5305 	/*
5306 	 *	Hang onto a reference on the object in case a
5307 	 *	multi-threaded application for some reason decides
5308 	 *	to deallocate the portion of the address space into
5309 	 *	which we will insert this object.
5310 	 *
5311 	 *	Unfortunately, we must insert the object now before
5312 	 *	we can talk to the pmap module about which addresses
5313 	 *	must be wired down.  Hence, the race with a multi-
5314 	 *	threaded app.
5315 	 */
5316 	vm_object_reference(cpm_obj);
5317 
5318 	/*
5319 	 *	Insert object into map.
5320 	 */
5321 
5322 	kr = vm_map_enter(
5323 		map,
5324 		addr,
5325 		size,
5326 		(vm_map_offset_t)0,
5327 		vmk_flags,
5328 		cpm_obj,
5329 		(vm_object_offset_t)0,
5330 		FALSE,
5331 		VM_PROT_ALL,
5332 		VM_PROT_ALL,
5333 		VM_INHERIT_DEFAULT);
5334 
5335 	if (kr != KERN_SUCCESS) {
5336 		/*
5337 		 *	A CPM object doesn't have can_persist set,
5338 		 *	so all we have to do is deallocate it to
5339 		 *	free up these pages.
5340 		 */
5341 		assert(cpm_obj->pager_created == FALSE);
5342 		assert(cpm_obj->can_persist == FALSE);
5343 		assert(cpm_obj->pageout == FALSE);
5344 		assert(cpm_obj->shadow == VM_OBJECT_NULL);
5345 		vm_object_deallocate(cpm_obj); /* kill acquired ref */
5346 		vm_object_deallocate(cpm_obj); /* kill creation ref */
5347 	}
5348 
5349 	/*
5350 	 *	Inform the physical mapping system that the
5351 	 *	range of addresses may not fault, so that
5352 	 *	page tables and such can be locked down as well.
5353 	 */
5354 	start = *addr;
5355 	end = start + size;
5356 	pmap = vm_map_pmap(map);
5357 	pmap_pageable(pmap, start, end, FALSE);
5358 
5359 	/*
5360 	 *	Enter each page into the pmap, to avoid faults.
5361 	 *	Note that this loop could be coded more efficiently,
5362 	 *	if the need arose, rather than looking up each page
5363 	 *	again.
5364 	 */
5365 	for (offset = 0, va = start; offset < size;
5366 	    va += PAGE_SIZE, offset += PAGE_SIZE) {
5367 		int type_of_fault;
5368 
5369 		vm_object_lock(cpm_obj);
5370 		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5371 		assert(m != VM_PAGE_NULL);
5372 
5373 		vm_page_zero_fill(m);
5374 
5375 		type_of_fault = DBG_ZERO_FILL_FAULT;
5376 
5377 		vm_fault_enter(m, pmap, va,
5378 		    PAGE_SIZE, 0,
5379 		    VM_PROT_ALL, VM_PROT_WRITE,
5380 		    VM_PAGE_WIRED(m),
5381 		    FALSE,                             /* change_wiring */
5382 		    VM_KERN_MEMORY_NONE,                             /* tag - not wiring */
5383 		    FALSE,                             /* cs_bypass */
5384 		    0,                                 /* user_tag */
5385 		    0,                             /* pmap_options */
5386 		    NULL,                              /* need_retry */
5387 		    &type_of_fault,
5388 		    &object_lock_type);                 /* Exclusive lock mode. Will remain unchanged.*/
5389 
5390 		vm_object_unlock(cpm_obj);
5391 	}
5392 
5393 #if     MACH_ASSERT
5394 	/*
5395 	 *	Verify ordering in address space.
5396 	 */
5397 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
5398 		vm_object_lock(cpm_obj);
5399 		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5400 		vm_object_unlock(cpm_obj);
5401 		if (m == VM_PAGE_NULL) {
5402 			panic("vm_allocate_cpm:  obj %p off 0x%llx no page",
5403 			    cpm_obj, (uint64_t)offset);
5404 		}
5405 		assert(m->vmp_tabled);
5406 		assert(!m->vmp_busy);
5407 		assert(!m->vmp_wanted);
5408 		assert(!m->vmp_fictitious);
5409 		assert(!m->vmp_private);
5410 		assert(!m->vmp_absent);
5411 		assert(!m->vmp_cleaning);
5412 		assert(!m->vmp_laundry);
5413 		assert(!m->vmp_precious);
5414 		assert(!m->vmp_clustered);
5415 		if (offset != 0) {
5416 			if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
5417 				printf("start 0x%llx end 0x%llx va 0x%llx\n",
5418 				    (uint64_t)start, (uint64_t)end, (uint64_t)va);
5419 				printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset);
5420 				printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr);
5421 				panic("vm_allocate_cpm:  pages not contig!");
5422 			}
5423 		}
5424 		prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
5425 	}
5426 #endif  /* MACH_ASSERT */
5427 
5428 	vm_object_deallocate(cpm_obj); /* kill extra ref */
5429 
5430 	return kr;
5431 }
5432 
5433 
5434 #else   /* VM_CPM */
5435 
5436 /*
5437  *	Interface is defined in all cases, but unless the kernel
5438  *	is built explicitly for this option, the interface does
5439  *	nothing.
5440  */
5441 
5442 kern_return_t
vm_map_enter_cpm(__unused vm_map_t map,__unused vm_map_offset_t * addr,__unused vm_map_size_t size,__unused vm_map_kernel_flags_t vmk_flags)5443 vm_map_enter_cpm(
5444 	__unused vm_map_t                map,
5445 	__unused vm_map_offset_t        *addr,
5446 	__unused vm_map_size_t           size,
5447 	__unused vm_map_kernel_flags_t   vmk_flags)
5448 {
5449 	return KERN_FAILURE;
5450 }
5451 #endif /* VM_CPM */
5452 
5453 /* Not used without nested pmaps */
5454 #ifndef NO_NESTED_PMAP
5455 /*
5456  * Clip and unnest a portion of a nested submap mapping.
5457  */
5458 
5459 
/*
 * vm_map_clip_unnest:
 *
 * Clip "entry" (a pmap-nested submap mapping) to the range
 * [start_unnest, end_unnest) and detach the shared page tables for
 * that range, so the map falls back to its own (non-nested) mappings.
 * Called with the map locked; the unnest range must lie within the
 * entry (after platform adjustment).
 */
static void
vm_map_clip_unnest(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t start_unnest,
	vm_map_offset_t end_unnest)
{
	/* remember the caller's range for diagnostics below */
	vm_map_offset_t old_start_unnest = start_unnest;
	vm_map_offset_t old_end_unnest = end_unnest;

	/* only meaningful for a submap entry that shares its pmap */
	assert(entry->is_sub_map);
	assert(VME_SUBMAP(entry) != NULL);
	assert(entry->use_pmap);

	/*
	 * Query the platform for the optimal unnest range.
	 * DRK: There's some duplication of effort here, since
	 * callers may have adjusted the range to some extent. This
	 * routine was introduced to support 1GiB subtree nesting
	 * for x86 platforms, which can also nest on 2MiB boundaries
	 * depending on size/alignment.
	 */
	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
		/* NOTE(review): a nonzero return appears to mean the
		 * platform widened the range — log it as "badness" */
		assert(VME_SUBMAP(entry)->is_nested_map);
		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
		log_unnest_badness(map,
		    old_start_unnest,
		    old_end_unnest,
		    VME_SUBMAP(entry)->is_nested_map,
		    (entry->vme_start +
		    VME_SUBMAP(entry)->lowest_unnestable_start -
		    VME_OFFSET(entry)));
	}

	/* the (possibly adjusted) range must be contained in the entry */
	if (entry->vme_start > start_unnest ||
	    entry->vme_end < end_unnest) {
		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
		    "bad nested entry: start=0x%llx end=0x%llx\n",
		    (long long)start_unnest, (long long)end_unnest,
		    (long long)entry->vme_start, (long long)entry->vme_end);
	}

	/*
	 * Clip the entry down to exactly [start_unnest, end_unnest),
	 * refreshing the map's free-space bookkeeping after each clip.
	 */
	if (start_unnest > entry->vme_start) {
		_vm_map_clip_start(&map->hdr,
		    entry,
		    start_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
	if (entry->vme_end > end_unnest) {
		_vm_map_clip_end(&map->hdr,
		    entry,
		    end_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}

	/* detach the shared (nested) page tables for this range */
	pmap_unnest(map->pmap,
	    entry->vme_start,
	    entry->vme_end - entry->vme_start);
	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
		/* clean up parent map/maps */
		vm_map_submap_pmap_clean(
			map, entry->vme_start,
			entry->vme_end,
			VME_SUBMAP(entry),
			VME_OFFSET(entry));
	}
	/* entry now uses its own pmap mappings, not the nested ones */
	entry->use_pmap = FALSE;
	if ((map->pmap != kernel_pmap) &&
	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
	}
}
5540 #endif  /* NO_NESTED_PMAP */
5541 
5542 __abortlike
5543 static void
__vm_map_clip_atomic_entry_panic(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t where)5544 __vm_map_clip_atomic_entry_panic(
5545 	vm_map_t        map,
5546 	vm_map_entry_t  entry,
5547 	vm_map_offset_t where)
5548 {
5549 	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5550 	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5551 	    (uint64_t)entry->vme_start,
5552 	    (uint64_t)entry->vme_end,
5553 	    (uint64_t)where);
5554 }
5555 
5556 /*
5557  *	vm_map_clip_start:	[ internal use only ]
5558  *
5559  *	Asserts that the given entry begins at or after
5560  *	the specified address; if necessary,
5561  *	it splits the entry into two.
5562  */
void
vm_map_clip_start(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t startaddr)
{
#ifndef NO_NESTED_PMAP
	/*
	 * A pmap-nested submap mapping cannot be clipped while its page
	 * tables are shared: unnest the minimum platform-sized region
	 * around "startaddr" first.
	 */
	if (entry->is_sub_map &&
	    entry->use_pmap &&
	    startaddr >= entry->vme_start) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure "startaddr" is no longer in a nested range
		 * before we clip.  Unnest only the minimum range the platform
		 * can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (startaddr > entry->vme_start) {
		/*
		 * NOTE(review): for phys_contiguous objects the whole pmap
		 * range is torn down before clipping — presumably to be
		 * refaulted afterwards; confirm against vm_fault path.
		 */
		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    VME_OBJECT(entry)->phys_contiguous) {
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		/* atomic entries must never be split */
		if (entry->vme_atomic) {
			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
		}

		DTRACE_VM5(
			vm_map_clip_start,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, startaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_start(&map->hdr, entry, startaddr);
		/* the entry layout changed: refresh the free-space hint */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5615 
5616 
/*
 *	vm_map_copy_clip_start:	[ internal use only ]
 *
 *	Clip the given copy-map entry at "startaddr", splitting it in
 *	two when "startaddr" falls strictly inside the entry; no-op
 *	when the entry already starts at or after "startaddr".
 */
#define vm_map_copy_clip_start(copy, entry, startaddr) \
	MACRO_BEGIN \
	if ((startaddr) > (entry)->vme_start) \
	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
	MACRO_END
5622 
5623 /*
5624  *	This routine is called only when it is known that
5625  *	the entry must be split.
5626  */
static void
_vm_map_clip_start(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         start)
{
	vm_map_entry_t  new_entry;

	/*
	 *	Split off the front portion --
	 *	note that we must insert the new
	 *	entry BEFORE this one, so that
	 *	this entry has the specified starting
	 *	address.
	 */

	/* a map-aligned entry may only be split on a map page boundary */
	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(start,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* clone the entry: the clone becomes the front [vme_start, start) */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	new_entry->vme_end = start;
	assert(new_entry->vme_start < new_entry->vme_end);
	/* advance the original entry's object offset past the front piece */
	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
	if (__improbable(start >= entry->vme_end)) {
		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
	}
	assert(start < entry->vme_end);
	entry->vme_start = start;

#if VM_BTLOG_TAGS
	/* the clone shares the tag backtrace ref: take an extra retain */
	if (new_entry->vme_kernel_object) {
		btref_retain(new_entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */

	/* link the front piece in just before the original entry */
	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);

	/* both halves now reference the same submap/object: add a ref */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5674 
5675 
5676 /*
5677  *	vm_map_clip_end:	[ internal use only ]
5678  *
5679  *	Asserts that the given entry ends at or before
5680  *	the specified address; if necessary,
5681  *	it splits the entry into two.
5682  */
void
vm_map_clip_end(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t endaddr)
{
	if (endaddr > entry->vme_end) {
		/*
		 * Within the scope of this clipping, limit "endaddr" to
		 * the end of this map entry...
		 */
		endaddr = entry->vme_end;
	}
#ifndef NO_NESTED_PMAP
	if (entry->is_sub_map && entry->use_pmap) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure the range between the start of this entry and
		 * the new "endaddr" is no longer nested before we clip.
		 * Unnest only the minimum range the platform can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = entry->vme_start;
		/* Round "endaddr" up to the platform's nesting granularity. */
		end_unnest =
		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
		    ~(pmap_shared_region_size_min(map->pmap) - 1);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (endaddr < entry->vme_end) {
		/*
		 * A physically contiguous object cannot be split in the
		 * pmap: drop the whole mapping and let it be re-faulted
		 * with the new entry boundaries.
		 */
		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    VME_OBJECT(entry)->phys_contiguous) {
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		/* Entries marked "atomic" must never be clipped. */
		if (entry->vme_atomic) {
			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
		}
		DTRACE_VM5(
			vm_map_clip_end,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, endaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_end(&map->hdr, entry, endaddr);
		/* Clipping may have invalidated the free-space hints. */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5741 
5742 
/*
 *	vm_map_copy_clip_end: ensure "entry" (an entry in a vm_map_copy's
 *	list) ends at "endaddr", splitting it if necessary.  Copy lists
 *	have no pmap, no nesting and no free-space hints to maintain, so
 *	the raw header clipper is sufficient (compare vm_map_clip_end).
 */
#define vm_map_copy_clip_end(copy, entry, endaddr) \
	MACRO_BEGIN \
	if ((endaddr) < (entry)->vme_end) \
	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
	MACRO_END
5748 
5749 /*
5750  *	This routine is called only when it is known that
5751  *	the entry must be split.
5752  */
static void
_vm_map_clip_end(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         end)
{
	vm_map_entry_t  new_entry;

	/*
	 *	Create a new entry and insert it
	 *	AFTER the specified entry
	 */

	/* A map-aligned entry may only be split on a map page boundary. */
	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(end,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* The new (tail) entry begins life as a full copy of "entry". */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	/* Caller guarantees "end" lies strictly inside the entry. */
	if (__improbable(end <= entry->vme_start)) {
		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
	}
	assert(entry->vme_start < end);
	new_entry->vme_start = entry->vme_end = end;
	/*
	 * Advance the tail entry's object offset past the retained front
	 * part.  Note entry->vme_start is still the original start here.
	 */
	VME_OFFSET_SET(new_entry,
	    VME_OFFSET(new_entry) + (end - entry->vme_start));
	assert(new_entry->vme_start < new_entry->vme_end);

#if VM_BTLOG_TAGS
	/* Both halves now share the tag backtrace; take another reference. */
	if (new_entry->vme_kernel_object) {
		btref_retain(new_entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */

	_vm_map_store_entry_link(map_header, entry, new_entry);

	/*
	 * The new entry needs its own reference on the backing submap
	 * or VM object that both halves now point to.
	 */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5797 
5798 
5799 /*
5800  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
5801  *
5802  *	Asserts that the starting and ending region
5803  *	addresses fall within the valid range of the map.
5804  */
/*
 *	Note: "start" and "end" are clamped in place (not rejected) and
 *	each argument is evaluated more than once, so callers must pass
 *	side-effect-free lvalues.
 */
#define VM_MAP_RANGE_CHECK(map, start, end)     \
	MACRO_BEGIN                             \
	if (start < vm_map_min(map))            \
	        start = vm_map_min(map);        \
	if (end > vm_map_max(map))              \
	        end = vm_map_max(map);          \
	if (start > end)                        \
	        start = end;                    \
	MACRO_END
5814 
5815 /*
5816  *	vm_map_range_check:	[ internal use only ]
5817  *
5818  *	Check that the region defined by the specified start and
5819  *	end addresses are wholly contained within a single map
 *	entry or set of adjacent map entries of the specified map,
5821  *	i.e. the specified region contains no unmapped space.
5822  *	If any or all of the region is unmapped, FALSE is returned.
5823  *	Otherwise, TRUE is returned and if the output argument 'entry'
5824  *	is not NULL it points to the map entry containing the start
5825  *	of the region.
5826  *
5827  *	The map is locked for reading on entry and is left locked.
5828  */
5829 static boolean_t
vm_map_range_check(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t * entry)5830 vm_map_range_check(
5831 	vm_map_t                map,
5832 	vm_map_offset_t         start,
5833 	vm_map_offset_t         end,
5834 	vm_map_entry_t          *entry)
5835 {
5836 	vm_map_entry_t          cur;
5837 	vm_map_offset_t         prev;
5838 
5839 	/*
5840 	 *      Basic sanity checks first
5841 	 */
5842 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5843 		return FALSE;
5844 	}
5845 
5846 	/*
5847 	 *      Check first if the region starts within a valid
5848 	 *	mapping for the map.
5849 	 */
5850 	if (!vm_map_lookup_entry(map, start, &cur)) {
5851 		return FALSE;
5852 	}
5853 
5854 	/*
5855 	 *	Optimize for the case that the region is contained
5856 	 *	in a single map entry.
5857 	 */
5858 	if (entry != (vm_map_entry_t *) NULL) {
5859 		*entry = cur;
5860 	}
5861 	if (end <= cur->vme_end) {
5862 		return TRUE;
5863 	}
5864 
5865 	/*
5866 	 *      If the region is not wholly contained within a
5867 	 *      single entry, walk the entries looking for holes.
5868 	 */
5869 	prev = cur->vme_end;
5870 	cur = cur->vme_next;
5871 	while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5872 		if (end <= cur->vme_end) {
5873 			return TRUE;
5874 		}
5875 		prev = cur->vme_end;
5876 		cur = cur->vme_next;
5877 	}
5878 	return FALSE;
5879 }
5880 
5881 /*
5882  *	vm_map_protect:
5883  *
5884  *	Sets the protection of the specified address
5885  *	region in the target map.  If "set_max" is
5886  *	specified, the maximum protection is to be set;
5887  *	otherwise, only the current protection is affected.
5888  */
kern_return_t
vm_map_protect(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_prot_t       new_prot,
	boolean_t       set_max)
{
	vm_map_entry_t                  current;
	vm_map_offset_t                 prev;
	vm_map_entry_t                  entry;
	vm_prot_t                       new_max;
	int                             pmap_options = 0;
	kern_return_t                   kr;

	/* Reject ranges whose size arithmetic would wrap around. */
	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * VM_PROT_COPY: replace the range with a copy-on-write remapping
	 * of itself (adding WRITE to max protections) before applying the
	 * remaining protection bits below.
	 */
	if (new_prot & VM_PROT_COPY) {
		vm_map_offset_t         new_start;
		vm_prot_t               cur_prot, max_prot;
		vm_map_kernel_flags_t   kflags;

		/* LP64todo - see below */
		if (start >= map->max_offset) {
			return KERN_INVALID_ADDRESS;
		}

		/* W^X policy: refuse write+exec up-front where enforced. */
		if ((new_prot & VM_PROT_ALLEXEC) &&
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if XNU_TARGET_OS_OSX && __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* XNU_TARGET_OS_OSX && __arm64__ */
		    ) &&
		    VM_MAP_POLICY_WX_FAIL(map)) {
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) start,
			    uint64_t, (uint64_t) end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__, __LINE__,
#if DEVELOPMENT || DEBUG
			    (uint64_t)start,
			    (uint64_t)end,
#else /* DEVELOPMENT || DEBUG */
			    (uint64_t)0,
			    (uint64_t)0,
#endif /* DEVELOPMENT || DEBUG */
			    new_prot);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * Let vm_map_remap_extract() know that it will need to:
		 * + make a copy of the mapping
		 * + add VM_PROT_WRITE to the max protections
		 * + remove any protections that are no longer allowed from the
		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
		 *   example).
		 * Note that "max_prot" is an IN/OUT parameter only for this
		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
		 * only.
		 */
		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
		cur_prot = VM_PROT_NONE;
		kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
		kflags.vmkf_remap_prot_copy = true;
		kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
		new_start = start;
		kr = vm_map_remap(map,
		    &new_start,
		    end - start,
		    0, /* mask */
		    kflags,
		    map,
		    start,
		    TRUE, /* copy-on-write remapping! */
		    &cur_prot, /* IN/OUT */
		    &max_prot, /* IN/OUT */
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		/* The copy is done; apply the remaining bits normally. */
		new_prot &= ~VM_PROT_COPY;
	}

	vm_map_lock(map);

	/* LP64todo - remove this check when vm_map_commpage64()
	 * no longer has to stuff in a map_entry for the commpage
	 * above the map's max_offset.
	 */
	if (start >= map->max_offset) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	while (1) {
		/*
		 *      Lookup the entry.  If it doesn't start in a valid
		 *	entry, return an error.
		 */
		if (!vm_map_lookup_entry(map, start, &entry)) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
			start = SUPERPAGE_ROUND_DOWN(start);
			continue;
		}
		break;
	}
	/* Superpage entries can only be protected as a whole. */
	if (entry->superpage_size) {
		end = SUPERPAGE_ROUND_UP(end);
	}

	/*
	 *	Make a first pass to check for protection and address
	 *	violations.
	 */

	current = entry;
	prev = current->vme_start;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		/*
		 * If there is a hole, return an error.
		 */
		if (current->vme_start != prev) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		new_max = current->max_protection;

#if defined(__x86_64__)
		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
		}
#elif CODE_SIGNING_MONITOR
		if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
			new_max |= VM_PROT_EXECUTE;
		}
#endif
		/* The request must fit within the entry's maximum protection. */
		if ((new_prot & new_max) != new_prot) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

		if (current->used_for_jit &&
		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

#if __arm64e__
		/* Disallow remapping hw assisted TPRO mappings */
		if (current->used_for_tpro) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}
#endif /* __arm64e__ */


		/* W^X policy: log, strip exec bits, and possibly fail. */
		if ((new_prot & VM_PROT_WRITE) &&
		    (new_prot & VM_PROT_ALLEXEC) &&
#if XNU_TARGET_OS_OSX
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
		    ) &&
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
#endif
		    !(current->used_for_jit)) {
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) current->vme_start,
			    uint64_t, (uint64_t) current->vme_end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__, __LINE__,
#if DEVELOPMENT || DEBUG
			    (uint64_t)current->vme_start,
			    (uint64_t)current->vme_end,
#else /* DEVELOPMENT || DEBUG */
			    (uint64_t)0,
			    (uint64_t)0,
#endif /* DEVELOPMENT || DEBUG */
			    new_prot);
			new_prot &= ~VM_PROT_ALLEXEC;
			if (VM_MAP_POLICY_WX_FAIL(map)) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		/*
		 * If the task has requested executable lockdown,
		 * deny both:
		 * - adding executable protections OR
		 * - adding write protections to an existing executable mapping.
		 */
		if (map->map_disallow_new_exec == TRUE) {
			if ((new_prot & VM_PROT_ALLEXEC) ||
			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		prev = current->vme_end;
		current = current->vme_next;
	}

#if __arm64__
	if (end > prev &&
	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
		vm_map_entry_t prev_entry;

		prev_entry = current->vme_prev;
		if (prev_entry != vm_map_to_entry(map) &&
		    !prev_entry->map_aligned &&
		    (vm_map_round_page(prev_entry->vme_end,
		    VM_MAP_PAGE_MASK(map))
		    == end)) {
			/*
			 * The last entry in our range is not "map-aligned"
			 * but it would have reached all the way to "end"
			 * if it had been map-aligned, so this is not really
			 * a hole in the range and we can proceed.
			 */
			prev = end;
		}
	}
#endif /* __arm64__ */

	/* The walk above must have covered the entire requested range. */
	if (end > prev) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 *	Go back and fix up protections.
	 *	Clip to start here if the range starts within
	 *	the entry.
	 */

	current = entry;
	if (current != vm_map_to_entry(map)) {
		/* clip and unnest if necessary */
		vm_map_clip_start(map, current, start);
	}

	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		vm_prot_t       old_prot;

		vm_map_clip_end(map, current, end);

#if DEVELOPMENT || DEBUG
		if (current->csm_associated && vm_log_xnu_user_debug) {
			printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__,
			    (uint64_t)start,
			    (uint64_t)end,
			    new_prot,
			    map, current,
			    current->vme_start,
			    current->vme_end,
			    current->protection,
			    current->max_protection);
		}
#endif /* DEVELOPMENT || DEBUG */

		if (current->is_sub_map) {
			/* clipping did unnest if needed */
			assert(!current->use_pmap);
		}

		old_prot = current->protection;

		if (set_max) {
			current->max_protection = new_prot;
			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
			current->protection = (new_prot & old_prot);
		} else {
			current->protection = new_prot;
		}

#if CODE_SIGNING_MONITOR
		if (!current->vme_xnu_user_debug &&
		    /* a !csm_associated mapping becoming executable */
		    ((!current->csm_associated &&
		    !(old_prot & VM_PROT_EXECUTE) &&
		    (current->protection & VM_PROT_EXECUTE))
		    ||
		    /* a csm_associated mapping becoming writable */
		    (current->csm_associated &&
		    !(old_prot & VM_PROT_WRITE) &&
		    (current->protection & VM_PROT_WRITE)))) {
			/*
			 * This mapping has not already been marked as
			 * "user_debug" and it is either:
			 * 1. not code-signing-monitored and becoming executable
			 * 2. code-signing-monitored and becoming writable,
			 * so inform the CodeSigningMonitor and mark the
			 * mapping as "user_debug" if appropriate.
			 */
			vm_map_kernel_flags_t vmk_flags;
			vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
			/* pretend it's a vm_protect(VM_PROT_COPY)... */
			vmk_flags.vmkf_remap_prot_copy = true;
			kr = vm_map_entry_cs_associate(map, current, vmk_flags);
#if DEVELOPMENT || DEBUG
			if (vm_log_xnu_user_debug) {
				printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
				    proc_selfpid(),
				    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
				    __FUNCTION__, __LINE__,
				    map, current,
				    current->vme_start, current->vme_end,
				    old_prot, current->protection,
				    kr, current->vme_xnu_user_debug);
			}
#endif /* DEVELOPMENT || DEBUG */
		}
#endif /* CODE_SIGNING_MONITOR */

		/*
		 *	Update physical map if necessary.
		 *	If the request is to turn off write protection,
		 *	we won't do it for real (in pmap). This is because
		 *	it would cause copy-on-write to fail.  We've already
		 *	set, the new protection in the map, so if a
		 *	write-protect fault occurred, it will be fixed up
		 *	properly, COW or not.
		 */
		if (current->protection != old_prot) {
			/* Look one level in we support nested pmaps */
			/* from mapped submaps which are direct entries */
			/* in our map */

			vm_prot_t prot;

			prot = current->protection;
			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
				prot &= ~VM_PROT_WRITE;
			} else {
				assert(!VME_OBJECT(current)->code_signed);
				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
				if (prot & VM_PROT_WRITE) {
					/*
					 * For write requests on the
					 * compressor, we will ask the
					 * pmap layer to prevent us from
					 * taking a write fault when we
					 * attempt to access the mapping
					 * next.
					 */
					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
				}
			}

			if (override_nx(map, VME_ALIAS(current)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}

#if DEVELOPMENT || DEBUG
			if (!(old_prot & VM_PROT_EXECUTE) &&
			    (prot & VM_PROT_EXECUTE) &&
			    panic_on_unsigned_execute &&
			    (proc_selfcsflags() & CS_KILL)) {
				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
			}
#endif /* DEVELOPMENT || DEBUG */

			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
				if (current->wired_count) {
					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
				}

				/* If the pmap layer cares about this
				 * protection type, force a fault for
				 * each page so that vm_fault will
				 * repopulate the page with the full
				 * set of protections.
				 */
				/*
				 * TODO: We don't seem to need this,
				 * but this is due to an internal
				 * implementation detail of
				 * pmap_protect.  Do we want to rely
				 * on this?
				 */
				prot = VM_PROT_NONE;
			}

			if (current->is_sub_map && current->use_pmap) {
				pmap_protect(VME_SUBMAP(current)->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot);
			} else {
				pmap_protect_options(map->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot,
				    pmap_options,
				    NULL);
			}
		}
		current = current->vme_next;
	}

	/* Coalesce any entries that now have identical attributes. */
	current = entry;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start <= end)) {
		vm_map_simplify_entry(map, current);
		current = current->vme_next;
	}

	vm_map_unlock(map);
	return KERN_SUCCESS;
}
6333 
6334 /*
6335  *	vm_map_inherit:
6336  *
6337  *	Sets the inheritance of the specified address
6338  *	range in the target map.  Inheritance
6339  *	affects how the map will be shared with
6340  *	child maps at the time of vm_map_fork.
6341  */
6342 kern_return_t
vm_map_inherit(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_inherit_t new_inheritance)6343 vm_map_inherit(
6344 	vm_map_t        map,
6345 	vm_map_offset_t start,
6346 	vm_map_offset_t end,
6347 	vm_inherit_t    new_inheritance)
6348 {
6349 	vm_map_entry_t  entry;
6350 	vm_map_entry_t  temp_entry;
6351 
6352 	vm_map_lock(map);
6353 
6354 	VM_MAP_RANGE_CHECK(map, start, end);
6355 
6356 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
6357 		vm_map_unlock(map);
6358 		return KERN_INVALID_ADDRESS;
6359 	}
6360 
6361 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
6362 		entry = temp_entry;
6363 	} else {
6364 		temp_entry = temp_entry->vme_next;
6365 		entry = temp_entry;
6366 	}
6367 
6368 	/* first check entire range for submaps which can't support the */
6369 	/* given inheritance. */
6370 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6371 		if (entry->is_sub_map) {
6372 			if (new_inheritance == VM_INHERIT_COPY) {
6373 				vm_map_unlock(map);
6374 				return KERN_INVALID_ARGUMENT;
6375 			}
6376 		}
6377 
6378 		entry = entry->vme_next;
6379 	}
6380 
6381 	entry = temp_entry;
6382 	if (entry != vm_map_to_entry(map)) {
6383 		/* clip and unnest if necessary */
6384 		vm_map_clip_start(map, entry, start);
6385 	}
6386 
6387 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6388 		vm_map_clip_end(map, entry, end);
6389 		if (entry->is_sub_map) {
6390 			/* clip did unnest if needed */
6391 			assert(!entry->use_pmap);
6392 		}
6393 
6394 		entry->inheritance = new_inheritance;
6395 
6396 		entry = entry->vme_next;
6397 	}
6398 
6399 	vm_map_unlock(map);
6400 	return KERN_SUCCESS;
6401 }
6402 
6403 /*
6404  * Update the accounting for the amount of wired memory in this map.  If the user has
6405  * exceeded the defined limits, then we fail.  Wiring on behalf of the kernel never fails.
6406  */
6407 
static kern_return_t
add_wire_counts(
	vm_map_t        map,
	vm_map_entry_t  entry,
	boolean_t       user_wire)
{
	vm_map_size_t   size;

	/* Remember whether the entry had no wirings at all on the way in. */
	bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;

	if (user_wire) {
		unsigned int total_wire_count =  vm_page_wire_count + vm_lopage_free_count;

		/*
		 * We're wiring memory at the request of the user.  Check if this is the first time the user is wiring
		 * this map entry.
		 */

		if (entry->user_wired_count == 0) {
			size = entry->vme_end - entry->vme_start;

			/*
			 * Since this is the first time the user is wiring this map entry, check to see if we're
			 * exceeding the user wire limits.  There is a per map limit which is the smaller of either
			 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value.  There is also
			 * a system-wide limit on the amount of memory all users can wire.  If the user is over either
			 * limit, then we fail.
			 */

			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
				/* Record which limit was hit, for diagnostics. */
				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
#if DEVELOPMENT || DEBUG
					if (panic_on_mlock_failure) {
						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
					}
#endif /* DEVELOPMENT || DEBUG */
					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
				} else {
					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
#if DEVELOPMENT || DEBUG
					if (panic_on_mlock_failure) {
						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
					}
#endif /* DEVELOPMENT || DEBUG */
				}
				return KERN_RESOURCE_SHORTAGE;
			}

			/*
			 * The first time the user wires an entry, we also increment the wired_count and add this to
			 * the total that has been wired in the map.
			 */

			if (entry->wired_count >= MAX_WIRE_COUNT) {
				return KERN_FAILURE;
			}

			entry->wired_count++;
			map->user_wire_size += size;
		}

		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
			return KERN_FAILURE;
		}

		entry->user_wired_count++;
	} else {
		/*
		 * The kernel's wiring the memory.  Just bump the count and continue.
		 */

		if (entry->wired_count >= MAX_WIRE_COUNT) {
			panic("vm_map_wire: too many wirings");
		}

		entry->wired_count++;
	}

	/* Capture a backtrace reference on the transition 0 -> wired. */
	if (first_wire) {
		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
	}

	return KERN_SUCCESS;
}
6493 
6494 /*
6495  * Update the memory wiring accounting now that the given map entry is being unwired.
6496  */
6497 
6498 static void
subtract_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6499 subtract_wire_counts(
6500 	vm_map_t        map,
6501 	vm_map_entry_t  entry,
6502 	boolean_t       user_wire)
6503 {
6504 	if (user_wire) {
6505 		/*
6506 		 * We're unwiring memory at the request of the user.  See if we're removing the last user wire reference.
6507 		 */
6508 
6509 		if (entry->user_wired_count == 1) {
6510 			/*
6511 			 * We're removing the last user wire reference.  Decrement the wired_count and the total
6512 			 * user wired memory for this map.
6513 			 */
6514 
6515 			assert(entry->wired_count >= 1);
6516 			entry->wired_count--;
6517 			map->user_wire_size -= entry->vme_end - entry->vme_start;
6518 		}
6519 
6520 		assert(entry->user_wired_count >= 1);
6521 		entry->user_wired_count--;
6522 	} else {
6523 		/*
6524 		 * The kernel is unwiring the memory.   Just update the count.
6525 		 */
6526 
6527 		assert(entry->wired_count >= 1);
6528 		entry->wired_count--;
6529 	}
6530 
6531 	vme_btref_consider_and_put(entry);
6532 }
6533 
6534 int cs_executable_wire = 0;
6535 
6536 /*
6537  *	vm_map_wire:
6538  *
6539  *	Sets the pageability of the specified address range in the
6540  *	target map as wired.  Regions specified as not pageable require
6541  *	locked-down physical memory and physical page maps.  The
6542  *	access_type variable indicates types of accesses that must not
6543  *	generate page faults.  This is checked against protection of
6544  *	memory being locked-down.
6545  *
6546  *	The map must not be locked, but a reference must remain to the
6547  *	map throughout the call.
6548  */
6549 static kern_return_t
vm_map_wire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr,ppnum_t * physpage_p)6550 vm_map_wire_nested(
6551 	vm_map_t                map,
6552 	vm_map_offset_t         start,
6553 	vm_map_offset_t         end,
6554 	vm_prot_t               caller_prot,
6555 	vm_tag_t                tag,
6556 	boolean_t               user_wire,
6557 	pmap_t                  map_pmap,
6558 	vm_map_offset_t         pmap_addr,
6559 	ppnum_t                 *physpage_p)
6560 {
6561 	vm_map_entry_t          entry;
6562 	vm_prot_t               access_type;
6563 	struct vm_map_entry     *first_entry, tmp_entry;
6564 	vm_map_t                real_map;
6565 	vm_map_offset_t         s, e;
6566 	kern_return_t           rc;
6567 	boolean_t               need_wakeup;
6568 	boolean_t               main_map = FALSE;
6569 	wait_interrupt_t        interruptible_state;
6570 	thread_t                cur_thread;
6571 	unsigned int            last_timestamp;
6572 	vm_map_size_t           size;
6573 	boolean_t               wire_and_extract;
6574 	vm_prot_t               extra_prots;
6575 
6576 	extra_prots = VM_PROT_COPY;
6577 	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6578 #if XNU_TARGET_OS_OSX
6579 	if (map->pmap == kernel_pmap ||
6580 	    !vm_map_cs_enforcement(map)) {
6581 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6582 	}
6583 #endif /* XNU_TARGET_OS_OSX */
6584 #if CODE_SIGNING_MONITOR
6585 	if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6586 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6587 	}
6588 #endif /* CODE_SIGNING_MONITOR */
6589 
6590 	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6591 
6592 	wire_and_extract = FALSE;
6593 	if (physpage_p != NULL) {
6594 		/*
6595 		 * The caller wants the physical page number of the
6596 		 * wired page.  We return only one physical page number
6597 		 * so this works for only one page at a time.
6598 		 */
6599 		if ((end - start) != PAGE_SIZE) {
6600 			return KERN_INVALID_ARGUMENT;
6601 		}
6602 		wire_and_extract = TRUE;
6603 		*physpage_p = 0;
6604 	}
6605 
6606 	vm_map_lock(map);
6607 	if (map_pmap == NULL) {
6608 		main_map = TRUE;
6609 	}
6610 	last_timestamp = map->timestamp;
6611 
6612 	VM_MAP_RANGE_CHECK(map, start, end);
6613 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6614 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6615 
6616 	if (start == end) {
6617 		/* We wired what the caller asked for, zero pages */
6618 		vm_map_unlock(map);
6619 		return KERN_SUCCESS;
6620 	}
6621 
6622 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
6623 		vm_map_unlock(map);
6624 		return KERN_INVALID_ADDRESS;
6625 	}
6626 
6627 	need_wakeup = FALSE;
6628 	cur_thread = current_thread();
6629 
6630 	s = start;
6631 	rc = KERN_SUCCESS;
6632 
6633 	if (vm_map_lookup_entry(map, s, &first_entry)) {
6634 		entry = first_entry;
6635 		/*
6636 		 * vm_map_clip_start will be done later.
6637 		 * We don't want to unnest any nested submaps here !
6638 		 */
6639 	} else {
6640 		/* Start address is not in map */
6641 		rc = KERN_INVALID_ADDRESS;
6642 		goto done;
6643 	}
6644 
6645 	while ((entry != vm_map_to_entry(map)) && (s < end)) {
6646 		/*
6647 		 * At this point, we have wired from "start" to "s".
6648 		 * We still need to wire from "s" to "end".
6649 		 *
6650 		 * "entry" hasn't been clipped, so it could start before "s"
6651 		 * and/or end after "end".
6652 		 */
6653 
6654 		/* "e" is how far we want to wire in this entry */
6655 		e = entry->vme_end;
6656 		if (e > end) {
6657 			e = end;
6658 		}
6659 
6660 		/*
6661 		 * If another thread is wiring/unwiring this entry then
6662 		 * block after informing other thread to wake us up.
6663 		 */
6664 		if (entry->in_transition) {
6665 			wait_result_t wait_result;
6666 
6667 			/*
6668 			 * We have not clipped the entry.  Make sure that
6669 			 * the start address is in range so that the lookup
6670 			 * below will succeed.
6671 			 * "s" is the current starting point: we've already
6672 			 * wired from "start" to "s" and we still have
6673 			 * to wire from "s" to "end".
6674 			 */
6675 
6676 			entry->needs_wakeup = TRUE;
6677 
6678 			/*
6679 			 * wake up anybody waiting on entries that we have
6680 			 * already wired.
6681 			 */
6682 			if (need_wakeup) {
6683 				vm_map_entry_wakeup(map);
6684 				need_wakeup = FALSE;
6685 			}
6686 			/*
6687 			 * User wiring is interruptible
6688 			 */
6689 			wait_result = vm_map_entry_wait(map,
6690 			    (user_wire) ? THREAD_ABORTSAFE :
6691 			    THREAD_UNINT);
6692 			if (user_wire && wait_result == THREAD_INTERRUPTED) {
6693 				/*
6694 				 * undo the wirings we have done so far
6695 				 * We do not clear the needs_wakeup flag,
6696 				 * because we cannot tell if we were the
6697 				 * only one waiting.
6698 				 */
6699 				rc = KERN_FAILURE;
6700 				goto done;
6701 			}
6702 
6703 			/*
6704 			 * Cannot avoid a lookup here. reset timestamp.
6705 			 */
6706 			last_timestamp = map->timestamp;
6707 
6708 			/*
6709 			 * The entry could have been clipped, look it up again.
6710 			 * Worse that can happen is, it may not exist anymore.
6711 			 */
6712 			if (!vm_map_lookup_entry(map, s, &first_entry)) {
6713 				/*
6714 				 * User: undo everything upto the previous
6715 				 * entry.  let vm_map_unwire worry about
6716 				 * checking the validity of the range.
6717 				 */
6718 				rc = KERN_FAILURE;
6719 				goto done;
6720 			}
6721 			entry = first_entry;
6722 			continue;
6723 		}
6724 
6725 		if (entry->is_sub_map) {
6726 			vm_map_offset_t sub_start;
6727 			vm_map_offset_t sub_end;
6728 			vm_map_offset_t local_start;
6729 			vm_map_offset_t local_end;
6730 			pmap_t          pmap;
6731 
6732 			if (wire_and_extract) {
6733 				/*
6734 				 * Wiring would result in copy-on-write
6735 				 * which would not be compatible with
6736 				 * the sharing we have with the original
6737 				 * provider of this memory.
6738 				 */
6739 				rc = KERN_INVALID_ARGUMENT;
6740 				goto done;
6741 			}
6742 
6743 			vm_map_clip_start(map, entry, s);
6744 			vm_map_clip_end(map, entry, end);
6745 
6746 			sub_start = VME_OFFSET(entry);
6747 			sub_end = entry->vme_end;
6748 			sub_end += VME_OFFSET(entry) - entry->vme_start;
6749 
6750 			local_end = entry->vme_end;
6751 			if (map_pmap == NULL) {
6752 				vm_object_t             object;
6753 				vm_object_offset_t      offset;
6754 				vm_prot_t               prot;
6755 				boolean_t               wired;
6756 				vm_map_entry_t          local_entry;
6757 				vm_map_version_t         version;
6758 				vm_map_t                lookup_map;
6759 
6760 				if (entry->use_pmap) {
6761 					pmap = VME_SUBMAP(entry)->pmap;
6762 					/* ppc implementation requires that */
6763 					/* submaps pmap address ranges line */
6764 					/* up with parent map */
6765 #ifdef notdef
6766 					pmap_addr = sub_start;
6767 #endif
6768 					pmap_addr = s;
6769 				} else {
6770 					pmap = map->pmap;
6771 					pmap_addr = s;
6772 				}
6773 
6774 				if (entry->wired_count) {
6775 					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6776 						goto done;
6777 					}
6778 
6779 					/*
6780 					 * The map was not unlocked:
6781 					 * no need to goto re-lookup.
6782 					 * Just go directly to next entry.
6783 					 */
6784 					entry = entry->vme_next;
6785 					s = entry->vme_start;
6786 					continue;
6787 				}
6788 
6789 				/* call vm_map_lookup_and_lock_object to */
6790 				/* cause any needs copy to be   */
6791 				/* evaluated */
6792 				local_start = entry->vme_start;
6793 				lookup_map = map;
6794 				vm_map_lock_write_to_read(map);
6795 				rc = vm_map_lookup_and_lock_object(
6796 					&lookup_map, local_start,
6797 					(access_type | extra_prots),
6798 					OBJECT_LOCK_EXCLUSIVE,
6799 					&version, &object,
6800 					&offset, &prot, &wired,
6801 					NULL,
6802 					&real_map, NULL);
6803 				if (rc != KERN_SUCCESS) {
6804 					vm_map_unlock_read(lookup_map);
6805 					assert(map_pmap == NULL);
6806 					vm_map_unwire(map, start,
6807 					    s, user_wire);
6808 					return rc;
6809 				}
6810 				vm_object_unlock(object);
6811 				if (real_map != lookup_map) {
6812 					vm_map_unlock(real_map);
6813 				}
6814 				vm_map_unlock_read(lookup_map);
6815 				vm_map_lock(map);
6816 
6817 				/* we unlocked, so must re-lookup */
6818 				if (!vm_map_lookup_entry(map,
6819 				    local_start,
6820 				    &local_entry)) {
6821 					rc = KERN_FAILURE;
6822 					goto done;
6823 				}
6824 
6825 				/*
6826 				 * entry could have been "simplified",
6827 				 * so re-clip
6828 				 */
6829 				entry = local_entry;
6830 				assert(s == local_start);
6831 				vm_map_clip_start(map, entry, s);
6832 				vm_map_clip_end(map, entry, end);
6833 				/* re-compute "e" */
6834 				e = entry->vme_end;
6835 				if (e > end) {
6836 					e = end;
6837 				}
6838 
6839 				/* did we have a change of type? */
6840 				if (!entry->is_sub_map) {
6841 					last_timestamp = map->timestamp;
6842 					continue;
6843 				}
6844 			} else {
6845 				local_start = entry->vme_start;
6846 				pmap = map_pmap;
6847 			}
6848 
6849 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6850 				goto done;
6851 			}
6852 
6853 			entry->in_transition = TRUE;
6854 
6855 			vm_map_unlock(map);
6856 			rc = vm_map_wire_nested(VME_SUBMAP(entry),
6857 			    sub_start, sub_end,
6858 			    caller_prot, tag,
6859 			    user_wire, pmap, pmap_addr,
6860 			    NULL);
6861 			vm_map_lock(map);
6862 
6863 			/*
6864 			 * Find the entry again.  It could have been clipped
6865 			 * after we unlocked the map.
6866 			 */
6867 			if (!vm_map_lookup_entry(map, local_start,
6868 			    &first_entry)) {
6869 				panic("vm_map_wire: re-lookup failed");
6870 			}
6871 			entry = first_entry;
6872 
6873 			assert(local_start == s);
6874 			/* re-compute "e" */
6875 			e = entry->vme_end;
6876 			if (e > end) {
6877 				e = end;
6878 			}
6879 
6880 			last_timestamp = map->timestamp;
6881 			while ((entry != vm_map_to_entry(map)) &&
6882 			    (entry->vme_start < e)) {
6883 				assert(entry->in_transition);
6884 				entry->in_transition = FALSE;
6885 				if (entry->needs_wakeup) {
6886 					entry->needs_wakeup = FALSE;
6887 					need_wakeup = TRUE;
6888 				}
6889 				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6890 					subtract_wire_counts(map, entry, user_wire);
6891 				}
6892 				entry = entry->vme_next;
6893 			}
6894 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6895 				goto done;
6896 			}
6897 
6898 			/* no need to relookup again */
6899 			s = entry->vme_start;
6900 			continue;
6901 		}
6902 
6903 		/*
6904 		 * If this entry is already wired then increment
6905 		 * the appropriate wire reference count.
6906 		 */
6907 		if (entry->wired_count) {
6908 			if ((entry->protection & access_type) != access_type) {
6909 				/* found a protection problem */
6910 
6911 				/*
6912 				 * XXX FBDP
6913 				 * We should always return an error
6914 				 * in this case but since we didn't
6915 				 * enforce it before, let's do
6916 				 * it only for the new "wire_and_extract"
6917 				 * code path for now...
6918 				 */
6919 				if (wire_and_extract) {
6920 					rc = KERN_PROTECTION_FAILURE;
6921 					goto done;
6922 				}
6923 			}
6924 
6925 			/*
6926 			 * entry is already wired down, get our reference
6927 			 * after clipping to our range.
6928 			 */
6929 			vm_map_clip_start(map, entry, s);
6930 			vm_map_clip_end(map, entry, end);
6931 
6932 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6933 				goto done;
6934 			}
6935 
6936 			if (wire_and_extract) {
6937 				vm_object_t             object;
6938 				vm_object_offset_t      offset;
6939 				vm_page_t               m;
6940 
6941 				/*
6942 				 * We don't have to "wire" the page again
6943 				 * bit we still have to "extract" its
6944 				 * physical page number, after some sanity
6945 				 * checks.
6946 				 */
6947 				assert((entry->vme_end - entry->vme_start)
6948 				    == PAGE_SIZE);
6949 				assert(!entry->needs_copy);
6950 				assert(!entry->is_sub_map);
6951 				assert(VME_OBJECT(entry));
6952 				if (((entry->vme_end - entry->vme_start)
6953 				    != PAGE_SIZE) ||
6954 				    entry->needs_copy ||
6955 				    entry->is_sub_map ||
6956 				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
6957 					rc = KERN_INVALID_ARGUMENT;
6958 					goto done;
6959 				}
6960 
6961 				object = VME_OBJECT(entry);
6962 				offset = VME_OFFSET(entry);
6963 				/* need exclusive lock to update m->dirty */
6964 				if (entry->protection & VM_PROT_WRITE) {
6965 					vm_object_lock(object);
6966 				} else {
6967 					vm_object_lock_shared(object);
6968 				}
6969 				m = vm_page_lookup(object, offset);
6970 				assert(m != VM_PAGE_NULL);
6971 				assert(VM_PAGE_WIRED(m));
6972 				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6973 					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6974 					if (entry->protection & VM_PROT_WRITE) {
6975 						vm_object_lock_assert_exclusive(
6976 							object);
6977 						m->vmp_dirty = TRUE;
6978 					}
6979 				} else {
6980 					/* not already wired !? */
6981 					*physpage_p = 0;
6982 				}
6983 				vm_object_unlock(object);
6984 			}
6985 
6986 			/* map was not unlocked: no need to relookup */
6987 			entry = entry->vme_next;
6988 			s = entry->vme_start;
6989 			continue;
6990 		}
6991 
6992 		/*
6993 		 * Unwired entry or wire request transmitted via submap
6994 		 */
6995 
6996 		/*
6997 		 * Wiring would copy the pages to the shadow object.
6998 		 * The shadow object would not be code-signed so
6999 		 * attempting to execute code from these copied pages
7000 		 * would trigger a code-signing violation.
7001 		 */
7002 
7003 		if ((entry->protection & VM_PROT_EXECUTE)
7004 #if XNU_TARGET_OS_OSX
7005 		    &&
7006 		    map->pmap != kernel_pmap &&
7007 		    (vm_map_cs_enforcement(map)
7008 #if __arm64__
7009 		    || !VM_MAP_IS_EXOTIC(map)
7010 #endif /* __arm64__ */
7011 		    )
7012 #endif /* XNU_TARGET_OS_OSX */
7013 #if CODE_SIGNING_MONITOR
7014 		    &&
7015 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
7016 #endif
7017 		    ) {
7018 #if MACH_ASSERT
7019 			printf("pid %d[%s] wiring executable range from "
7020 			    "0x%llx to 0x%llx: rejected to preserve "
7021 			    "code-signing\n",
7022 			    proc_selfpid(),
7023 			    (get_bsdtask_info(current_task())
7024 			    ? proc_name_address(get_bsdtask_info(current_task()))
7025 			    : "?"),
7026 			    (uint64_t) entry->vme_start,
7027 			    (uint64_t) entry->vme_end);
7028 #endif /* MACH_ASSERT */
7029 			DTRACE_VM2(cs_executable_wire,
7030 			    uint64_t, (uint64_t)entry->vme_start,
7031 			    uint64_t, (uint64_t)entry->vme_end);
7032 			cs_executable_wire++;
7033 			rc = KERN_PROTECTION_FAILURE;
7034 			goto done;
7035 		}
7036 
7037 		/*
7038 		 * Perform actions of vm_map_lookup that need the write
7039 		 * lock on the map: create a shadow object for a
7040 		 * copy-on-write region, or an object for a zero-fill
7041 		 * region.
7042 		 */
7043 		size = entry->vme_end - entry->vme_start;
7044 		/*
7045 		 * If wiring a copy-on-write page, we need to copy it now
7046 		 * even if we're only (currently) requesting read access.
7047 		 * This is aggressive, but once it's wired we can't move it.
7048 		 */
7049 		if (entry->needs_copy) {
7050 			if (wire_and_extract) {
7051 				/*
7052 				 * We're supposed to share with the original
7053 				 * provider so should not be "needs_copy"
7054 				 */
7055 				rc = KERN_INVALID_ARGUMENT;
7056 				goto done;
7057 			}
7058 
7059 			VME_OBJECT_SHADOW(entry, size,
7060 			    vm_map_always_shadow(map));
7061 			entry->needs_copy = FALSE;
7062 		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
7063 			if (wire_and_extract) {
7064 				/*
7065 				 * We're supposed to share with the original
7066 				 * provider so should already have an object.
7067 				 */
7068 				rc = KERN_INVALID_ARGUMENT;
7069 				goto done;
7070 			}
7071 			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
7072 			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
7073 			assert(entry->use_pmap);
7074 		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7075 			if (wire_and_extract) {
7076 				/*
7077 				 * We're supposed to share with the original
7078 				 * provider so should not be COPY_SYMMETRIC.
7079 				 */
7080 				rc = KERN_INVALID_ARGUMENT;
7081 				goto done;
7082 			}
7083 			/*
7084 			 * Force an unrequested "copy-on-write" but only for
7085 			 * the range we're wiring.
7086 			 */
7087 //			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
7088 			vm_map_clip_start(map, entry, s);
7089 			vm_map_clip_end(map, entry, end);
7090 			/* recompute "size" */
7091 			size = entry->vme_end - entry->vme_start;
7092 			/* make a shadow object */
7093 			vm_object_t orig_object;
7094 			vm_object_offset_t orig_offset;
7095 			orig_object = VME_OBJECT(entry);
7096 			orig_offset = VME_OFFSET(entry);
7097 			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
7098 			if (VME_OBJECT(entry) != orig_object) {
7099 				/*
7100 				 * This mapping has not been shared (or it would be
7101 				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
7102 				 * not been copied-on-write (or it would be marked
7103 				 * as "needs_copy" and would have been handled above
7104 				 * and also already write-protected).
7105 				 * We still need to write-protect here to prevent
7106 				 * other threads from modifying these pages while
7107 				 * we're in the process of copying and wiring
7108 				 * the copied pages.
7109 				 * Since the mapping is neither shared nor COWed,
7110 				 * we only need to write-protect the PTEs for this
7111 				 * mapping.
7112 				 */
7113 				vm_object_pmap_protect(orig_object,
7114 				    orig_offset,
7115 				    size,
7116 				    map->pmap,
7117 				    VM_MAP_PAGE_SIZE(map),
7118 				    entry->vme_start,
7119 				    entry->protection & ~VM_PROT_WRITE);
7120 			}
7121 		}
7122 		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7123 			/*
7124 			 * Make the object COPY_DELAY to get a stable object
7125 			 * to wire.
7126 			 * That should avoid creating long shadow chains while
7127 			 * wiring/unwiring the same range repeatedly.
7128 			 * That also prevents part of the object from being
7129 			 * wired while another part is "needs_copy", which
7130 			 * could result in conflicting rules wrt copy-on-write.
7131 			 */
7132 			vm_object_t object;
7133 
7134 			object = VME_OBJECT(entry);
7135 			vm_object_lock(object);
7136 			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7137 				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
7138 				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
7139 				    object, (uint64_t)object->vo_size,
7140 				    entry,
7141 				    (uint64_t)entry->vme_start,
7142 				    (uint64_t)entry->vme_end,
7143 				    (uint64_t)VME_OFFSET(entry),
7144 				    (uint64_t)size);
7145 				assertf(object->ref_count == 1,
7146 				    "object %p ref_count %d\n",
7147 				    object, object->ref_count);
7148 				assertf(!entry->needs_copy,
7149 				    "entry %p\n", entry);
7150 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7151 				object->true_share = TRUE;
7152 			}
7153 			vm_object_unlock(object);
7154 		}
7155 
7156 		vm_map_clip_start(map, entry, s);
7157 		vm_map_clip_end(map, entry, end);
7158 
7159 		/* re-compute "e" */
7160 		e = entry->vme_end;
7161 		if (e > end) {
7162 			e = end;
7163 		}
7164 
7165 		/*
7166 		 * Check for holes and protection mismatch.
7167 		 * Holes: Next entry should be contiguous unless this
7168 		 *	  is the end of the region.
7169 		 * Protection: Access requested must be allowed, unless
7170 		 *	wiring is by protection class
7171 		 */
7172 		if ((entry->vme_end < end) &&
7173 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7174 		    (entry->vme_next->vme_start > entry->vme_end))) {
7175 			/* found a hole */
7176 			rc = KERN_INVALID_ADDRESS;
7177 			goto done;
7178 		}
7179 		if ((entry->protection & access_type) != access_type) {
7180 			/* found a protection problem */
7181 			rc = KERN_PROTECTION_FAILURE;
7182 			goto done;
7183 		}
7184 
7185 		assert(entry->wired_count == 0 && entry->user_wired_count == 0);
7186 
7187 		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7188 			goto done;
7189 		}
7190 
7191 		entry->in_transition = TRUE;
7192 
7193 		/*
7194 		 * This entry might get split once we unlock the map.
7195 		 * In vm_fault_wire(), we need the current range as
7196 		 * defined by this entry.  In order for this to work
7197 		 * along with a simultaneous clip operation, we make a
7198 		 * temporary copy of this entry and use that for the
7199 		 * wiring.  Note that the underlying objects do not
7200 		 * change during a clip.
7201 		 */
7202 		tmp_entry = *entry;
7203 
7204 		/*
7205 		 * The in_transition state guarentees that the entry
7206 		 * (or entries for this range, if split occured) will be
7207 		 * there when the map lock is acquired for the second time.
7208 		 */
7209 		vm_map_unlock(map);
7210 
7211 		if (!user_wire && cur_thread != THREAD_NULL) {
7212 			interruptible_state = thread_interrupt_level(THREAD_UNINT);
7213 		} else {
7214 			interruptible_state = THREAD_UNINT;
7215 		}
7216 
7217 		if (map_pmap) {
7218 			rc = vm_fault_wire(map,
7219 			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7220 			    physpage_p);
7221 		} else {
7222 			rc = vm_fault_wire(map,
7223 			    &tmp_entry, caller_prot, tag, map->pmap,
7224 			    tmp_entry.vme_start,
7225 			    physpage_p);
7226 		}
7227 
7228 		if (!user_wire && cur_thread != THREAD_NULL) {
7229 			thread_interrupt_level(interruptible_state);
7230 		}
7231 
7232 		vm_map_lock(map);
7233 
7234 		if (last_timestamp + 1 != map->timestamp) {
7235 			/*
7236 			 * Find the entry again.  It could have been clipped
7237 			 * after we unlocked the map.
7238 			 */
7239 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7240 			    &first_entry)) {
7241 				panic("vm_map_wire: re-lookup failed");
7242 			}
7243 
7244 			entry = first_entry;
7245 		}
7246 
7247 		last_timestamp = map->timestamp;
7248 
7249 		while ((entry != vm_map_to_entry(map)) &&
7250 		    (entry->vme_start < tmp_entry.vme_end)) {
7251 			assert(entry->in_transition);
7252 			entry->in_transition = FALSE;
7253 			if (entry->needs_wakeup) {
7254 				entry->needs_wakeup = FALSE;
7255 				need_wakeup = TRUE;
7256 			}
7257 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
7258 				subtract_wire_counts(map, entry, user_wire);
7259 			}
7260 			entry = entry->vme_next;
7261 		}
7262 
7263 		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
7264 			goto done;
7265 		}
7266 
7267 		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7268 		    (tmp_entry.vme_end != end) &&    /* AND, we are not at the end of the requested range */
7269 		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7270 			/* found a "new" hole */
7271 			s = tmp_entry.vme_end;
7272 			rc = KERN_INVALID_ADDRESS;
7273 			goto done;
7274 		}
7275 
7276 		s = entry->vme_start;
7277 	} /* end while loop through map entries */
7278 
7279 done:
7280 	if (rc == KERN_SUCCESS) {
7281 		/* repair any damage we may have made to the VM map */
7282 		vm_map_simplify_range(map, start, end);
7283 	}
7284 
7285 	vm_map_unlock(map);
7286 
7287 	/*
7288 	 * wake up anybody waiting on entries we wired.
7289 	 */
7290 	if (need_wakeup) {
7291 		vm_map_entry_wakeup(map);
7292 	}
7293 
7294 	if (rc != KERN_SUCCESS) {
7295 		/* undo what has been wired so far */
7296 		vm_map_unwire_nested(map, start, s, user_wire,
7297 		    map_pmap, pmap_addr);
7298 		if (physpage_p) {
7299 			*physpage_p = 0;
7300 		}
7301 	}
7302 
7303 	return rc;
7304 }
7305 
7306 kern_return_t
vm_map_wire_external(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,boolean_t user_wire)7307 vm_map_wire_external(
7308 	vm_map_t                map,
7309 	vm_map_offset_t         start,
7310 	vm_map_offset_t         end,
7311 	vm_prot_t               caller_prot,
7312 	boolean_t               user_wire)
7313 {
7314 	kern_return_t   kret;
7315 
7316 	kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(),
7317 	    user_wire, (pmap_t)NULL, 0, NULL);
7318 	return kret;
7319 }
7320 
7321 kern_return_t
vm_map_wire_kernel(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire)7322 vm_map_wire_kernel(
7323 	vm_map_t                map,
7324 	vm_map_offset_t         start,
7325 	vm_map_offset_t         end,
7326 	vm_prot_t               caller_prot,
7327 	vm_tag_t                tag,
7328 	boolean_t               user_wire)
7329 {
7330 	kern_return_t   kret;
7331 
7332 	kret = vm_map_wire_nested(map, start, end, caller_prot, tag,
7333 	    user_wire, (pmap_t)NULL, 0, NULL);
7334 	return kret;
7335 }
7336 
7337 kern_return_t
vm_map_wire_and_extract_external(vm_map_t map,vm_map_offset_t start,vm_prot_t caller_prot,boolean_t user_wire,ppnum_t * physpage_p)7338 vm_map_wire_and_extract_external(
7339 	vm_map_t        map,
7340 	vm_map_offset_t start,
7341 	vm_prot_t       caller_prot,
7342 	boolean_t       user_wire,
7343 	ppnum_t         *physpage_p)
7344 {
7345 	kern_return_t   kret;
7346 
7347 	kret = vm_map_wire_nested(map,
7348 	    start,
7349 	    start + VM_MAP_PAGE_SIZE(map),
7350 	    caller_prot,
7351 	    vm_tag_bt(),
7352 	    user_wire,
7353 	    (pmap_t)NULL,
7354 	    0,
7355 	    physpage_p);
7356 	if (kret != KERN_SUCCESS &&
7357 	    physpage_p != NULL) {
7358 		*physpage_p = 0;
7359 	}
7360 	return kret;
7361 }
7362 
/*
 *	vm_map_unwire:
 *
 *	Sets the pageability of the specified address range in the target
 *	map as pageable.  Regions specified must have been wired previously.
 *
 *	The map must not be locked, but a reference must remain to the map
 *	throughout the call.
 *
 *	Kernel will panic on failures.  User unwire ignores holes and
 *	unwired and in-transition entries to avoid losing memory by leaving
 *	it unwired.
 */
7376 static kern_return_t
vm_map_unwire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr)7377 vm_map_unwire_nested(
7378 	vm_map_t                map,
7379 	vm_map_offset_t         start,
7380 	vm_map_offset_t         end,
7381 	boolean_t               user_wire,
7382 	pmap_t                  map_pmap,
7383 	vm_map_offset_t         pmap_addr)
7384 {
7385 	vm_map_entry_t          entry;
7386 	struct vm_map_entry     *first_entry, tmp_entry;
7387 	boolean_t               need_wakeup;
7388 	boolean_t               main_map = FALSE;
7389 	unsigned int            last_timestamp;
7390 
7391 	vm_map_lock(map);
7392 	if (map_pmap == NULL) {
7393 		main_map = TRUE;
7394 	}
7395 	last_timestamp = map->timestamp;
7396 
7397 	VM_MAP_RANGE_CHECK(map, start, end);
7398 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7399 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7400 
7401 	if (start == end) {
7402 		/* We unwired what the caller asked for: zero pages */
7403 		vm_map_unlock(map);
7404 		return KERN_SUCCESS;
7405 	}
7406 
7407 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
7408 		vm_map_unlock(map);
7409 		return KERN_INVALID_ADDRESS;
7410 	}
7411 
7412 	if (vm_map_lookup_entry(map, start, &first_entry)) {
7413 		entry = first_entry;
7414 		/*
7415 		 * vm_map_clip_start will be done later.
7416 		 * We don't want to unnest any nested sub maps here !
7417 		 */
7418 	} else {
7419 		if (!user_wire) {
7420 			panic("vm_map_unwire: start not found");
7421 		}
7422 		/*	Start address is not in map. */
7423 		vm_map_unlock(map);
7424 		return KERN_INVALID_ADDRESS;
7425 	}
7426 
7427 	if (entry->superpage_size) {
7428 		/* superpages are always wired */
7429 		vm_map_unlock(map);
7430 		return KERN_INVALID_ADDRESS;
7431 	}
7432 
7433 	need_wakeup = FALSE;
7434 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7435 		if (entry->in_transition) {
7436 			/*
7437 			 * 1)
7438 			 * Another thread is wiring down this entry. Note
7439 			 * that if it is not for the other thread we would
7440 			 * be unwiring an unwired entry.  This is not
7441 			 * permitted.  If we wait, we will be unwiring memory
7442 			 * we did not wire.
7443 			 *
7444 			 * 2)
7445 			 * Another thread is unwiring this entry.  We did not
7446 			 * have a reference to it, because if we did, this
7447 			 * entry will not be getting unwired now.
7448 			 */
7449 			if (!user_wire) {
7450 				/*
7451 				 * XXX FBDP
7452 				 * This could happen:  there could be some
7453 				 * overlapping vslock/vsunlock operations
7454 				 * going on.
7455 				 * We should probably just wait and retry,
7456 				 * but then we have to be careful that this
7457 				 * entry could get "simplified" after
7458 				 * "in_transition" gets unset and before
7459 				 * we re-lookup the entry, so we would
7460 				 * have to re-clip the entry to avoid
7461 				 * re-unwiring what we have already unwired...
7462 				 * See vm_map_wire_nested().
7463 				 *
7464 				 * Or we could just ignore "in_transition"
7465 				 * here and proceed to decement the wired
7466 				 * count(s) on this entry.  That should be fine
7467 				 * as long as "wired_count" doesn't drop all
7468 				 * the way to 0 (and we should panic if THAT
7469 				 * happens).
7470 				 */
7471 				panic("vm_map_unwire: in_transition entry");
7472 			}
7473 
7474 			entry = entry->vme_next;
7475 			continue;
7476 		}
7477 
7478 		if (entry->is_sub_map) {
7479 			vm_map_offset_t sub_start;
7480 			vm_map_offset_t sub_end;
7481 			vm_map_offset_t local_end;
7482 			pmap_t          pmap;
7483 
7484 			vm_map_clip_start(map, entry, start);
7485 			vm_map_clip_end(map, entry, end);
7486 
7487 			sub_start = VME_OFFSET(entry);
7488 			sub_end = entry->vme_end - entry->vme_start;
7489 			sub_end += VME_OFFSET(entry);
7490 			local_end = entry->vme_end;
7491 			if (map_pmap == NULL) {
7492 				if (entry->use_pmap) {
7493 					pmap = VME_SUBMAP(entry)->pmap;
7494 					pmap_addr = sub_start;
7495 				} else {
7496 					pmap = map->pmap;
7497 					pmap_addr = start;
7498 				}
7499 				if (entry->wired_count == 0 ||
7500 				    (user_wire && entry->user_wired_count == 0)) {
7501 					if (!user_wire) {
7502 						panic("vm_map_unwire: entry is unwired");
7503 					}
7504 					entry = entry->vme_next;
7505 					continue;
7506 				}
7507 
7508 				/*
7509 				 * Check for holes
7510 				 * Holes: Next entry should be contiguous unless
7511 				 * this is the end of the region.
7512 				 */
7513 				if (((entry->vme_end < end) &&
7514 				    ((entry->vme_next == vm_map_to_entry(map)) ||
7515 				    (entry->vme_next->vme_start
7516 				    > entry->vme_end)))) {
7517 					if (!user_wire) {
7518 						panic("vm_map_unwire: non-contiguous region");
7519 					}
7520 /*
7521  *                                       entry = entry->vme_next;
7522  *                                       continue;
7523  */
7524 				}
7525 
7526 				subtract_wire_counts(map, entry, user_wire);
7527 
7528 				if (entry->wired_count != 0) {
7529 					entry = entry->vme_next;
7530 					continue;
7531 				}
7532 
7533 				entry->in_transition = TRUE;
7534 				tmp_entry = *entry;/* see comment in vm_map_wire() */
7535 
7536 				/*
7537 				 * We can unlock the map now. The in_transition state
7538 				 * guarantees existance of the entry.
7539 				 */
7540 				vm_map_unlock(map);
7541 				vm_map_unwire_nested(VME_SUBMAP(entry),
7542 				    sub_start, sub_end, user_wire, pmap, pmap_addr);
7543 				vm_map_lock(map);
7544 
7545 				if (last_timestamp + 1 != map->timestamp) {
7546 					/*
7547 					 * Find the entry again.  It could have been
7548 					 * clipped or deleted after we unlocked the map.
7549 					 */
7550 					if (!vm_map_lookup_entry(map,
7551 					    tmp_entry.vme_start,
7552 					    &first_entry)) {
7553 						if (!user_wire) {
7554 							panic("vm_map_unwire: re-lookup failed");
7555 						}
7556 						entry = first_entry->vme_next;
7557 					} else {
7558 						entry = first_entry;
7559 					}
7560 				}
7561 				last_timestamp = map->timestamp;
7562 
7563 				/*
7564 				 * clear transition bit for all constituent entries
7565 				 * that were in the original entry (saved in
7566 				 * tmp_entry).  Also check for waiters.
7567 				 */
7568 				while ((entry != vm_map_to_entry(map)) &&
7569 				    (entry->vme_start < tmp_entry.vme_end)) {
7570 					assert(entry->in_transition);
7571 					entry->in_transition = FALSE;
7572 					if (entry->needs_wakeup) {
7573 						entry->needs_wakeup = FALSE;
7574 						need_wakeup = TRUE;
7575 					}
7576 					entry = entry->vme_next;
7577 				}
7578 				continue;
7579 			} else {
7580 				tmp_entry = *entry;
7581 				vm_map_unlock(map);
7582 				vm_map_unwire_nested(VME_SUBMAP(entry),
7583 				    sub_start, sub_end, user_wire, map_pmap,
7584 				    pmap_addr);
7585 				vm_map_lock(map);
7586 
7587 				if (last_timestamp + 1 != map->timestamp) {
7588 					/*
7589 					 * Find the entry again.  It could have been
7590 					 * clipped or deleted after we unlocked the map.
7591 					 */
7592 					if (!vm_map_lookup_entry(map,
7593 					    tmp_entry.vme_start,
7594 					    &first_entry)) {
7595 						if (!user_wire) {
7596 							panic("vm_map_unwire: re-lookup failed");
7597 						}
7598 						entry = first_entry->vme_next;
7599 					} else {
7600 						entry = first_entry;
7601 					}
7602 				}
7603 				last_timestamp = map->timestamp;
7604 			}
7605 		}
7606 
7607 
7608 		if ((entry->wired_count == 0) ||
7609 		    (user_wire && entry->user_wired_count == 0)) {
7610 			if (!user_wire) {
7611 				panic("vm_map_unwire: entry is unwired");
7612 			}
7613 
7614 			entry = entry->vme_next;
7615 			continue;
7616 		}
7617 
7618 		assert(entry->wired_count > 0 &&
7619 		    (!user_wire || entry->user_wired_count > 0));
7620 
7621 		vm_map_clip_start(map, entry, start);
7622 		vm_map_clip_end(map, entry, end);
7623 
7624 		/*
7625 		 * Check for holes
7626 		 * Holes: Next entry should be contiguous unless
7627 		 *	  this is the end of the region.
7628 		 */
7629 		if (((entry->vme_end < end) &&
7630 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7631 		    (entry->vme_next->vme_start > entry->vme_end)))) {
7632 			if (!user_wire) {
7633 				panic("vm_map_unwire: non-contiguous region");
7634 			}
7635 			entry = entry->vme_next;
7636 			continue;
7637 		}
7638 
7639 		subtract_wire_counts(map, entry, user_wire);
7640 
7641 		if (entry->wired_count != 0) {
7642 			entry = entry->vme_next;
7643 			continue;
7644 		}
7645 
7646 		if (entry->zero_wired_pages) {
7647 			entry->zero_wired_pages = FALSE;
7648 		}
7649 
7650 		entry->in_transition = TRUE;
7651 		tmp_entry = *entry;     /* see comment in vm_map_wire() */
7652 
7653 		/*
7654 		 * We can unlock the map now. The in_transition state
7655 		 * guarantees existance of the entry.
7656 		 */
7657 		vm_map_unlock(map);
7658 		if (map_pmap) {
7659 			vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
7660 			    pmap_addr, tmp_entry.vme_end);
7661 		} else {
7662 			vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
7663 			    tmp_entry.vme_start, tmp_entry.vme_end);
7664 		}
7665 		vm_map_lock(map);
7666 
7667 		if (last_timestamp + 1 != map->timestamp) {
7668 			/*
7669 			 * Find the entry again.  It could have been clipped
7670 			 * or deleted after we unlocked the map.
7671 			 */
7672 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7673 			    &first_entry)) {
7674 				if (!user_wire) {
7675 					panic("vm_map_unwire: re-lookup failed");
7676 				}
7677 				entry = first_entry->vme_next;
7678 			} else {
7679 				entry = first_entry;
7680 			}
7681 		}
7682 		last_timestamp = map->timestamp;
7683 
7684 		/*
7685 		 * clear transition bit for all constituent entries that
7686 		 * were in the original entry (saved in tmp_entry).  Also
7687 		 * check for waiters.
7688 		 */
7689 		while ((entry != vm_map_to_entry(map)) &&
7690 		    (entry->vme_start < tmp_entry.vme_end)) {
7691 			assert(entry->in_transition);
7692 			entry->in_transition = FALSE;
7693 			if (entry->needs_wakeup) {
7694 				entry->needs_wakeup = FALSE;
7695 				need_wakeup = TRUE;
7696 			}
7697 			entry = entry->vme_next;
7698 		}
7699 	}
7700 
7701 	/*
7702 	 * We might have fragmented the address space when we wired this
7703 	 * range of addresses.  Attempt to re-coalesce these VM map entries
7704 	 * with their neighbors now that they're no longer wired.
7705 	 * Under some circumstances, address space fragmentation can
7706 	 * prevent VM object shadow chain collapsing, which can cause
7707 	 * swap space leaks.
7708 	 */
7709 	vm_map_simplify_range(map, start, end);
7710 
7711 	vm_map_unlock(map);
7712 	/*
7713 	 * wake up anybody waiting on entries that we have unwired.
7714 	 */
7715 	if (need_wakeup) {
7716 		vm_map_entry_wakeup(map);
7717 	}
7718 	return KERN_SUCCESS;
7719 }
7720 
7721 kern_return_t
vm_map_unwire(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,boolean_t user_wire)7722 vm_map_unwire(
7723 	vm_map_t                map,
7724 	vm_map_offset_t         start,
7725 	vm_map_offset_t         end,
7726 	boolean_t               user_wire)
7727 {
7728 	return vm_map_unwire_nested(map, start, end,
7729 	           user_wire, (pmap_t)NULL, 0);
7730 }
7731 
7732 
7733 /*
7734  *	vm_map_entry_zap:	[ internal use only ]
7735  *
7736  *	Remove the entry from the target map
7737  *	and put it on a zap list.
7738  */
7739 static void
vm_map_entry_zap(vm_map_t map,vm_map_entry_t entry,vm_map_zap_t zap)7740 vm_map_entry_zap(
7741 	vm_map_t                map,
7742 	vm_map_entry_t          entry,
7743 	vm_map_zap_t            zap)
7744 {
7745 	vm_map_offset_t s, e;
7746 
7747 	s = entry->vme_start;
7748 	e = entry->vme_end;
7749 	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7750 	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7751 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7752 		assert(page_aligned(s));
7753 		assert(page_aligned(e));
7754 	}
7755 	if (entry->map_aligned == TRUE) {
7756 		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7757 		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7758 	}
7759 	assert(entry->wired_count == 0);
7760 	assert(entry->user_wired_count == 0);
7761 	assert(!entry->vme_permanent);
7762 
7763 	vm_map_store_entry_unlink(map, entry, false);
7764 	map->size -= e - s;
7765 
7766 	vm_map_zap_append(zap, entry);
7767 }
7768 
/*
 *	vm_map_submap_pmap_clean:
 *
 *	Remove the physical mappings established through "sub_map" for
 *	the range [start, end) of "map", where "offset" is the starting
 *	offset of that range within the submap.  Recurses through any
 *	nested submaps.  Takes a read lock on "sub_map" for the scan.
 */
static void
vm_map_submap_pmap_clean(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_t        sub_map,
	vm_map_offset_t offset)
{
	vm_map_offset_t submap_start;
	vm_map_offset_t submap_end;
	vm_map_size_t   remove_size;
	vm_map_entry_t  entry;

	/* [submap_start, submap_end) mirrors [start, end) inside the submap */
	submap_end = offset + (end - start);
	submap_start = offset;

	vm_map_lock_read(sub_map);
	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
		/* clip the first entry's size to the portion inside the range */
		remove_size = (entry->vme_end - entry->vme_start);
		if (offset > entry->vme_start) {
			remove_size -= offset - entry->vme_start;
		}


		if (submap_end < entry->vme_end) {
			remove_size -=
			    entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			/* nested submap: recurse to clean its mappings too */
			vm_map_submap_pmap_clean(
				sub_map,
				start,
				start + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				/*
				 * The map may be mapped in other pmaps as
				 * well: drop the mappings at the VM object
				 * level so that every pmap is covered.
				 */
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					(VME_OFFSET(entry) +
					offset -
					entry->vme_start),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				/* only mapped via this map: remove from its pmap */
				pmap_remove(map->pmap,
				    (addr64_t)start,
				    (addr64_t)(start + remove_size));
			}
		}
	}

	/*
	 * Move on to the following entries up to the end of the range.
	 * If the lookup above failed, "entry" points at the entry
	 * preceding "offset", so its successor is the right place to
	 * resume in either case.
	 */
	entry = entry->vme_next;

	while ((entry != vm_map_to_entry(sub_map))
	    && (entry->vme_start < submap_end)) {
		/* clip the entry's size to the portion inside the range */
		remove_size = (entry->vme_end - entry->vme_start);
		if (submap_end < entry->vme_end) {
			remove_size -= entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			vm_map_submap_pmap_clean(
				sub_map,
				(start + entry->vme_start) - offset,
				((start + entry->vme_start) - offset) + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					VME_OFFSET(entry),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				pmap_remove(map->pmap,
				    (addr64_t)((start + entry->vme_start)
				    - offset),
				    (addr64_t)(((start + entry->vme_start)
				    - offset) + remove_size));
			}
		}
		entry = entry->vme_next;
	}
	vm_map_unlock_read(sub_map);
	return;
}
7868 
7869 /*
7870  *     virt_memory_guard_ast:
7871  *
7872  *     Handle the AST callout for a virtual memory guard.
7873  *	   raise an EXC_GUARD exception and terminate the task
7874  *     if configured to do so.
7875  */
void
virt_memory_guard_ast(
	thread_t thread,
	mach_exception_data_type_t code,
	mach_exception_data_type_t subcode)
{
	task_t task = get_threadtask(thread);
	assert(task != kernel_task);
	assert(task == current_task());
	kern_return_t sync_exception_result;
	uint32_t behavior;

	behavior = task->task_exc_guard;

	/* Is delivery enabled */
	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
		return;
	}

	/* If only once, make sure we're that once */
	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;

		/*
		 * CAS loop: atomically clear the DELIVER bit so that only one
		 * thread of the task wins the right to deliver.  On CAS
		 * failure, re-read the behavior; if another thread already
		 * cleared DELIVER, it delivered instead of us, so bail out.
		 */
		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
			break;
		}
		behavior = task->task_exc_guard;
		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
			return;
		}
	}

	const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
	/* Raise exception synchronously and see if handler claimed it */
	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);

	if (fatal) {
		/*
		 * If Synchronous EXC_GUARD delivery was successful then
		 * kill the process and return, else kill the process
		 * and deliver the exception via EXC_CORPSE_NOTIFY.
		 */
		if (sync_exception_result == KERN_SUCCESS) {
			task_bsdtask_kill(current_task());
		} else {
			exit_with_guard_exception(current_proc(), code, subcode);
		}
	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
		/*
		 * If the synchronous EXC_GUARD delivery was not successful,
		 * raise a simulated crash.
		 */
		if (sync_exception_result != KERN_SUCCESS) {
			task_violated_guard(code, subcode, NULL, FALSE);
		}
	}
}
7933 
7934 /*
7935  *     vm_map_guard_exception:
7936  *
7937  *     Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7938  *
7939  *     Right now, we do this when we find nothing mapped, or a
7940  *     gap in the mapping when a user address space deallocate
7941  *     was requested. We report the address of the first gap found.
7942  */
7943 static void
vm_map_guard_exception(vm_map_offset_t gap_start,unsigned reason)7944 vm_map_guard_exception(
7945 	vm_map_offset_t gap_start,
7946 	unsigned reason)
7947 {
7948 	mach_exception_code_t code = 0;
7949 	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7950 	unsigned int target = 0; /* should we pass in pid associated with map? */
7951 	mach_exception_data_type_t subcode = (uint64_t)gap_start;
7952 	boolean_t fatal = FALSE;
7953 
7954 	task_t task = current_task_early();
7955 
7956 	/* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7957 	if (task == NULL || task == kernel_task) {
7958 		return;
7959 	}
7960 
7961 	EXC_GUARD_ENCODE_TYPE(code, guard_type);
7962 	EXC_GUARD_ENCODE_FLAVOR(code, reason);
7963 	EXC_GUARD_ENCODE_TARGET(code, target);
7964 
7965 	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7966 		fatal = TRUE;
7967 	}
7968 	thread_guard_violation(current_thread(), code, subcode, fatal);
7969 }
7970 
7971 static kern_return_t
vm_map_delete_submap_recurse(vm_map_t submap,vm_map_offset_t submap_start,vm_map_offset_t submap_end)7972 vm_map_delete_submap_recurse(
7973 	vm_map_t submap,
7974 	vm_map_offset_t submap_start,
7975 	vm_map_offset_t submap_end)
7976 {
7977 	vm_map_entry_t submap_entry;
7978 
7979 	/*
7980 	 * Verify that the submap does not contain any "permanent" entries
7981 	 * within the specified range.
7982 	 * We do not care about gaps.
7983 	 */
7984 
7985 	vm_map_lock(submap);
7986 
7987 	if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7988 		submap_entry = submap_entry->vme_next;
7989 	}
7990 
7991 	for (;
7992 	    submap_entry != vm_map_to_entry(submap) &&
7993 	    submap_entry->vme_start < submap_end;
7994 	    submap_entry = submap_entry->vme_next) {
7995 		if (submap_entry->vme_permanent) {
7996 			/* "permanent" entry -> fail */
7997 			vm_map_unlock(submap);
7998 			return KERN_PROTECTION_FAILURE;
7999 		}
8000 	}
8001 	/* no "permanent" entries in the range -> success */
8002 	vm_map_unlock(submap);
8003 	return KERN_SUCCESS;
8004 }
8005 
/* Panic: vm_map_delete() was given a "start" not aligned to the map's page size. */
__abortlike
static void
__vm_map_delete_misaligned_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
}
8016 
/* Panic: a vm_map_delete() that was expected to succeed failed with "kr". */
__abortlike
static void
__vm_map_delete_failed_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	kern_return_t           kr)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
	    map, (uint64_t)start, (uint64_t)end, kr);
}
8028 
/* Panic: vm_map_delete() found no map entry at "where" within [start, end). */
__abortlike
static void
__vm_map_delete_gap_panic(
	vm_map_t                map,
	vm_map_offset_t         where,
	vm_map_offset_t         start,
	vm_map_offset_t         end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
}
8040 
/* Panic: vm_map_delete() attempted to remove a "permanent" VM map entry. */
__abortlike
static void
__vm_map_delete_permanent_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_map_entry_t          entry)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): "
	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
	    map, (uint64_t)start, (uint64_t)end, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end);
}
8055 
/*
 * State flags threaded through the main loop of vm_map_delete().
 */
__options_decl(vm_map_delete_state_t, uint32_t, {
	VMDS_NONE               = 0x0000,

	VMDS_FOUND_GAP          = 0x0001,       /* a hole was found in the range */
	VMDS_GAPS_OK            = 0x0002,       /* map is terminated/unreferenced: gaps tolerated */

	VMDS_KERNEL_PMAP        = 0x0004,       /* map uses the kernel pmap: strict checks */
	VMDS_NEEDS_LOOKUP       = 0x0008,       /* map lock was dropped: re-lookup the entry */
	VMDS_NEEDS_WAKEUP       = 0x0010,       /* wake threads waiting on deleted entries */
	VMDS_KERNEL_KMEMPTR     = 0x0020        /* range belongs to a kmem pointer range */
});
8067 
8068 /*
8069  *	vm_map_delete:	[ internal use only ]
8070  *
8071  *	Deallocates the given address range from the target map.
8072  *	Removes all user wirings. Unwires one kernel wiring if
8073  *	VM_MAP_REMOVE_KUNWIRE is set.  Waits for kernel wirings to go
8074  *	away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set.  Sleeps
8075  *	interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
8076  *
8077  *
8078  *	When the map is a kernel map, then any error in removing mappings
8079  *	will lead to a panic so that clients do not have to repeat the panic
8080  *	code at each call site.  If VM_MAP_REMOVE_INTERRUPTIBLE
8081  *	is also passed, then KERN_ABORTED will not lead to a panic.
8082  *
8083  *	This routine is called with map locked and leaves map locked.
8084  */
8085 static kmem_return_t
vm_map_delete(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard,vm_map_zap_t zap_list)8086 vm_map_delete(
8087 	vm_map_t                map,
8088 	vm_map_offset_t         start,
8089 	vm_map_offset_t         end,
8090 	vmr_flags_t             flags,
8091 	kmem_guard_t            guard,
8092 	vm_map_zap_t            zap_list)
8093 {
8094 	vm_map_entry_t          entry, next;
8095 	int                     interruptible;
8096 	vm_map_offset_t         gap_start = 0;
8097 	vm_map_offset_t         clear_in_transition_end = 0;
8098 	__unused vm_map_offset_t save_start = start;
8099 	__unused vm_map_offset_t save_end = end;
8100 	vm_map_delete_state_t   state = VMDS_NONE;
8101 	kmem_return_t           ret = { };
8102 	vm_map_range_id_t       range_id = 0;
8103 	struct kmem_page_meta  *meta = NULL;
8104 	uint32_t                size_idx, slot_idx;
8105 	struct mach_vm_range    slot;
8106 
8107 	if (vm_map_pmap(map) == kernel_pmap) {
8108 		state |= VMDS_KERNEL_PMAP;
8109 		range_id = kmem_addr_get_range(start, end - start);
8110 		if (kmem_is_ptr_range(range_id)) {
8111 			state |= VMDS_KERNEL_KMEMPTR;
8112 			slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8113 			    &size_idx, &slot);
8114 		}
8115 	}
8116 
8117 	if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8118 		state |= VMDS_GAPS_OK;
8119 	}
8120 
8121 	if (map->corpse_source &&
8122 	    !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8123 	    !map->terminated) {
8124 		/*
8125 		 * The map is being used for corpses related diagnostics.
8126 		 * So skip any entry removal to avoid perturbing the map state.
8127 		 * The cleanup will happen in task_terminate_internal after the
8128 		 * call to task_port_no_senders.
8129 		 */
8130 		goto out;
8131 	}
8132 
8133 	interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8134 	    THREAD_ABORTSAFE : THREAD_UNINT;
8135 
8136 	if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8137 	    (start & VM_MAP_PAGE_MASK(map))) {
8138 		__vm_map_delete_misaligned_panic(map, start, end);
8139 	}
8140 
8141 	if ((state & VMDS_GAPS_OK) == 0) {
8142 		/*
8143 		 * If the map isn't terminated then all deletions must have
8144 		 * no gaps, and be within the [min, max) of the map.
8145 		 *
8146 		 * We got here without VM_MAP_RANGE_CHECK() being called,
8147 		 * and hence must validate bounds manually.
8148 		 *
8149 		 * It is worth noting that because vm_deallocate() will
8150 		 * round_page() the deallocation size, it's possible for "end"
8151 		 * to be 0 here due to overflow. We hence must treat it as being
8152 		 * beyond vm_map_max(map).
8153 		 *
		 * Similarly, end < start means some wrap around happened,
8155 		 * which should cause an error or panic.
8156 		 */
8157 		if (end == 0 || end > vm_map_max(map)) {
8158 			state |= VMDS_FOUND_GAP;
8159 			gap_start = vm_map_max(map);
8160 			if (state & VMDS_KERNEL_PMAP) {
8161 				__vm_map_delete_gap_panic(map,
8162 				    gap_start, start, end);
8163 			}
8164 			goto out;
8165 		}
8166 
8167 		if (end < start) {
8168 			if (state & VMDS_KERNEL_PMAP) {
8169 				__vm_map_delete_gap_panic(map,
8170 				    vm_map_max(map), start, end);
8171 			}
8172 			ret.kmr_return = KERN_INVALID_ARGUMENT;
8173 			goto out;
8174 		}
8175 
8176 		if (start < vm_map_min(map)) {
8177 			state |= VMDS_FOUND_GAP;
8178 			gap_start = start;
8179 			if (state & VMDS_KERNEL_PMAP) {
8180 				__vm_map_delete_gap_panic(map,
8181 				    gap_start, start, end);
8182 			}
8183 			goto out;
8184 		}
8185 	} else {
8186 		/*
8187 		 * If the map is terminated, we must accept start/end
8188 		 * being beyond the boundaries of the map as this is
8189 		 * how some of the mappings like commpage mappings
8190 		 * can be destroyed (they're outside of those bounds).
8191 		 *
8192 		 * end < start is still something we can't cope with,
8193 		 * so just bail.
8194 		 */
8195 		if (end < start) {
8196 			goto out;
8197 		}
8198 	}
8199 
8200 
8201 	/*
8202 	 *	Find the start of the region.
8203 	 *
8204 	 *	If in a superpage, extend the range
8205 	 *	to include the start of the mapping.
8206 	 */
8207 	while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8208 		if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8209 			start = SUPERPAGE_ROUND_DOWN(start);
8210 		} else {
8211 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8212 			break;
8213 		}
8214 	}
8215 
8216 	if (entry->superpage_size) {
8217 		end = SUPERPAGE_ROUND_UP(end);
8218 	}
8219 
8220 	/*
8221 	 *	Step through all entries in this region
8222 	 */
8223 	for (vm_map_offset_t s = start; s < end;) {
8224 		/*
8225 		 * At this point, we have deleted all the memory entries
8226 		 * in [start, s) and are proceeding with the [s, end) range.
8227 		 *
8228 		 * This loop might drop the map lock, and it is possible that
8229 		 * some memory was already reallocated within [start, s)
8230 		 * and we don't want to mess with those entries.
8231 		 *
8232 		 * Some of those entries could even have been re-assembled
8233 		 * with an entry after "s" (in vm_map_simplify_entry()), so
8234 		 * we may have to vm_map_clip_start() again.
8235 		 *
8236 		 * When clear_in_transition_end is set, the we had marked
8237 		 * [start, clear_in_transition_end) as "in_transition"
8238 		 * during a previous iteration and we need to clear it.
8239 		 */
8240 
8241 		/*
8242 		 * Step 1: If needed (because we dropped locks),
8243 		 *         lookup the entry again.
8244 		 *
8245 		 *         If we're coming back from unwiring (Step 5),
8246 		 *         we also need to mark the entries as no longer
8247 		 *         in transition after that.
8248 		 */
8249 
8250 		if (state & VMDS_NEEDS_LOOKUP) {
8251 			state &= ~VMDS_NEEDS_LOOKUP;
8252 
8253 			if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8254 				SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8255 			}
8256 
8257 			if (state & VMDS_KERNEL_KMEMPTR) {
8258 				kmem_validate_slot(s, meta, size_idx, slot_idx);
8259 			}
8260 		}
8261 
8262 		if (clear_in_transition_end) {
8263 			for (vm_map_entry_t it = entry;
8264 			    it != vm_map_to_entry(map) &&
8265 			    it->vme_start < clear_in_transition_end;
8266 			    it = it->vme_next) {
8267 				assert(it->in_transition);
8268 				it->in_transition = FALSE;
8269 				if (it->needs_wakeup) {
8270 					it->needs_wakeup = FALSE;
8271 					state |= VMDS_NEEDS_WAKEUP;
8272 				}
8273 			}
8274 
8275 			clear_in_transition_end = 0;
8276 		}
8277 
8278 
8279 		/*
8280 		 * Step 2: Perform various policy checks
8281 		 *         before we do _anything_ to this entry.
8282 		 */
8283 
8284 		if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8285 			if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8286 				/*
8287 				 * Either we found a gap already,
8288 				 * or we are tearing down a map,
8289 				 * keep going.
8290 				 */
8291 			} else if (state & VMDS_KERNEL_PMAP) {
8292 				__vm_map_delete_gap_panic(map, s, start, end);
8293 			} else if (s < end) {
8294 				state |= VMDS_FOUND_GAP;
8295 				gap_start = s;
8296 			}
8297 
8298 			if (entry == vm_map_to_entry(map) ||
8299 			    end <= entry->vme_start) {
8300 				break;
8301 			}
8302 
8303 			s = entry->vme_start;
8304 		}
8305 
8306 		if (state & VMDS_KERNEL_PMAP) {
8307 			/*
8308 			 * In the kernel map and its submaps,
8309 			 * permanent entries never die, even
8310 			 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8311 			 */
8312 			if (entry->vme_permanent) {
8313 				__vm_map_delete_permanent_panic(map, start, end, entry);
8314 			}
8315 
8316 			if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8317 				end = entry->vme_end;
8318 				flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8319 			}
8320 
8321 			/*
8322 			 * In the kernel map and its submaps,
8323 			 * the removal of an atomic/guarded entry is strict.
8324 			 *
8325 			 * An atomic entry is processed only if it was
8326 			 * specifically targeted.
8327 			 *
8328 			 * We might have deleted non-atomic entries before
			 * we reach this point however...
8330 			 */
8331 			kmem_entry_validate_guard(map, entry,
8332 			    start, end - start, guard);
8333 		}
8334 
8335 		/*
8336 		 * Step 2.1: handle "permanent" and "submap" entries
8337 		 * *before* clipping to avoid triggering some unnecessary
8338 		 * un-nesting of the shared region.
8339 		 */
8340 		if (entry->vme_permanent && entry->is_sub_map) {
8341 //			printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8342 			/*
8343 			 * Un-mapping a "permanent" mapping of a user-space
8344 			 * submap is not allowed unless...
8345 			 */
8346 			if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8347 				/*
8348 				 * a. explicitly requested by the kernel caller.
8349 				 */
8350 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8351 			} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8352 			    developer_mode_state()) {
8353 				/*
8354 				 * b. we're in "developer" mode (for
8355 				 *    breakpoints, dtrace probes, ...).
8356 				 */
8357 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8358 			} else if (map->terminated) {
8359 				/*
8360 				 * c. this is the final address space cleanup.
8361 				 */
8362 //				printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8363 			} else {
8364 				vm_map_offset_t submap_start, submap_end;
8365 				kern_return_t submap_kr;
8366 
8367 				/*
8368 				 * Check if there are any "permanent" mappings
8369 				 * in this range in the submap.
8370 				 */
8371 				if (entry->in_transition) {
8372 					/* can that even happen ? */
8373 					goto in_transition;
8374 				}
8375 				/* compute the clipped range in the submap */
8376 				submap_start = s - entry->vme_start;
8377 				submap_start += VME_OFFSET(entry);
8378 				submap_end = end - entry->vme_start;
8379 				submap_end += VME_OFFSET(entry);
8380 				submap_kr = vm_map_delete_submap_recurse(
8381 					VME_SUBMAP(entry),
8382 					submap_start,
8383 					submap_end);
8384 				if (submap_kr != KERN_SUCCESS) {
8385 					/*
8386 					 * There are some "permanent" mappings
8387 					 * in the submap: we are not allowed
8388 					 * to remove this range.
8389 					 */
8390 					printf("%d[%s] removing permanent submap entry "
8391 					    "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8392 					    proc_selfpid(),
8393 					    (get_bsdtask_info(current_task())
8394 					    ? proc_name_address(get_bsdtask_info(current_task()))
8395 					    : "?"), entry,
8396 					    (uint64_t)entry->vme_start,
8397 					    (uint64_t)entry->vme_end,
8398 					    entry->protection,
8399 					    entry->max_protection);
8400 					DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8401 					    vm_map_entry_t, entry,
8402 					    vm_map_offset_t, entry->vme_start,
8403 					    vm_map_offset_t, entry->vme_end,
8404 					    vm_prot_t, entry->protection,
8405 					    vm_prot_t, entry->max_protection,
8406 					    int, VME_ALIAS(entry));
8407 					ret.kmr_return = KERN_PROTECTION_FAILURE;
8408 					goto out;
8409 				}
8410 				/* no permanent mappings: proceed */
8411 			}
8412 		}
8413 
8414 		/*
8415 		 * Step 3: Perform any clipping needed.
8416 		 *
8417 		 *         After this, "entry" starts at "s", ends before "end"
8418 		 */
8419 
8420 		if (entry->vme_start < s) {
8421 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8422 			    entry->map_aligned &&
8423 			    !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8424 				/*
8425 				 * The entry will no longer be map-aligned
8426 				 * after clipping and the caller said it's OK.
8427 				 */
8428 				entry->map_aligned = FALSE;
8429 			}
8430 			vm_map_clip_start(map, entry, s);
8431 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8432 		}
8433 
8434 		if (end < entry->vme_end) {
8435 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8436 			    entry->map_aligned &&
8437 			    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8438 				/*
8439 				 * The entry will no longer be map-aligned
8440 				 * after clipping and the caller said it's OK.
8441 				 */
8442 				entry->map_aligned = FALSE;
8443 			}
8444 			vm_map_clip_end(map, entry, end);
8445 		}
8446 
8447 		if (entry->vme_permanent && entry->is_sub_map) {
8448 			/*
8449 			 * We already went through step 2.1 which did not deny
8450 			 * the removal of this "permanent" and "is_sub_map"
8451 			 * entry.
8452 			 * Now that we've clipped what we actually want to
8453 			 * delete, undo the "permanent" part to allow the
8454 			 * removal to proceed.
8455 			 */
8456 			DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8457 			    vm_map_entry_t, entry,
8458 			    vm_map_offset_t, entry->vme_start,
8459 			    vm_map_offset_t, entry->vme_end,
8460 			    vm_prot_t, entry->protection,
8461 			    vm_prot_t, entry->max_protection,
8462 			    int, VME_ALIAS(entry));
8463 			entry->vme_permanent = false;
8464 		}
8465 
8466 		assert(s == entry->vme_start);
8467 		assert(entry->vme_end <= end);
8468 
8469 
8470 		/*
8471 		 * Step 4: If the entry is in flux, wait for this to resolve.
8472 		 */
8473 
8474 		if (entry->in_transition) {
8475 			wait_result_t wait_result;
8476 
8477 in_transition:
8478 			/*
8479 			 * Another thread is wiring/unwiring this entry.
8480 			 * Let the other thread know we are waiting.
8481 			 */
8482 
8483 			entry->needs_wakeup = TRUE;
8484 
8485 			/*
8486 			 * wake up anybody waiting on entries that we have
8487 			 * already unwired/deleted.
8488 			 */
8489 			if (state & VMDS_NEEDS_WAKEUP) {
8490 				vm_map_entry_wakeup(map);
8491 				state &= ~VMDS_NEEDS_WAKEUP;
8492 			}
8493 
8494 			wait_result = vm_map_entry_wait(map, interruptible);
8495 
8496 			if (interruptible &&
8497 			    wait_result == THREAD_INTERRUPTED) {
8498 				/*
8499 				 * We do not clear the needs_wakeup flag,
8500 				 * since we cannot tell if we were the only one.
8501 				 */
8502 				ret.kmr_return = KERN_ABORTED;
8503 				return ret;
8504 			}
8505 
8506 			/*
8507 			 * The entry could have been clipped or it
8508 			 * may not exist anymore.  Look it up again.
8509 			 */
8510 			state |= VMDS_NEEDS_LOOKUP;
8511 			continue;
8512 		}
8513 
8514 
8515 		/*
8516 		 * Step 5: Handle wiring
8517 		 */
8518 
8519 		if (entry->wired_count) {
8520 			struct vm_map_entry tmp_entry;
8521 			boolean_t           user_wire;
8522 			unsigned int        last_timestamp;
8523 
8524 			user_wire = entry->user_wired_count > 0;
8525 
8526 			/*
8527 			 *      Remove a kernel wiring if requested
8528 			 */
8529 			if (flags & VM_MAP_REMOVE_KUNWIRE) {
8530 				entry->wired_count--;
8531 				vme_btref_consider_and_put(entry);
8532 			}
8533 
8534 			/*
8535 			 *	Remove all user wirings for proper accounting
8536 			 */
8537 			while (entry->user_wired_count) {
8538 				subtract_wire_counts(map, entry, user_wire);
8539 			}
8540 
8541 			/*
8542 			 * All our DMA I/O operations in IOKit are currently
8543 			 * done by wiring through the map entries of the task
8544 			 * requesting the I/O.
8545 			 *
8546 			 * Because of this, we must always wait for kernel wirings
8547 			 * to go away on the entries before deleting them.
8548 			 *
8549 			 * Any caller who wants to actually remove a kernel wiring
8550 			 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8551 			 * properly remove one wiring instead of blasting through
8552 			 * them all.
8553 			 */
8554 			if (entry->wired_count != 0) {
8555 				assert(map != kernel_map);
8556 				/*
8557 				 * Cannot continue.  Typical case is when
8558 				 * a user thread has physical io pending on
8559 				 * on this page.  Either wait for the
8560 				 * kernel wiring to go away or return an
8561 				 * error.
8562 				 */
8563 				wait_result_t wait_result;
8564 
8565 				entry->needs_wakeup = TRUE;
8566 				wait_result = vm_map_entry_wait(map,
8567 				    interruptible);
8568 
8569 				if (interruptible &&
8570 				    wait_result == THREAD_INTERRUPTED) {
8571 					/*
8572 					 * We do not clear the
8573 					 * needs_wakeup flag, since we
8574 					 * cannot tell if we were the
8575 					 * only one.
8576 					 */
8577 					ret.kmr_return = KERN_ABORTED;
8578 					return ret;
8579 				}
8580 
8581 
8582 				/*
8583 				 * The entry could have been clipped or
8584 				 * it may not exist anymore.  Look it
8585 				 * up again.
8586 				 */
8587 				state |= VMDS_NEEDS_LOOKUP;
8588 				continue;
8589 			}
8590 
8591 			/*
8592 			 * We can unlock the map now.
8593 			 *
8594 			 * The entry might be split once we unlock the map,
8595 			 * but we need the range as defined by this entry
8596 			 * to be stable. So we must make a local copy.
8597 			 *
8598 			 * The underlying objects do not change during clips,
			 * and the in_transition state guarantees existence
8600 			 * of the entry.
8601 			 */
8602 			last_timestamp = map->timestamp;
8603 			entry->in_transition = TRUE;
8604 			tmp_entry = *entry;
8605 			vm_map_unlock(map);
8606 
8607 			if (tmp_entry.is_sub_map) {
8608 				vm_map_t sub_map;
8609 				vm_map_offset_t sub_start, sub_end;
8610 				pmap_t pmap;
8611 				vm_map_offset_t pmap_addr;
8612 
8613 
8614 				sub_map = VME_SUBMAP(&tmp_entry);
8615 				sub_start = VME_OFFSET(&tmp_entry);
8616 				sub_end = sub_start + (tmp_entry.vme_end -
8617 				    tmp_entry.vme_start);
8618 				if (tmp_entry.use_pmap) {
8619 					pmap = sub_map->pmap;
8620 					pmap_addr = tmp_entry.vme_start;
8621 				} else {
8622 					pmap = map->pmap;
8623 					pmap_addr = tmp_entry.vme_start;
8624 				}
8625 				(void) vm_map_unwire_nested(sub_map,
8626 				    sub_start, sub_end,
8627 				    user_wire,
8628 				    pmap, pmap_addr);
8629 			} else {
8630 				vm_map_offset_t entry_end = tmp_entry.vme_end;
8631 				vm_map_offset_t max_end;
8632 
8633 				if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8634 					max_end = end - VM_MAP_PAGE_SIZE(map);
8635 					if (entry_end > max_end) {
8636 						entry_end = max_end;
8637 					}
8638 				}
8639 
8640 				if (tmp_entry.vme_kernel_object) {
8641 					pmap_protect_options(
8642 						map->pmap,
8643 						tmp_entry.vme_start,
8644 						entry_end,
8645 						VM_PROT_NONE,
8646 						PMAP_OPTIONS_REMOVE,
8647 						NULL);
8648 				}
8649 				vm_fault_unwire(map, &tmp_entry,
8650 				    tmp_entry.vme_kernel_object, map->pmap,
8651 				    tmp_entry.vme_start, entry_end);
8652 			}
8653 
8654 			vm_map_lock(map);
8655 
8656 			/*
8657 			 * Unwiring happened, we can now go back to deleting
8658 			 * them (after we clear the in_transition bit for the range).
8659 			 */
8660 			if (last_timestamp + 1 != map->timestamp) {
8661 				state |= VMDS_NEEDS_LOOKUP;
8662 			}
8663 			clear_in_transition_end = tmp_entry.vme_end;
8664 			continue;
8665 		}
8666 
8667 		assert(entry->wired_count == 0);
8668 		assert(entry->user_wired_count == 0);
8669 
8670 
8671 		/*
8672 		 * Step 6: Entry is unwired and ready for us to delete !
8673 		 */
8674 
8675 		if (!entry->vme_permanent) {
8676 			/*
8677 			 * Typical case: the entry really shouldn't be permanent
8678 			 */
8679 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8680 		    (entry->protection & VM_PROT_EXECUTE) &&
8681 		    developer_mode_state()) {
8682 			/*
8683 			 * Allow debuggers to undo executable mappings
8684 			 * when developer mode is on.
8685 			 */
8686 #if 0
8687 			printf("FBDP %d[%s] removing permanent executable entry "
8688 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8689 			    proc_selfpid(),
8690 			    (current_task()->bsd_info
8691 			    ? proc_name_address(current_task()->bsd_info)
8692 			    : "?"), entry,
8693 			    (uint64_t)entry->vme_start,
8694 			    (uint64_t)entry->vme_end,
8695 			    entry->protection,
8696 			    entry->max_protection);
8697 #endif
8698 			entry->vme_permanent = FALSE;
8699 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8700 #if 0
8701 			printf("FBDP %d[%s] removing permanent entry "
8702 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8703 			    proc_selfpid(),
8704 			    (current_task()->bsd_info
8705 			    ? proc_name_address(current_task()->bsd_info)
8706 			    : "?"), entry,
8707 			    (uint64_t)entry->vme_start,
8708 			    (uint64_t)entry->vme_end,
8709 			    entry->protection,
8710 			    entry->max_protection);
8711 #endif
8712 			entry->vme_permanent = FALSE;
8713 #if CODE_SIGNING_MONITOR
8714 		} else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8715 			entry->vme_permanent = FALSE;
8716 
8717 			printf("%d[%s] %s(0x%llx,0x%llx): "
8718 			    "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8719 			    "prot 0x%x/0x%x\n",
8720 			    proc_selfpid(),
8721 			    (get_bsdtask_info(current_task())
8722 			    ? proc_name_address(get_bsdtask_info(current_task()))
8723 			    : "?"),
8724 			    __FUNCTION__,
8725 			    (uint64_t)start,
8726 			    (uint64_t)end,
8727 			    (uint64_t)entry->vme_start,
8728 			    (uint64_t)entry->vme_end,
8729 			    entry->protection,
8730 			    entry->max_protection);
8731 #endif
8732 		} else {
8733 			DTRACE_VM6(vm_map_delete_permanent,
8734 			    vm_map_entry_t, entry,
8735 			    vm_map_offset_t, entry->vme_start,
8736 			    vm_map_offset_t, entry->vme_end,
8737 			    vm_prot_t, entry->protection,
8738 			    vm_prot_t, entry->max_protection,
8739 			    int, VME_ALIAS(entry));
8740 		}
8741 
8742 		if (entry->is_sub_map) {
8743 			assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8744 			    "map %p (%d) entry %p submap %p (%d)\n",
8745 			    map, VM_MAP_PAGE_SHIFT(map), entry,
8746 			    VME_SUBMAP(entry),
8747 			    VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8748 			if (entry->use_pmap) {
8749 #ifndef NO_NESTED_PMAP
8750 				int pmap_flags;
8751 
8752 				if (map->terminated) {
8753 					/*
8754 					 * This is the final cleanup of the
8755 					 * address space being terminated.
8756 					 * No new mappings are expected and
8757 					 * we don't really need to unnest the
8758 					 * shared region (and lose the "global"
8759 					 * pmap mappings, if applicable).
8760 					 *
8761 					 * Tell the pmap layer that we're
8762 					 * "clean" wrt nesting.
8763 					 */
8764 					pmap_flags = PMAP_UNNEST_CLEAN;
8765 				} else {
8766 					/*
8767 					 * We're unmapping part of the nested
8768 					 * shared region, so we can't keep the
8769 					 * nested pmap.
8770 					 */
8771 					pmap_flags = 0;
8772 				}
8773 				pmap_unnest_options(
8774 					map->pmap,
8775 					(addr64_t)entry->vme_start,
8776 					entry->vme_end - entry->vme_start,
8777 					pmap_flags);
8778 #endif  /* NO_NESTED_PMAP */
8779 				if (map->mapped_in_other_pmaps &&
8780 				    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8781 					/* clean up parent map/maps */
8782 					vm_map_submap_pmap_clean(
8783 						map, entry->vme_start,
8784 						entry->vme_end,
8785 						VME_SUBMAP(entry),
8786 						VME_OFFSET(entry));
8787 				}
8788 			} else {
8789 				vm_map_submap_pmap_clean(
8790 					map, entry->vme_start, entry->vme_end,
8791 					VME_SUBMAP(entry),
8792 					VME_OFFSET(entry));
8793 			}
8794 		} else if (entry->vme_kernel_object ||
8795 		    VME_OBJECT(entry) == compressor_object) {
8796 			/*
8797 			 * nothing to do
8798 			 */
8799 		} else if (map->mapped_in_other_pmaps &&
8800 		    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8801 			vm_object_pmap_protect_options(
8802 				VME_OBJECT(entry), VME_OFFSET(entry),
8803 				entry->vme_end - entry->vme_start,
8804 				PMAP_NULL,
8805 				PAGE_SIZE,
8806 				entry->vme_start,
8807 				VM_PROT_NONE,
8808 				PMAP_OPTIONS_REMOVE);
8809 		} else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8810 		    (state & VMDS_KERNEL_PMAP)) {
8811 			/* Remove translations associated
8812 			 * with this range unless the entry
8813 			 * does not have an object, or
8814 			 * it's the kernel map or a descendant
8815 			 * since the platform could potentially
8816 			 * create "backdoor" mappings invisible
8817 			 * to the VM. It is expected that
8818 			 * objectless, non-kernel ranges
8819 			 * do not have such VM invisible
8820 			 * translations.
8821 			 */
8822 			pmap_remove_options(map->pmap,
8823 			    (addr64_t)entry->vme_start,
8824 			    (addr64_t)entry->vme_end,
8825 			    PMAP_OPTIONS_REMOVE);
8826 		}
8827 
8828 #if DEBUG
8829 		/*
8830 		 * All pmap mappings for this map entry must have been
8831 		 * cleared by now.
8832 		 */
8833 		assert(pmap_is_empty(map->pmap,
8834 		    entry->vme_start,
8835 		    entry->vme_end));
8836 #endif /* DEBUG */
8837 
8838 		if (entry->iokit_acct) {
8839 			/* alternate accounting */
8840 			DTRACE_VM4(vm_map_iokit_unmapped_region,
8841 			    vm_map_t, map,
8842 			    vm_map_offset_t, entry->vme_start,
8843 			    vm_map_offset_t, entry->vme_end,
8844 			    int, VME_ALIAS(entry));
8845 			vm_map_iokit_unmapped_region(map,
8846 			    (entry->vme_end -
8847 			    entry->vme_start));
8848 			entry->iokit_acct = FALSE;
8849 			entry->use_pmap = FALSE;
8850 		}
8851 
8852 		/* move "s" forward */
8853 		s    = entry->vme_end;
8854 		next = entry->vme_next;
8855 		if (!entry->map_aligned) {
8856 			vm_map_offset_t rounded_s;
8857 
8858 			/*
8859 			 * Skip artificial gap due to mis-aligned entry
8860 			 * on devices with a page size smaller than the
8861 			 * map's page size (i.e. 16k task on a 4k device).
8862 			 */
8863 			rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8864 			if (next == vm_map_to_entry(map)) {
8865 				s = rounded_s;
8866 			} else if (s < rounded_s) {
8867 				s = MIN(rounded_s, next->vme_start);
8868 			}
8869 		}
8870 		ret.kmr_size += s - entry->vme_start;
8871 
8872 		if (entry->vme_permanent) {
8873 			/*
8874 			 * A permanent entry can not be removed, so leave it
8875 			 * in place but remove all access permissions.
8876 			 */
8877 			if (!entry->csm_associated) {
8878 				printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8879 				    __FUNCTION__, __LINE__,
8880 				    proc_selfpid(),
8881 				    (get_bsdtask_info(current_task())
8882 				    ? proc_name_address(get_bsdtask_info(current_task()))
8883 				    : "?"),
8884 				    map,
8885 				    entry,
8886 				    (uint64_t)entry->vme_start,
8887 				    (uint64_t)entry->vme_end,
8888 				    entry->is_sub_map,
8889 				    entry->protection,
8890 				    entry->max_protection);
8891 			}
8892 			DTRACE_VM6(vm_map_delete_permanent_prot_none,
8893 			    vm_map_entry_t, entry,
8894 			    vm_map_offset_t, entry->vme_start,
8895 			    vm_map_offset_t, entry->vme_end,
8896 			    vm_prot_t, entry->protection,
8897 			    vm_prot_t, entry->max_protection,
8898 			    int, VME_ALIAS(entry));
8899 			entry->protection = VM_PROT_NONE;
8900 			entry->max_protection = VM_PROT_NONE;
8901 		} else {
8902 			vm_map_entry_zap(map, entry, zap_list);
8903 		}
8904 
8905 		entry = next;
8906 		next  = VM_MAP_ENTRY_NULL;
8907 
8908 		if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8909 			unsigned int last_timestamp = map->timestamp++;
8910 
8911 			if (lck_rw_lock_yield_exclusive(&map->lock,
8912 			    LCK_RW_YIELD_ANY_WAITER)) {
8913 				if (last_timestamp != map->timestamp + 1) {
8914 					state |= VMDS_NEEDS_LOOKUP;
8915 				}
8916 			} else {
8917 				/* we didn't yield, undo our change */
8918 				map->timestamp--;
8919 			}
8920 		}
8921 	}
8922 
8923 	if (map->wait_for_space) {
8924 		thread_wakeup((event_t) map);
8925 	}
8926 
8927 	if (state & VMDS_NEEDS_WAKEUP) {
8928 		vm_map_entry_wakeup(map);
8929 	}
8930 
8931 out:
8932 	if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8933 		__vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8934 	}
8935 
8936 	if (state & VMDS_KERNEL_KMEMPTR) {
8937 		kmem_free_space(start, end, range_id, &slot);
8938 	}
8939 
8940 	if (state & VMDS_FOUND_GAP) {
8941 		DTRACE_VM3(kern_vm_deallocate_gap,
8942 		    vm_map_offset_t, gap_start,
8943 		    vm_map_offset_t, save_start,
8944 		    vm_map_offset_t, save_end);
8945 		if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8946 			ret.kmr_return = KERN_INVALID_VALUE;
8947 		} else {
8948 			vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8949 		}
8950 	}
8951 
8952 	return ret;
8953 }
8954 
8955 kmem_return_t
vm_map_remove_and_unlock(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8956 vm_map_remove_and_unlock(
8957 	vm_map_t        map,
8958 	vm_map_offset_t start,
8959 	vm_map_offset_t end,
8960 	vmr_flags_t     flags,
8961 	kmem_guard_t    guard)
8962 {
8963 	kmem_return_t ret;
8964 	VM_MAP_ZAP_DECLARE(zap);
8965 
8966 	ret = vm_map_delete(map, start, end, flags, guard, &zap);
8967 	vm_map_unlock(map);
8968 
8969 	vm_map_zap_dispose(&zap);
8970 
8971 	return ret;
8972 }
8973 
8974 /*
8975  *	vm_map_remove_guard:
8976  *
8977  *	Remove the given address range from the target map.
8978  *	This is the exported form of vm_map_delete.
8979  */
8980 kmem_return_t
vm_map_remove_guard(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8981 vm_map_remove_guard(
8982 	vm_map_t        map,
8983 	vm_map_offset_t start,
8984 	vm_map_offset_t end,
8985 	vmr_flags_t     flags,
8986 	kmem_guard_t    guard)
8987 {
8988 	vm_map_lock(map);
8989 	return vm_map_remove_and_unlock(map, start, end, flags, guard);
8990 }
8991 
8992 /*
8993  *	vm_map_terminate:
8994  *
8995  *	Clean out a task's map.
8996  */
8997 kern_return_t
vm_map_terminate(vm_map_t map)8998 vm_map_terminate(
8999 	vm_map_t        map)
9000 {
9001 	vm_map_lock(map);
9002 	map->terminated = TRUE;
9003 	vm_map_disable_hole_optimization(map);
9004 	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
9005 	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
9006 	return KERN_SUCCESS;
9007 }
9008 
9009 /*
9010  *	Routine:	vm_map_copy_allocate
9011  *
9012  *	Description:
9013  *		Allocates and initializes a map copy object.
9014  */
9015 static vm_map_copy_t
vm_map_copy_allocate(uint16_t type)9016 vm_map_copy_allocate(uint16_t type)
9017 {
9018 	vm_map_copy_t new_copy;
9019 
9020 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
9021 	new_copy->type = type;
9022 	if (type == VM_MAP_COPY_ENTRY_LIST) {
9023 		new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
9024 		vm_map_store_init(&new_copy->cpy_hdr);
9025 	}
9026 	return new_copy;
9027 }
9028 
9029 /*
9030  *	Routine:	vm_map_copy_discard
9031  *
9032  *	Description:
9033  *		Dispose of a map copy object (returned by
9034  *		vm_map_copyin).
9035  */
9036 void
vm_map_copy_discard(vm_map_copy_t copy)9037 vm_map_copy_discard(
9038 	vm_map_copy_t   copy)
9039 {
9040 	if (copy == VM_MAP_COPY_NULL) {
9041 		return;
9042 	}
9043 
9044 	/*
9045 	 * Assert that the vm_map_copy is coming from the right
9046 	 * zone and hasn't been forged
9047 	 */
9048 	vm_map_copy_require(copy);
9049 
9050 	switch (copy->type) {
9051 	case VM_MAP_COPY_ENTRY_LIST:
9052 		while (vm_map_copy_first_entry(copy) !=
9053 		    vm_map_copy_to_entry(copy)) {
9054 			vm_map_entry_t  entry = vm_map_copy_first_entry(copy);
9055 
9056 			vm_map_copy_entry_unlink(copy, entry);
9057 			if (entry->is_sub_map) {
9058 				vm_map_deallocate(VME_SUBMAP(entry));
9059 			} else {
9060 				vm_object_deallocate(VME_OBJECT(entry));
9061 			}
9062 			vm_map_copy_entry_dispose(entry);
9063 		}
9064 		break;
9065 	case VM_MAP_COPY_KERNEL_BUFFER:
9066 
9067 		/*
9068 		 * The vm_map_copy_t and possibly the data buffer were
9069 		 * allocated by a single call to kalloc_data(), i.e. the
9070 		 * vm_map_copy_t was not allocated out of the zone.
9071 		 */
9072 		if (copy->size > msg_ool_size_small || copy->offset) {
9073 			panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9074 			    (long long)copy->size, (long long)copy->offset);
9075 		}
9076 		kfree_data(copy->cpy_kdata, copy->size);
9077 	}
9078 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9079 }
9080 
9081 #if XNU_PLATFORM_MacOSX
9082 
/*
 *	Routine:	vm_map_copy_copy
 *
 *	Description:
 *			Move the information in a map copy object to
 *			a new map copy object, leaving the old one
 *			empty.
 *
 *			This is used by kernel routines that need
 *			to look at out-of-line data (in copyin form)
 *			before deciding whether to return SUCCESS.
 *			If the routine returns FAILURE, the original
 *			copy object will be deallocated; therefore,
 *			these routines must make a copy of the copy
 *			object and leave the original empty so that
 *			deallocation will not fail.
 */
vm_map_copy_t
vm_map_copy_copy(
	vm_map_copy_t   copy)
{
	vm_map_copy_t   new_copy;

	if (copy == VM_MAP_COPY_NULL) {
		return VM_MAP_COPY_NULL;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 * Allocate a new copy object, and copy the information
	 * from the old one into it.
	 */

	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
#if __has_feature(ptrauth_calls)
	/*
	 * NOTE(review): cpy_kdata appears to be a ptrauth-signed pointer
	 * whose signature depends on its storage address, so the raw
	 * memcpy() above would leave a stale signature in new_copy; this
	 * explicit assignment re-signs it for the new location — confirm.
	 */
	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		new_copy->cpy_kdata = copy->cpy_kdata;
	}
#endif

	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * The links in the entry chain must be
		 * changed to point to the new copy object.
		 * (The entries themselves stay in place; only the
		 * sentinel links at the head/tail are rewritten.)
		 */
		vm_map_copy_first_entry(copy)->vme_prev
		        = vm_map_copy_to_entry(new_copy);
		vm_map_copy_last_entry(copy)->vme_next
		        = vm_map_copy_to_entry(new_copy);
	}

	/*
	 * Change the old copy object into one that contains
	 * nothing to be deallocated: a zeroed KERNEL_BUFFER copy
	 * (size 0, no data) passes vm_map_copy_discard() harmlessly.
	 */
	bzero(copy, sizeof(struct vm_map_copy));
	copy->type = VM_MAP_COPY_KERNEL_BUFFER;

	/*
	 * Return the new object.
	 */
	return new_copy;
}
9152 
9153 #endif /* XNU_PLATFORM_MacOSX */
9154 
9155 static boolean_t
vm_map_entry_is_overwritable(vm_map_t dst_map __unused,vm_map_entry_t entry)9156 vm_map_entry_is_overwritable(
9157 	vm_map_t        dst_map __unused,
9158 	vm_map_entry_t  entry)
9159 {
9160 	if (!(entry->protection & VM_PROT_WRITE)) {
9161 		/* can't overwrite if not writable */
9162 		return FALSE;
9163 	}
9164 #if !__x86_64__
9165 	if (entry->used_for_jit &&
9166 	    vm_map_cs_enforcement(dst_map) &&
9167 	    !dst_map->cs_debugged) {
9168 		/*
9169 		 * Can't overwrite a JIT region while cs_enforced
9170 		 * and not cs_debugged.
9171 		 */
9172 		return FALSE;
9173 	}
9174 
9175 #if __arm64e__
9176 	/* Do not allow overwrite HW assisted TPRO entries */
9177 	if (entry->used_for_tpro) {
9178 		return FALSE;
9179 	}
9180 #endif /* __arm64e__ */
9181 
9182 	if (entry->vme_permanent) {
9183 		if (entry->is_sub_map) {
9184 			/*
9185 			 * We can't tell if the submap contains "permanent"
9186 			 * entries within the range targeted by the caller.
9187 			 * The caller will have to check for that with
9188 			 * vm_map_overwrite_submap_recurse() for example.
9189 			 */
9190 		} else {
9191 			/*
9192 			 * Do not allow overwriting of a "permanent"
9193 			 * entry.
9194 			 */
9195 			DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9196 			    vm_map_entry_t, entry,
9197 			    vm_map_offset_t, entry->vme_start,
9198 			    vm_map_offset_t, entry->vme_end,
9199 			    vm_prot_t, entry->protection,
9200 			    vm_prot_t, entry->max_protection,
9201 			    int, VME_ALIAS(entry));
9202 			return FALSE;
9203 		}
9204 	}
9205 #endif /* !__x86_64__ */
9206 	return TRUE;
9207 }
9208 
/*
 *	vm_map_overwrite_submap_recurse:
 *
 *	Verify that [dst_addr, dst_addr + dst_size) in "dst_map" (a
 *	submap reached during a copy-overwrite operation) can be
 *	overwritten: the range must be a contiguous run of entries
 *	that are writable and pass vm_map_entry_is_overwritable().
 *	Recurses into any nested submaps found inside the range.
 *
 *	Locks and unlocks "dst_map" internally; entries found
 *	in_transition are waited on and the scan restarts.
 *
 *	Returns KERN_SUCCESS when the whole range is overwritable,
 *	KERN_INVALID_ADDRESS for holes/unmapped addresses,
 *	KERN_PROTECTION_FAILURE for non-writable/protected entries,
 *	and KERN_FAILURE when a permanent-object destination is
 *	mixed with submaps in the range.
 */
static kern_return_t
vm_map_overwrite_submap_recurse(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_size_t   dst_size)
{
	vm_map_offset_t dst_end;
	vm_map_entry_t  tmp_entry;
	vm_map_entry_t  entry;
	kern_return_t   result;
	boolean_t       encountered_sub_map = FALSE;



	/*
	 *	Verify that the destination is all writeable
	 *	initially.  We have to trunc the destination
	 *	address and round the copy size or we'll end up
	 *	splitting entries in strange ways.
	 */

	dst_end = vm_map_round_page(dst_addr + dst_size,
	    VM_MAP_PAGE_MASK(dst_map));
	vm_map_lock(dst_map);

start_pass_1:
	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}

	vm_map_clip_start(dst_map,
	    tmp_entry,
	    vm_map_trunc_page(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)));
	if (tmp_entry->is_sub_map) {
		/* clipping did unnest if needed */
		assert(!tmp_entry->use_pmap);
	}

	/* walk the entries covering [dst_addr, dst_end); exits via return */
	for (entry = tmp_entry;;) {
		vm_map_entry_t  next;

		next = entry->vme_next;
		while (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;

			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				goto start_pass_1;
			}

			encountered_sub_map = TRUE;
			sub_start = VME_OFFSET(entry);

			/*
			 * Clamp the recursion to the part of the submap
			 * that this entry actually covers within dst_end,
			 * expressed in submap offsets.
			 */
			if (entry->vme_end < dst_end) {
				sub_end = entry->vme_end;
			} else {
				sub_end = dst_end;
			}
			sub_end -= entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			vm_map_unlock(dst_map);

			result = vm_map_overwrite_submap_recurse(
				VME_SUBMAP(entry),
				sub_start,
				sub_end - sub_start);

			if (result != KERN_SUCCESS) {
				return result;
			}
			/*
			 * NOTE(review): "entry" is dereferenced here after
			 * the map was unlocked above; this appears to rely
			 * on the entry staying valid across the recursion —
			 * confirm against the callers' locking protocol.
			 */
			if (dst_end <= entry->vme_end) {
				return KERN_SUCCESS;
			}
			vm_map_lock(dst_map);
			/* re-lookup past the submap entry; it may have changed */
			if (!vm_map_lookup_entry(dst_map, local_end,
			    &tmp_entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			entry = tmp_entry;
			next = entry->vme_next;
		}

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 *	If the entry is in transition, we must wait
		 *	for it to exit that state.  Anything could happen
		 *	when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto start_pass_1;
		}

/*
 *		our range is contained completely within this map entry
 */
		if (dst_end <= entry->vme_end) {
			vm_map_unlock(dst_map);
			return KERN_SUCCESS;
		}
/*
 *		check that range specified is contiguous region
 */
		if ((next == vm_map_to_entry(dst_map)) ||
		    (next->vme_start != entry->vme_end)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/*
		 *	Check for permanent objects in the destination.
		 *	(A non-internal or true_share object combined with
		 *	submaps in the range makes the overwrite fail.)
		 */
		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
		    ((!VME_OBJECT(entry)->internal) ||
		    (VME_OBJECT(entry)->true_share))) {
			if (encountered_sub_map) {
				vm_map_unlock(dst_map);
				return KERN_FAILURE;
			}
		}


		entry = next;
	}/* for */
	/* NOTE(review): unreachable — the loop above exits only via return */
	vm_map_unlock(dst_map);
	return KERN_SUCCESS;
}
9361 
9362 /*
9363  *	Routine:	vm_map_copy_overwrite
9364  *
9365  *	Description:
9366  *		Copy the memory described by the map copy
9367  *		object (copy; returned by vm_map_copyin) onto
9368  *		the specified destination region (dst_map, dst_addr).
9369  *		The destination must be writeable.
9370  *
9371  *		Unlike vm_map_copyout, this routine actually
9372  *		writes over previously-mapped memory.  If the
9373  *		previous mapping was to a permanent (user-supplied)
9374  *		memory object, it is preserved.
9375  *
9376  *		The attributes (protection and inheritance) of the
9377  *		destination region are preserved.
9378  *
9379  *		If successful, consumes the copy object.
9380  *		Otherwise, the caller is responsible for it.
9381  *
9382  *	Implementation notes:
9383  *		To overwrite aligned temporary virtual memory, it is
9384  *		sufficient to remove the previous mapping and insert
9385  *		the new copy.  This replacement is done either on
9386  *		the whole region (if no permanent virtual memory
9387  *		objects are embedded in the destination region) or
9388  *		in individual map entries.
9389  *
 *		To overwrite permanent virtual memory, it is necessary
9391  *		to copy each page, as the external memory management
9392  *		interface currently does not provide any optimizations.
9393  *
9394  *		Unaligned memory also has to be copied.  It is possible
9395  *		to use 'vm_trickery' to copy the aligned data.  This is
9396  *		not done but not hard to implement.
9397  *
9398  *		Once a page of permanent memory has been overwritten,
9399  *		it is impossible to interrupt this function; otherwise,
9400  *		the call would be neither atomic nor location-independent.
9401  *		The kernel-state portion of a user thread must be
9402  *		interruptible.
9403  *
9404  *		It may be expensive to forward all requests that might
9405  *		overwrite permanent memory (vm_write, vm_copy) to
9406  *		uninterruptible kernel threads.  This routine may be
9407  *		called by interruptible threads; however, success is
9408  *		not guaranteed -- if the request cannot be performed
9409  *		atomically and interruptibly, an error indication is
9410  *		returned.
9411  *
9412  *		Callers of this function must call vm_map_copy_require on
9413  *		previously created vm_map_copy_t or pass a newly created
9414  *		one to ensure that it hasn't been forged.
9415  */
9416 static kern_return_t
vm_map_copy_overwrite_nested(vm_map_t dst_map,vm_map_address_t dst_addr,vm_map_copy_t copy,boolean_t interruptible,pmap_t pmap,boolean_t discard_on_success)9417 vm_map_copy_overwrite_nested(
9418 	vm_map_t                dst_map,
9419 	vm_map_address_t        dst_addr,
9420 	vm_map_copy_t           copy,
9421 	boolean_t               interruptible,
9422 	pmap_t                  pmap,
9423 	boolean_t               discard_on_success)
9424 {
9425 	vm_map_offset_t         dst_end;
9426 	vm_map_entry_t          tmp_entry;
9427 	vm_map_entry_t          entry;
9428 	kern_return_t           kr;
9429 	boolean_t               aligned = TRUE;
9430 	boolean_t               contains_permanent_objects = FALSE;
9431 	boolean_t               encountered_sub_map = FALSE;
9432 	vm_map_offset_t         base_addr;
9433 	vm_map_size_t           copy_size;
9434 	vm_map_size_t           total_size;
9435 	uint16_t                copy_page_shift;
9436 
9437 	/*
9438 	 *	Check for special kernel buffer allocated
9439 	 *	by new_ipc_kmsg_copyin.
9440 	 */
9441 
9442 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9443 		kr = vm_map_copyout_kernel_buffer(
9444 			dst_map, &dst_addr,
9445 			copy, copy->size, TRUE, discard_on_success);
9446 		return kr;
9447 	}
9448 
9449 	/*
9450 	 *      Only works for entry lists at the moment.  Will
9451 	 *	support page lists later.
9452 	 */
9453 
9454 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9455 
9456 	if (copy->size == 0) {
9457 		if (discard_on_success) {
9458 			vm_map_copy_discard(copy);
9459 		}
9460 		return KERN_SUCCESS;
9461 	}
9462 
9463 	copy_page_shift = copy->cpy_hdr.page_shift;
9464 
9465 	/*
9466 	 *	Verify that the destination is all writeable
9467 	 *	initially.  We have to trunc the destination
9468 	 *	address and round the copy size or we'll end up
9469 	 *	splitting entries in strange ways.
9470 	 */
9471 
9472 	if (!VM_MAP_PAGE_ALIGNED(copy->size,
9473 	    VM_MAP_PAGE_MASK(dst_map)) ||
9474 	    !VM_MAP_PAGE_ALIGNED(copy->offset,
9475 	    VM_MAP_PAGE_MASK(dst_map)) ||
9476 	    !VM_MAP_PAGE_ALIGNED(dst_addr,
9477 	    VM_MAP_PAGE_MASK(dst_map)) ||
9478 	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9479 		aligned = FALSE;
9480 		dst_end = vm_map_round_page(dst_addr + copy->size,
9481 		    VM_MAP_PAGE_MASK(dst_map));
9482 	} else {
9483 		dst_end = dst_addr + copy->size;
9484 	}
9485 
9486 	vm_map_lock(dst_map);
9487 
9488 	/* LP64todo - remove this check when vm_map_commpage64()
9489 	 * no longer has to stuff in a map_entry for the commpage
9490 	 * above the map's max_offset.
9491 	 */
9492 	if (dst_addr >= dst_map->max_offset) {
9493 		vm_map_unlock(dst_map);
9494 		return KERN_INVALID_ADDRESS;
9495 	}
9496 
9497 start_pass_1:
9498 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9499 		vm_map_unlock(dst_map);
9500 		return KERN_INVALID_ADDRESS;
9501 	}
9502 	vm_map_clip_start(dst_map,
9503 	    tmp_entry,
9504 	    vm_map_trunc_page(dst_addr,
9505 	    VM_MAP_PAGE_MASK(dst_map)));
9506 	for (entry = tmp_entry;;) {
9507 		vm_map_entry_t  next = entry->vme_next;
9508 
9509 		while (entry->is_sub_map) {
9510 			vm_map_offset_t sub_start;
9511 			vm_map_offset_t sub_end;
9512 			vm_map_offset_t local_end;
9513 
9514 			if (entry->in_transition) {
9515 				/*
9516 				 * Say that we are waiting, and wait for entry.
9517 				 */
9518 				entry->needs_wakeup = TRUE;
9519 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9520 
9521 				goto start_pass_1;
9522 			}
9523 
9524 			local_end = entry->vme_end;
9525 			if (!(entry->needs_copy)) {
9526 				/* if needs_copy we are a COW submap */
9527 				/* in such a case we just replace so */
9528 				/* there is no need for the follow-  */
9529 				/* ing check.                        */
9530 				encountered_sub_map = TRUE;
9531 				sub_start = VME_OFFSET(entry);
9532 
9533 				if (entry->vme_end < dst_end) {
9534 					sub_end = entry->vme_end;
9535 				} else {
9536 					sub_end = dst_end;
9537 				}
9538 				sub_end -= entry->vme_start;
9539 				sub_end += VME_OFFSET(entry);
9540 				vm_map_unlock(dst_map);
9541 
9542 				kr = vm_map_overwrite_submap_recurse(
9543 					VME_SUBMAP(entry),
9544 					sub_start,
9545 					sub_end - sub_start);
9546 				if (kr != KERN_SUCCESS) {
9547 					return kr;
9548 				}
9549 				vm_map_lock(dst_map);
9550 			}
9551 
9552 			if (dst_end <= entry->vme_end) {
9553 				goto start_overwrite;
9554 			}
9555 			if (!vm_map_lookup_entry(dst_map, local_end,
9556 			    &entry)) {
9557 				vm_map_unlock(dst_map);
9558 				return KERN_INVALID_ADDRESS;
9559 			}
9560 			next = entry->vme_next;
9561 		}
9562 
9563 		if (!(entry->protection & VM_PROT_WRITE)) {
9564 			vm_map_unlock(dst_map);
9565 			return KERN_PROTECTION_FAILURE;
9566 		}
9567 
9568 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9569 			vm_map_unlock(dst_map);
9570 			return KERN_PROTECTION_FAILURE;
9571 		}
9572 
9573 		/*
9574 		 *	If the entry is in transition, we must wait
9575 		 *	for it to exit that state.  Anything could happen
9576 		 *	when we unlock the map, so start over.
9577 		 */
9578 		if (entry->in_transition) {
9579 			/*
9580 			 * Say that we are waiting, and wait for entry.
9581 			 */
9582 			entry->needs_wakeup = TRUE;
9583 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9584 
9585 			goto start_pass_1;
9586 		}
9587 
9588 /*
9589  *		our range is contained completely within this map entry
9590  */
9591 		if (dst_end <= entry->vme_end) {
9592 			break;
9593 		}
9594 /*
9595  *		check that range specified is contiguous region
9596  */
9597 		if ((next == vm_map_to_entry(dst_map)) ||
9598 		    (next->vme_start != entry->vme_end)) {
9599 			vm_map_unlock(dst_map);
9600 			return KERN_INVALID_ADDRESS;
9601 		}
9602 
9603 
9604 		/*
9605 		 *	Check for permanent objects in the destination.
9606 		 */
9607 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9608 		    ((!VME_OBJECT(entry)->internal) ||
9609 		    (VME_OBJECT(entry)->true_share))) {
9610 			contains_permanent_objects = TRUE;
9611 		}
9612 
9613 		entry = next;
9614 	}/* for */
9615 
9616 start_overwrite:
9617 	/*
9618 	 *	If there are permanent objects in the destination, then
9619 	 *	the copy cannot be interrupted.
9620 	 */
9621 
9622 	if (interruptible && contains_permanent_objects) {
9623 		vm_map_unlock(dst_map);
9624 		return KERN_FAILURE;   /* XXX */
9625 	}
9626 
9627 	/*
9628 	 *
9629 	 *	Make a second pass, overwriting the data
9630 	 *	At the beginning of each loop iteration,
9631 	 *	the next entry to be overwritten is "tmp_entry"
9632 	 *	(initially, the value returned from the lookup above),
9633 	 *	and the starting address expected in that entry
9634 	 *	is "start".
9635 	 */
9636 
9637 	total_size = copy->size;
9638 	if (encountered_sub_map) {
9639 		copy_size = 0;
9640 		/* re-calculate tmp_entry since we've had the map */
9641 		/* unlocked */
9642 		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9643 			vm_map_unlock(dst_map);
9644 			return KERN_INVALID_ADDRESS;
9645 		}
9646 	} else {
9647 		copy_size = copy->size;
9648 	}
9649 
9650 	base_addr = dst_addr;
9651 	while (TRUE) {
9652 		/* deconstruct the copy object and do in parts */
9653 		/* only in sub_map, interruptable case */
9654 		vm_map_entry_t  copy_entry;
9655 		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
9656 		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
9657 		int             nentries;
9658 		int             remaining_entries = 0;
9659 		vm_map_offset_t new_offset = 0;
9660 
9661 		for (entry = tmp_entry; copy_size == 0;) {
9662 			vm_map_entry_t  next;
9663 
9664 			next = entry->vme_next;
9665 
9666 			/* tmp_entry and base address are moved along */
9667 			/* each time we encounter a sub-map.  Otherwise */
9668 			/* entry can outpase tmp_entry, and the copy_size */
9669 			/* may reflect the distance between them */
9670 			/* if the current entry is found to be in transition */
9671 			/* we will start over at the beginning or the last */
9672 			/* encounter of a submap as dictated by base_addr */
9673 			/* we will zero copy_size accordingly. */
9674 			if (entry->in_transition) {
9675 				/*
9676 				 * Say that we are waiting, and wait for entry.
9677 				 */
9678 				entry->needs_wakeup = TRUE;
9679 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9680 
9681 				if (!vm_map_lookup_entry(dst_map, base_addr,
9682 				    &tmp_entry)) {
9683 					vm_map_unlock(dst_map);
9684 					return KERN_INVALID_ADDRESS;
9685 				}
9686 				copy_size = 0;
9687 				entry = tmp_entry;
9688 				continue;
9689 			}
9690 			if (entry->is_sub_map) {
9691 				vm_map_offset_t sub_start;
9692 				vm_map_offset_t sub_end;
9693 				vm_map_offset_t local_end;
9694 
9695 				if (entry->needs_copy) {
9696 					/* if this is a COW submap */
9697 					/* just back the range with a */
9698 					/* anonymous entry */
9699 					assert(!entry->vme_permanent);
9700 					if (entry->vme_end < dst_end) {
9701 						sub_end = entry->vme_end;
9702 					} else {
9703 						sub_end = dst_end;
9704 					}
9705 					if (entry->vme_start < base_addr) {
9706 						sub_start = base_addr;
9707 					} else {
9708 						sub_start = entry->vme_start;
9709 					}
9710 					vm_map_clip_end(
9711 						dst_map, entry, sub_end);
9712 					vm_map_clip_start(
9713 						dst_map, entry, sub_start);
9714 					assert(!entry->use_pmap);
9715 					assert(!entry->iokit_acct);
9716 					entry->use_pmap = TRUE;
9717 					vm_map_deallocate(VME_SUBMAP(entry));
9718 					assert(!entry->vme_permanent);
9719 					VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9720 					VME_OFFSET_SET(entry, 0);
9721 					entry->is_shared = FALSE;
9722 					entry->needs_copy = FALSE;
9723 					entry->protection = VM_PROT_DEFAULT;
9724 					entry->max_protection = VM_PROT_ALL;
9725 					entry->wired_count = 0;
9726 					entry->user_wired_count = 0;
9727 					if (entry->inheritance
9728 					    == VM_INHERIT_SHARE) {
9729 						entry->inheritance = VM_INHERIT_COPY;
9730 					}
9731 					continue;
9732 				}
9733 				/* first take care of any non-sub_map */
9734 				/* entries to send */
9735 				if (base_addr < entry->vme_start) {
9736 					/* stuff to send */
9737 					copy_size =
9738 					    entry->vme_start - base_addr;
9739 					break;
9740 				}
9741 				sub_start = VME_OFFSET(entry);
9742 
9743 				if (entry->vme_end < dst_end) {
9744 					sub_end = entry->vme_end;
9745 				} else {
9746 					sub_end = dst_end;
9747 				}
9748 				sub_end -= entry->vme_start;
9749 				sub_end += VME_OFFSET(entry);
9750 				local_end = entry->vme_end;
9751 				vm_map_unlock(dst_map);
9752 				copy_size = sub_end - sub_start;
9753 
9754 				/* adjust the copy object */
9755 				if (total_size > copy_size) {
9756 					vm_map_size_t   local_size = 0;
9757 					vm_map_size_t   entry_size;
9758 
9759 					nentries = 1;
9760 					new_offset = copy->offset;
9761 					copy_entry = vm_map_copy_first_entry(copy);
9762 					while (copy_entry !=
9763 					    vm_map_copy_to_entry(copy)) {
9764 						entry_size = copy_entry->vme_end -
9765 						    copy_entry->vme_start;
9766 						if ((local_size < copy_size) &&
9767 						    ((local_size + entry_size)
9768 						    >= copy_size)) {
9769 							vm_map_copy_clip_end(copy,
9770 							    copy_entry,
9771 							    copy_entry->vme_start +
9772 							    (copy_size - local_size));
9773 							entry_size = copy_entry->vme_end -
9774 							    copy_entry->vme_start;
9775 							local_size += entry_size;
9776 							new_offset += entry_size;
9777 						}
9778 						if (local_size >= copy_size) {
9779 							next_copy = copy_entry->vme_next;
9780 							copy_entry->vme_next =
9781 							    vm_map_copy_to_entry(copy);
9782 							previous_prev =
9783 							    copy->cpy_hdr.links.prev;
9784 							copy->cpy_hdr.links.prev = copy_entry;
9785 							copy->size = copy_size;
9786 							remaining_entries =
9787 							    copy->cpy_hdr.nentries;
9788 							remaining_entries -= nentries;
9789 							copy->cpy_hdr.nentries = nentries;
9790 							break;
9791 						} else {
9792 							local_size += entry_size;
9793 							new_offset += entry_size;
9794 							nentries++;
9795 						}
9796 						copy_entry = copy_entry->vme_next;
9797 					}
9798 				}
9799 
9800 				if ((entry->use_pmap) && (pmap == NULL)) {
9801 					kr = vm_map_copy_overwrite_nested(
9802 						VME_SUBMAP(entry),
9803 						sub_start,
9804 						copy,
9805 						interruptible,
9806 						VME_SUBMAP(entry)->pmap,
9807 						TRUE);
9808 				} else if (pmap != NULL) {
9809 					kr = vm_map_copy_overwrite_nested(
9810 						VME_SUBMAP(entry),
9811 						sub_start,
9812 						copy,
9813 						interruptible, pmap,
9814 						TRUE);
9815 				} else {
9816 					kr = vm_map_copy_overwrite_nested(
9817 						VME_SUBMAP(entry),
9818 						sub_start,
9819 						copy,
9820 						interruptible,
9821 						dst_map->pmap,
9822 						TRUE);
9823 				}
9824 				if (kr != KERN_SUCCESS) {
9825 					if (next_copy != NULL) {
9826 						copy->cpy_hdr.nentries +=
9827 						    remaining_entries;
9828 						copy->cpy_hdr.links.prev->vme_next =
9829 						    next_copy;
9830 						copy->cpy_hdr.links.prev
9831 						        = previous_prev;
9832 						copy->size = total_size;
9833 					}
9834 					return kr;
9835 				}
9836 				if (dst_end <= local_end) {
9837 					return KERN_SUCCESS;
9838 				}
9839 				/* otherwise copy no longer exists, it was */
9840 				/* destroyed after successful copy_overwrite */
9841 				copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9842 				copy->offset = new_offset;
9843 				copy->cpy_hdr.page_shift = copy_page_shift;
9844 
9845 				total_size -= copy_size;
9846 				copy_size = 0;
9847 				/* put back remainder of copy in container */
9848 				if (next_copy != NULL) {
9849 					copy->cpy_hdr.nentries = remaining_entries;
9850 					copy->cpy_hdr.links.next = next_copy;
9851 					copy->cpy_hdr.links.prev = previous_prev;
9852 					copy->size = total_size;
9853 					next_copy->vme_prev =
9854 					    vm_map_copy_to_entry(copy);
9855 					next_copy = NULL;
9856 				}
9857 				base_addr = local_end;
9858 				vm_map_lock(dst_map);
9859 				if (!vm_map_lookup_entry(dst_map,
9860 				    local_end, &tmp_entry)) {
9861 					vm_map_unlock(dst_map);
9862 					return KERN_INVALID_ADDRESS;
9863 				}
9864 				entry = tmp_entry;
9865 				continue;
9866 			}
9867 			if (dst_end <= entry->vme_end) {
9868 				copy_size = dst_end - base_addr;
9869 				break;
9870 			}
9871 
9872 			if ((next == vm_map_to_entry(dst_map)) ||
9873 			    (next->vme_start != entry->vme_end)) {
9874 				vm_map_unlock(dst_map);
9875 				return KERN_INVALID_ADDRESS;
9876 			}
9877 
9878 			entry = next;
9879 		}/* for */
9880 
9881 		next_copy = NULL;
9882 		nentries = 1;
9883 
9884 		/* adjust the copy object */
9885 		if (total_size > copy_size) {
9886 			vm_map_size_t   local_size = 0;
9887 			vm_map_size_t   entry_size;
9888 
9889 			new_offset = copy->offset;
9890 			copy_entry = vm_map_copy_first_entry(copy);
9891 			while (copy_entry != vm_map_copy_to_entry(copy)) {
9892 				entry_size = copy_entry->vme_end -
9893 				    copy_entry->vme_start;
9894 				if ((local_size < copy_size) &&
9895 				    ((local_size + entry_size)
9896 				    >= copy_size)) {
9897 					vm_map_copy_clip_end(copy, copy_entry,
9898 					    copy_entry->vme_start +
9899 					    (copy_size - local_size));
9900 					entry_size = copy_entry->vme_end -
9901 					    copy_entry->vme_start;
9902 					local_size += entry_size;
9903 					new_offset += entry_size;
9904 				}
9905 				if (local_size >= copy_size) {
9906 					next_copy = copy_entry->vme_next;
9907 					copy_entry->vme_next =
9908 					    vm_map_copy_to_entry(copy);
9909 					previous_prev =
9910 					    copy->cpy_hdr.links.prev;
9911 					copy->cpy_hdr.links.prev = copy_entry;
9912 					copy->size = copy_size;
9913 					remaining_entries =
9914 					    copy->cpy_hdr.nentries;
9915 					remaining_entries -= nentries;
9916 					copy->cpy_hdr.nentries = nentries;
9917 					break;
9918 				} else {
9919 					local_size += entry_size;
9920 					new_offset += entry_size;
9921 					nentries++;
9922 				}
9923 				copy_entry = copy_entry->vme_next;
9924 			}
9925 		}
9926 
9927 		if (aligned) {
9928 			pmap_t  local_pmap;
9929 
9930 			if (pmap) {
9931 				local_pmap = pmap;
9932 			} else {
9933 				local_pmap = dst_map->pmap;
9934 			}
9935 
9936 			if ((kr =  vm_map_copy_overwrite_aligned(
9937 				    dst_map, tmp_entry, copy,
9938 				    base_addr, local_pmap)) != KERN_SUCCESS) {
9939 				if (next_copy != NULL) {
9940 					copy->cpy_hdr.nentries +=
9941 					    remaining_entries;
9942 					copy->cpy_hdr.links.prev->vme_next =
9943 					    next_copy;
9944 					copy->cpy_hdr.links.prev =
9945 					    previous_prev;
9946 					copy->size += copy_size;
9947 				}
9948 				return kr;
9949 			}
9950 			vm_map_unlock(dst_map);
9951 		} else {
9952 			/*
9953 			 * Performance gain:
9954 			 *
9955 			 * if the copy and dst address are misaligned but the same
9956 			 * offset within the page we can copy_not_aligned the
9957 			 * misaligned parts and copy aligned the rest.  If they are
9958 			 * aligned but len is unaligned we simply need to copy
9959 			 * the end bit unaligned.  We'll need to split the misaligned
9960 			 * bits of the region in this case !
9961 			 */
9962 			/* ALWAYS UNLOCKS THE dst_map MAP */
9963 			kr = vm_map_copy_overwrite_unaligned(
9964 				dst_map,
9965 				tmp_entry,
9966 				copy,
9967 				base_addr,
9968 				discard_on_success);
9969 			if (kr != KERN_SUCCESS) {
9970 				if (next_copy != NULL) {
9971 					copy->cpy_hdr.nentries +=
9972 					    remaining_entries;
9973 					copy->cpy_hdr.links.prev->vme_next =
9974 					    next_copy;
9975 					copy->cpy_hdr.links.prev =
9976 					    previous_prev;
9977 					copy->size += copy_size;
9978 				}
9979 				return kr;
9980 			}
9981 		}
9982 		total_size -= copy_size;
9983 		if (total_size == 0) {
9984 			break;
9985 		}
9986 		base_addr += copy_size;
9987 		copy_size = 0;
9988 		copy->offset = new_offset;
9989 		if (next_copy != NULL) {
9990 			copy->cpy_hdr.nentries = remaining_entries;
9991 			copy->cpy_hdr.links.next = next_copy;
9992 			copy->cpy_hdr.links.prev = previous_prev;
9993 			next_copy->vme_prev = vm_map_copy_to_entry(copy);
9994 			copy->size = total_size;
9995 		}
9996 		vm_map_lock(dst_map);
9997 		while (TRUE) {
9998 			if (!vm_map_lookup_entry(dst_map,
9999 			    base_addr, &tmp_entry)) {
10000 				vm_map_unlock(dst_map);
10001 				return KERN_INVALID_ADDRESS;
10002 			}
10003 			if (tmp_entry->in_transition) {
10004 				entry->needs_wakeup = TRUE;
10005 				vm_map_entry_wait(dst_map, THREAD_UNINT);
10006 			} else {
10007 				break;
10008 			}
10009 		}
10010 		vm_map_clip_start(dst_map,
10011 		    tmp_entry,
10012 		    vm_map_trunc_page(base_addr,
10013 		    VM_MAP_PAGE_MASK(dst_map)));
10014 
10015 		entry = tmp_entry;
10016 	} /* while */
10017 
10018 	/*
10019 	 *	Throw away the vm_map_copy object
10020 	 */
10021 	if (discard_on_success) {
10022 		vm_map_copy_discard(copy);
10023 	}
10024 
10025 	return KERN_SUCCESS;
10026 }/* vm_map_copy_overwrite */
10027 
/*
 *	Routine:	vm_map_copy_overwrite
 *
 *	Description:
 *		Overwrite the memory at "dst_addr" in "dst_map" with the
 *		contents of "copy", without unmapping the destination.
 *
 *		When the copy is a non-interruptible entry-list copy, is
 *		large enough, and source and destination share the same
 *		page (mis)alignment, the copy is split into up to three
 *		pieces — an unaligned "head", a page-aligned middle, and
 *		an unaligned "tail" — so the bulk of the data can take the
 *		aligned path inside vm_map_copy_overwrite_nested().
 *		Otherwise the whole copy is handed to
 *		vm_map_copy_overwrite_nested() in one "blunt" call.
 *
 *		On success, "copy" (and any head/tail copies split off of
 *		it) is discarded.  On failure, the head/tail entries are
 *		spliced back so the caller gets the original "copy" map
 *		back to dispose of.
 */
kern_return_t
vm_map_copy_overwrite(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_copy_t   copy,
	vm_map_size_t   copy_size,
	boolean_t       interruptible)
{
	vm_map_size_t   head_size, tail_size;
	vm_map_copy_t   head_copy, tail_copy;
	vm_map_offset_t head_addr, tail_addr;
	vm_map_entry_t  entry;
	kern_return_t   kr;
	vm_map_offset_t effective_page_mask, effective_page_size;
	uint16_t        copy_page_shift;

	head_size = 0;
	tail_size = 0;
	head_copy = NULL;
	tail_copy = NULL;
	head_addr = 0;
	tail_addr = 0;

	/*
	 *	Check for null copy object.
	 */
	if (copy == VM_MAP_COPY_NULL) {
		return KERN_SUCCESS;
	}

	/* reject destination ranges that wrap around the address space */
	if (__improbable(vm_map_range_overflows(dst_map, dst_addr, copy_size))) {
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	if (interruptible ||
	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * We can't split the "copy" map if we're interruptible
		 * or if we don't have a "copy" map...
		 */
blunt_copy:
		kr = vm_map_copy_overwrite_nested(dst_map,
		    dst_addr,
		    copy,
		    interruptible,
		    (pmap_t) NULL,
		    TRUE);
		if (kr) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
		}
		return kr;
	}

	/* no head/tail optimization for sub-native page sizes */
	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
	if (copy_page_shift < PAGE_SHIFT ||
	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		goto blunt_copy;
	}

	/*
	 * NOTE(review): the "true" branch below is unreachable — the
	 * check just above already jumped to blunt_copy whenever
	 * VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT.
	 */
	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
	} else {
		/* use the coarsest page geometry of dst map, copy and kernel */
		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
		    effective_page_mask);
	}
	effective_page_size = effective_page_mask + 1;

	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
		/*
		 * Too small to bother with optimizing...
		 */
		goto blunt_copy;
	}

	if ((dst_addr & effective_page_mask) !=
	    (copy->offset & effective_page_mask)) {
		/*
		 * Incompatible mis-alignment of source and destination...
		 */
		goto blunt_copy;
	}

	/*
	 * Proper alignment or identical mis-alignment at the beginning.
	 * Let's try and do a small unaligned copy first (if needed)
	 * and then an aligned copy for the rest.
	 */
	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
		head_addr = dst_addr;
		head_size = (effective_page_size -
		    (copy->offset & effective_page_mask));
		head_size = MIN(head_size, copy_size);
	}
	if (!vm_map_page_aligned(copy->offset + copy_size,
	    effective_page_mask)) {
		/*
		 * Mis-alignment at the end.
		 * Do an aligned copy up to the last page and
		 * then an unaligned copy for the remaining bytes.
		 */
		tail_size = ((copy->offset + copy_size) &
		    effective_page_mask);
		tail_size = MIN(tail_size, copy_size);
		tail_addr = dst_addr + copy_size - tail_size;
		assert(tail_addr >= head_addr + head_size);
	}
	assert(head_size + tail_size <= copy_size);

	if (head_size + tail_size == copy_size) {
		/*
		 * It's all unaligned, no optimization possible...
		 */
		goto blunt_copy;
	}

	/*
	 * Can't optimize if there are any submaps in the
	 * destination due to the way we free the "copy" map
	 * progressively in vm_map_copy_overwrite_nested()
	 * in that case.
	 */
	vm_map_lock_read(dst_map);
	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
		vm_map_unlock_read(dst_map);
		goto blunt_copy;
	}
	for (;
	    (entry != vm_map_to_entry(dst_map) &&
	    entry->vme_start < dst_addr + copy_size);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			vm_map_unlock_read(dst_map);
			goto blunt_copy;
		}
	}
	vm_map_unlock_read(dst_map);

	if (head_size) {
		/*
		 * Unaligned copy of the first "head_size" bytes, to reach
		 * a page boundary.
		 */

		/*
		 * Extract "head_copy" out of "copy".
		 */
		head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
		head_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		head_copy->cpy_hdr.page_shift = copy_page_shift;

		/* the head can't extend past the copy's first source entry */
		entry = vm_map_copy_first_entry(copy);
		if (entry->vme_end < copy->offset + head_size) {
			head_size = entry->vme_end - copy->offset;
		}

		head_copy->offset = copy->offset;
		head_copy->size = head_size;
		copy->offset += head_size;
		copy->size -= head_size;
		copy_size -= head_size;
		assert(copy_size > 0);

		/* move the head portion of the first entry into head_copy */
		vm_map_copy_clip_end(copy, entry, copy->offset);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(head_copy,
		    vm_map_copy_to_entry(head_copy),
		    entry);

		/*
		 * Do the unaligned copy.
		 */
		kr = vm_map_copy_overwrite_nested(dst_map,
		    head_addr,
		    head_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr != KERN_SUCCESS) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
			goto done;
		}
	}

	if (tail_size) {
		/*
		 * Extract "tail_copy" out of "copy".
		 */
		tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
		tail_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		tail_copy->cpy_hdr.page_shift = copy_page_shift;

		tail_copy->offset = copy->offset + copy_size - tail_size;
		tail_copy->size = tail_size;

		copy->size -= tail_size;
		copy_size -= tail_size;
		assert(copy_size > 0);

		/* move the tail portion of the last entry into tail_copy */
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(tail_copy,
		    vm_map_copy_last_entry(tail_copy),
		    entry);
	}

	/*
	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
	 * we want to avoid TOCTOU issues w.r.t copy->size but
	 * we don't need to change vm_map_copy_overwrite_nested()
	 * and all other vm_map_copy_overwrite variants.
	 *
	 * So we assign the original copy_size that was passed into
	 * this routine back to copy.
	 *
	 * This use of local 'copy_size' passed into this routine is
	 * to try and protect against TOCTOU attacks where the kernel
	 * has been exploited. We don't expect this to be an issue
	 * during normal system operation.
	 */
	assertf(copy->size == copy_size,
	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
	copy->size = copy_size;

	/*
	 * Copy most (or possibly all) of the data.
	 */
	kr = vm_map_copy_overwrite_nested(dst_map,
	    dst_addr + head_size,
	    copy,
	    interruptible,
	    (pmap_t) NULL,
	    FALSE);
	if (kr != KERN_SUCCESS) {
		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
		goto done;
	}

	if (tail_size) {
		kr = vm_map_copy_overwrite_nested(dst_map,
		    tail_addr,
		    tail_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
		}
	}

done:
	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
	if (kr == KERN_SUCCESS) {
		/*
		 * Discard all the copy maps.
		 */
		if (head_copy) {
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		vm_map_copy_discard(copy);
		if (tail_copy) {
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	} else {
		/*
		 * Re-assemble the original copy map: splice the head and
		 * tail entries back in and restore "copy"'s offset/size
		 * before discarding the now-empty head/tail containers.
		 */
		if (head_copy) {
			entry = vm_map_copy_first_entry(head_copy);
			vm_map_copy_entry_unlink(head_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_to_entry(copy),
			    entry);
			copy->offset -= head_size;
			copy->size += head_size;
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		if (tail_copy) {
			entry = vm_map_copy_last_entry(tail_copy);
			vm_map_copy_entry_unlink(tail_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    entry);
			copy->size += tail_size;
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	}
	return kr;
}
10331 
10332 
10333 /*
10334  *	Routine: vm_map_copy_overwrite_unaligned	[internal use only]
10335  *
 *	Description:
10337  *	Physically copy unaligned data
10338  *
10339  *	Implementation:
10340  *	Unaligned parts of pages have to be physically copied.  We use
 *	a modified form of vm_fault_copy (which understands non-aligned
 *	page offsets and sizes) to do the copy.  We attempt to copy as
 *	much memory in one go as possible, however vm_fault_copy copies
10344  *	within 1 memory object so we have to find the smaller of "amount left"
10345  *	"source object data size" and "target object data size".  With
10346  *	unaligned data we don't need to split regions, therefore the source
10347  *	(copy) object should be one map entry, the target range may be split
10348  *	over multiple map entries however.  In any event we are pessimistic
10349  *	about these assumptions.
10350  *
10351  *	Callers of this function must call vm_map_copy_require on
10352  *	previously created vm_map_copy_t or pass a newly created
10353  *	one to ensure that it hasn't been forged.
10354  *
10355  *	Assumptions:
 *	dst_map is locked on entry and is returned locked on success,
10357  *	unlocked on error.
10358  */
10359 
static kern_return_t
vm_map_copy_overwrite_unaligned(
	vm_map_t        dst_map,
	vm_map_entry_t  entry,
	vm_map_copy_t   copy,
	vm_map_offset_t start,
	boolean_t       discard_on_success)
{
	vm_map_entry_t          copy_entry;
	vm_map_entry_t          copy_entry_next;
	vm_map_version_t        version;
	vm_object_t             dst_object;
	vm_object_offset_t      dst_offset;
	vm_object_offset_t      src_offset;
	vm_object_offset_t      entry_offset;
	vm_map_offset_t         entry_end;
	vm_map_size_t           src_size,
	    dst_size,
	    copy_size,
	    amount_left;
	kern_return_t           kr = KERN_SUCCESS;


	copy_entry = vm_map_copy_first_entry(copy);

	/* a read lock is enough while we physically fault the data over */
	vm_map_lock_write_to_read(dst_map);

	/* start at the copy's mis-alignment within its first page */
	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
	amount_left = copy->size;
/*
 *	unaligned so we never clipped this entry, we need the offset into
 *	the vm_object not just the data.
 */
	while (amount_left > 0) {
		/* ran past the last destination entry: range is invalid */
		if (entry == vm_map_to_entry(dst_map)) {
			vm_map_unlock_read(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/* "start" must be within the current map entry */
		assert((start >= entry->vme_start) && (start < entry->vme_end));

		/*
		 *	Check protection again
		 */
		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock_read(dst_map);
			return KERN_PROTECTION_FAILURE;
		}
		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock_read(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 *	If the entry is in transition, we must wait
		 *	for it to exit that state.  Anything could happen
		 *	when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto RetryLookup;
		}

		dst_offset = start - entry->vme_start;

		dst_size = entry->vme_end - start;

		src_size = copy_entry->vme_end -
		    (copy_entry->vme_start + src_offset);

		if (dst_size < src_size) {
/*
 *			we can only copy dst_size bytes before
 *			we have to get the next destination entry
 */
			copy_size = dst_size;
		} else {
/*
 *			we can only copy src_size bytes before
 *			we have to get the next source copy entry
 */
			copy_size = src_size;
		}

		/* never copy more than what's left overall */
		if (copy_size > amount_left) {
			copy_size = amount_left;
		}
/*
 *		Entry needs copy: create a shadow object for the
 *		copy-on-write region.
 */
		if (entry->needs_copy) {
			if (vm_map_lock_read_to_write(dst_map)) {
				/* upgrade failed, lock was dropped: re-lookup */
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			VME_OBJECT_SHADOW(entry,
			    (vm_map_size_t)(entry->vme_end
			    - entry->vme_start),
			    vm_map_always_shadow(dst_map));
			entry->needs_copy = FALSE;
			vm_map_lock_write_to_read(dst_map);
		}
		dst_object = VME_OBJECT(entry);
/*
 *		unlike with the virtual (aligned) copy we're going
 *		to fault on it therefore we need a target object.
 */
		if (dst_object == VM_OBJECT_NULL) {
			if (vm_map_lock_read_to_write(dst_map)) {
				/* upgrade failed, lock was dropped: re-lookup */
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			dst_object = vm_object_allocate((vm_map_size_t)
			    entry->vme_end - entry->vme_start);
			VME_OBJECT_SET(entry, dst_object, false, 0);
			VME_OFFSET_SET(entry, 0);
			assert(entry->use_pmap);
			vm_map_lock_write_to_read(dst_map);
		}
/*
 *		Take an object reference and unlock map. The "entry" may
 *		disappear or change when the map is unlocked.
 */
		vm_object_reference(dst_object);
		/* snapshot the map version to detect concurrent changes below */
		version.main_timestamp = dst_map->timestamp;
		entry_offset = VME_OFFSET(entry);
		entry_end = entry->vme_end;
		vm_map_unlock_read(dst_map);
/*
 *		Copy as much as possible in one pass
 */
		kr = vm_fault_copy(
			VME_OBJECT(copy_entry),
			VME_OFFSET(copy_entry) + src_offset,
			&copy_size,
			dst_object,
			entry_offset + dst_offset,
			dst_map,
			&version,
			THREAD_UNINT );

		/* vm_fault_copy() updated copy_size to what was actually copied */
		start += copy_size;
		src_offset += copy_size;
		amount_left -= copy_size;
/*
 *		Release the object reference
 */
		vm_object_deallocate(dst_object);
/*
 *		If a hard error occurred, return it now
 */
		if (kr != KERN_SUCCESS) {
			return kr;
		}

		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
		    || amount_left == 0) {
/*
 *			all done with this copy entry, dispose.
 */
			copy_entry_next = copy_entry->vme_next;

			if (discard_on_success) {
				/* the "copy" map is consumed progressively */
				vm_map_copy_entry_unlink(copy, copy_entry);
				assert(!copy_entry->is_sub_map);
				vm_object_deallocate(VME_OBJECT(copy_entry));
				vm_map_copy_entry_dispose(copy_entry);
			}

			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
			    amount_left) {
/*
 *				not finished copying but ran out of source
 */
				return KERN_INVALID_ADDRESS;
			}

			copy_entry = copy_entry_next;

			src_offset = 0;
		}

		if (amount_left == 0) {
			return KERN_SUCCESS;
		}

		vm_map_lock_read(dst_map);
		if (version.main_timestamp == dst_map->timestamp) {
			/* map unchanged while unlocked: "entry" is still valid */
			if (start == entry_end) {
/*
 *				destination region is split.  Use the version
 *				information to avoid a lookup in the normal
 *				case.
 */
				entry = entry->vme_next;
/*
 *				should be contiguous. Fail if we encounter
 *				a hole in the destination.
 */
				if (start != entry->vme_start) {
					vm_map_unlock_read(dst_map);
					return KERN_INVALID_ADDRESS;
				}
			}
		} else {
/*
 *			Map version check failed.
 *			we must lookup the entry because somebody
 *			might have changed the map behind our backs.
 */
RetryLookup:
			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
				vm_map_unlock_read(dst_map);
				return KERN_INVALID_ADDRESS;
			}
		}
	}/* while */

	return KERN_SUCCESS;
}/* vm_map_copy_overwrite_unaligned */
10587 
10588 /*
10589  *	Routine: vm_map_copy_overwrite_aligned	[internal use only]
10590  *
10591  *	Description:
10592  *	Does all the vm_trickery possible for whole pages.
10593  *
10594  *	Implementation:
10595  *
10596  *	If there are no permanent objects in the destination,
10597  *	and the source and destination map entry zones match,
10598  *	and the destination map entry is not shared,
10599  *	then the map entries can be deleted and replaced
10600  *	with those from the copy.  The following code is the
10601  *	basic idea of what to do, but there are lots of annoying
10602  *	little details about getting protection and inheritance
10603  *	right.  Should add protection, inheritance, and sharing checks
10604  *	to the above pass and make sure that no wiring is involved.
10605  *
10606  *	Callers of this function must call vm_map_copy_require on
10607  *	previously created vm_map_copy_t or pass a newly created
10608  *	one to ensure that it hasn't been forged.
10609  */
10610 
/*
 * Telemetry counters for vm_map_copy_overwrite_aligned(): each one counts
 * how often the optimized entry-substitution path is abandoned in favor of
 * the slow physical copy ("goto slow_copy"), and for which reason.  They
 * are debug/inspection aids only; no logic reads them.
 */
int vm_map_copy_overwrite_aligned_src_not_internal = 0;
int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
int vm_map_copy_overwrite_aligned_src_large = 0;
10614 
static kern_return_t
vm_map_copy_overwrite_aligned(
	vm_map_t        dst_map,
	vm_map_entry_t  tmp_entry,
	vm_map_copy_t   copy,
	vm_map_offset_t start,
	__unused pmap_t pmap)
{
	vm_object_t     object;
	vm_map_entry_t  copy_entry;
	vm_map_size_t   copy_size;
	vm_map_size_t   size;
	vm_map_entry_t  entry;

	/*
	 * Called with dst_map locked and "tmp_entry"/"start" describing the
	 * destination position.  Consumes entries off the front of "copy"
	 * one at a time until the chain is drained.
	 *
	 * NOTE(review): the error paths below unlock dst_map before
	 * returning, while the KERN_SUCCESS path at the bottom returns with
	 * the map still locked; the vm_fault_copy error path also returns
	 * unlocked — callers must be aware of this asymmetry (confirm
	 * against the call sites).
	 */
	while ((copy_entry = vm_map_copy_first_entry(copy))
	    != vm_map_copy_to_entry(copy)) {
		copy_size = (copy_entry->vme_end - copy_entry->vme_start);

		entry = tmp_entry;
		if (entry->is_sub_map) {
			/* unnested when clipped earlier */
			assert(!entry->use_pmap);
		}
		if (entry == vm_map_to_entry(dst_map)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}
		size = (entry->vme_end - entry->vme_start);
		/*
		 *	Make sure that no holes popped up in the
		 *	address map, and that the protection is
		 *	still valid, in case the map was unlocked
		 *	earlier.
		 */

		if ((entry->vme_start != start) || ((entry->is_sub_map)
		    && !entry->needs_copy)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}
		assert(entry != vm_map_to_entry(dst_map));

		/*
		 *	Check protection again
		 */

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 *	If the entry is in transition, we must wait
		 *	for it to exit that state.  Anything could happen
		 *	when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto RetryLookup;
		}

		/*
		 *	Adjust to source size first
		 */

		if (copy_size < size) {
			if (entry->map_aligned &&
			    !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
			    VM_MAP_PAGE_MASK(dst_map))) {
				/* no longer map-aligned */
				entry->map_aligned = FALSE;
			}
			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
			size = copy_size;
		}

		/*
		 *	Adjust to destination size
		 */

		if (size < copy_size) {
			vm_map_copy_clip_end(copy, copy_entry,
			    copy_entry->vme_start + size);
			copy_size = size;
		}

		/* After clipping, both entries cover exactly "size" bytes. */
		assert((entry->vme_end - entry->vme_start) == size);
		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
		assert((copy_entry->vme_end - copy_entry->vme_start) == size);

		/*
		 *	If the destination contains temporary unshared memory,
		 *	we can perform the copy by throwing it away and
		 *	installing the source data.
		 *
		 *	Exceptions for mappings with special semantics:
		 *	+ "permanent" entries,
		 *	+ JIT regions,
		 *	+ TPRO regions,
		 *      + pmap-specific protection policies,
		 *	+ VM objects with COPY_NONE copy strategy.
		 */

		object = VME_OBJECT(entry);
		if ((!entry->is_shared &&
		    !entry->vme_permanent &&
		    !entry->used_for_jit &&
#if __arm64e__
		    !entry->used_for_tpro &&
#endif /* __arm64e__ */
		    !(entry->protection & VM_PROT_EXECUTE) &&
		    !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
		    ((object == VM_OBJECT_NULL) ||
		    (object->internal &&
		    !object->true_share &&
		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
		    entry->needs_copy) {
			/* Optimized path: substitute the source object. */
			vm_object_t     old_object = VME_OBJECT(entry);
			vm_object_offset_t      old_offset = VME_OFFSET(entry);
			vm_object_offset_t      offset;

			/*
			 * Ensure that the source and destination aren't
			 * identical
			 */
			if (old_object == VME_OBJECT(copy_entry) &&
			    old_offset == VME_OFFSET(copy_entry)) {
				/* Copying onto itself: just drop the copy entry. */
				vm_map_copy_entry_unlink(copy, copy_entry);
				vm_map_copy_entry_dispose(copy_entry);

				if (old_object != VM_OBJECT_NULL) {
					vm_object_deallocate(old_object);
				}

				start = tmp_entry->vme_end;
				tmp_entry = tmp_entry->vme_next;
				continue;
			}

#if XNU_TARGET_OS_OSX
#define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
#define __TRADEOFF1_COPY_SIZE (128 * 1024)      /* 128 KB */
			if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
			    VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
			    copy_size <= __TRADEOFF1_COPY_SIZE) {
				/*
				 * Virtual vs. Physical copy tradeoff #1.
				 *
				 * Copying only a few pages out of a large
				 * object:  do a physical copy instead of
				 * a virtual copy, to avoid possibly keeping
				 * the entire large object alive because of
				 * those few copy-on-write pages.
				 */
				vm_map_copy_overwrite_aligned_src_large++;
				goto slow_copy;
			}
#endif /* XNU_TARGET_OS_OSX */

			if ((dst_map->pmap != kernel_pmap) &&
			    (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
			    (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
				vm_object_t new_object, new_shadow;

				/*
				 * We're about to map something over a mapping
				 * established by malloc()...
				 */
				new_object = VME_OBJECT(copy_entry);
				if (new_object != VM_OBJECT_NULL) {
					vm_object_lock_shared(new_object);
				}
				/*
				 * Walk down the shadow chain (hand-over-hand
				 * shared locking) to find the backing object.
				 */
				while (new_object != VM_OBJECT_NULL &&
#if XNU_TARGET_OS_OSX
				    !new_object->true_share &&
				    new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
#endif /* XNU_TARGET_OS_OSX */
				    new_object->internal) {
					new_shadow = new_object->shadow;
					if (new_shadow == VM_OBJECT_NULL) {
						break;
					}
					vm_object_lock_shared(new_shadow);
					vm_object_unlock(new_object);
					new_object = new_shadow;
				}
				if (new_object != VM_OBJECT_NULL) {
					if (!new_object->internal) {
						/*
						 * The new mapping is backed
						 * by an external object.  We
						 * don't want malloc'ed memory
						 * to be replaced with such a
						 * non-anonymous mapping, so
						 * let's go off the optimized
						 * path...
						 */
						vm_map_copy_overwrite_aligned_src_not_internal++;
						vm_object_unlock(new_object);
						goto slow_copy;
					}
#if XNU_TARGET_OS_OSX
					if (new_object->true_share ||
					    new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
						/*
						 * Same if there's a "true_share"
						 * object in the shadow chain, or
						 * an object with a non-default
						 * (SYMMETRIC) copy strategy.
						 */
						vm_map_copy_overwrite_aligned_src_not_symmetric++;
						vm_object_unlock(new_object);
						goto slow_copy;
					}
#endif /* XNU_TARGET_OS_OSX */
					vm_object_unlock(new_object);
				}
				/*
				 * The new mapping is still backed by
				 * anonymous (internal) memory, so it's
				 * OK to substitute it for the original
				 * malloc() mapping.
				 */
			}

			if (old_object != VM_OBJECT_NULL) {
				/*
				 * Tear down the old backing store: unnest /
				 * clean submaps, or flush pmap mappings and
				 * drop the reference on a regular object.
				 */
				assert(!entry->vme_permanent);
				if (entry->is_sub_map) {
					if (entry->use_pmap) {
#ifndef NO_NESTED_PMAP
						pmap_unnest(dst_map->pmap,
						    (addr64_t)entry->vme_start,
						    entry->vme_end - entry->vme_start);
#endif  /* NO_NESTED_PMAP */
						if (dst_map->mapped_in_other_pmaps) {
							/* clean up parent */
							/* map/maps */
							vm_map_submap_pmap_clean(
								dst_map, entry->vme_start,
								entry->vme_end,
								VME_SUBMAP(entry),
								VME_OFFSET(entry));
						}
					} else {
						vm_map_submap_pmap_clean(
							dst_map, entry->vme_start,
							entry->vme_end,
							VME_SUBMAP(entry),
							VME_OFFSET(entry));
					}
					vm_map_deallocate(VME_SUBMAP(entry));
				} else {
					if (dst_map->mapped_in_other_pmaps) {
						vm_object_pmap_protect_options(
							VME_OBJECT(entry),
							VME_OFFSET(entry),
							entry->vme_end
							- entry->vme_start,
							PMAP_NULL,
							PAGE_SIZE,
							entry->vme_start,
							VM_PROT_NONE,
							PMAP_OPTIONS_REMOVE);
					} else {
						pmap_remove_options(
							dst_map->pmap,
							(addr64_t)(entry->vme_start),
							(addr64_t)(entry->vme_end),
							PMAP_OPTIONS_REMOVE);
					}
					vm_object_deallocate(old_object);
				}
			}

			if (entry->iokit_acct) {
				/* keep using iokit accounting */
				entry->use_pmap = FALSE;
			} else {
				/* use pmap accounting */
				entry->use_pmap = TRUE;
			}
			assert(!entry->vme_permanent);
			/* Install the copy entry's object/offset in place. */
			VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
			object = VME_OBJECT(entry);
			entry->needs_copy = copy_entry->needs_copy;
			entry->wired_count = 0;
			entry->user_wired_count = 0;
			offset = VME_OFFSET(copy_entry);
			VME_OFFSET_SET(entry, offset);

			vm_map_copy_entry_unlink(copy, copy_entry);
			vm_map_copy_entry_dispose(copy_entry);

			/*
			 * we could try to push pages into the pmap at this point, BUT
			 * this optimization only saved on average 2 us per page if ALL
			 * the pages in the source were currently mapped
			 * and ALL the pages in the dest were touched, if there were fewer
			 * than 2/3 of the pages touched, this optimization actually cost more cycles
			 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
			 */

			/*
			 *	Set up for the next iteration.  The map
			 *	has not been unlocked, so the next
			 *	address should be at the end of this
			 *	entry, and the next map entry should be
			 *	the one following it.
			 */

			start = tmp_entry->vme_end;
			tmp_entry = tmp_entry->vme_next;
		} else {
			/* Slow path: physical copy via vm_fault_copy(). */
			vm_map_version_t        version;
			vm_object_t             dst_object;
			vm_object_offset_t      dst_offset;
			kern_return_t           r;

slow_copy:
			if (entry->needs_copy) {
				/* Resolve copy-on-write before writing. */
				VME_OBJECT_SHADOW(entry,
				    (entry->vme_end -
				    entry->vme_start),
				    vm_map_always_shadow(dst_map));
				entry->needs_copy = FALSE;
			}

			dst_object = VME_OBJECT(entry);
			dst_offset = VME_OFFSET(entry);

			/*
			 *	Take an object reference, and record
			 *	the map version information so that the
			 *	map can be safely unlocked.
			 */

			if (dst_object == VM_OBJECT_NULL) {
				/*
				 * We would usually have just taken the
				 * optimized path above if the destination
				 * object has not been allocated yet.  But we
				 * now disable that optimization if the copy
				 * entry's object is not backed by anonymous
				 * memory to avoid replacing malloc'ed
				 * (i.e. re-usable) anonymous memory with a
				 * not-so-anonymous mapping.
				 * So we have to handle this case here and
				 * allocate a new VM object for this map entry.
				 */
				dst_object = vm_object_allocate(
					entry->vme_end - entry->vme_start);
				dst_offset = 0;
				VME_OBJECT_SET(entry, dst_object, false, 0);
				VME_OFFSET_SET(entry, dst_offset);
				assert(entry->use_pmap);
			}

			vm_object_reference(dst_object);

			/* account for unlock bumping up timestamp */
			version.main_timestamp = dst_map->timestamp + 1;

			vm_map_unlock(dst_map);

			/*
			 *	Copy as much as possible in one pass
			 */

			copy_size = size;
			r = vm_fault_copy(
				VME_OBJECT(copy_entry),
				VME_OFFSET(copy_entry),
				&copy_size,
				dst_object,
				dst_offset,
				dst_map,
				&version,
				THREAD_UNINT );

			/*
			 *	Release the object reference
			 */

			vm_object_deallocate(dst_object);

			/*
			 *	If a hard error occurred, return it now
			 */

			if (r != KERN_SUCCESS) {
				return r;
			}

			if (copy_size != 0) {
				/*
				 *	Dispose of the copied region
				 */

				vm_map_copy_clip_end(copy, copy_entry,
				    copy_entry->vme_start + copy_size);
				vm_map_copy_entry_unlink(copy, copy_entry);
				vm_object_deallocate(VME_OBJECT(copy_entry));
				vm_map_copy_entry_dispose(copy_entry);
			}

			/*
			 *	Pick up in the destination map where we left off.
			 *
			 *	Use the version information to avoid a lookup
			 *	in the normal case.
			 */

			start += copy_size;
			vm_map_lock(dst_map);
			if (version.main_timestamp == dst_map->timestamp &&
			    copy_size != 0) {
				/* We can safely use saved tmp_entry value */

				if (tmp_entry->map_aligned &&
				    !VM_MAP_PAGE_ALIGNED(
					    start,
					    VM_MAP_PAGE_MASK(dst_map))) {
					/* no longer map-aligned */
					tmp_entry->map_aligned = FALSE;
				}
				vm_map_clip_end(dst_map, tmp_entry, start);
				tmp_entry = tmp_entry->vme_next;
			} else {
				/* Must do lookup of tmp_entry */

RetryLookup:
				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
					vm_map_unlock(dst_map);
					return KERN_INVALID_ADDRESS;
				}
				if (tmp_entry->map_aligned &&
				    !VM_MAP_PAGE_ALIGNED(
					    start,
					    VM_MAP_PAGE_MASK(dst_map))) {
					/* no longer map-aligned */
					tmp_entry->map_aligned = FALSE;
				}
				vm_map_clip_start(dst_map, tmp_entry, start);
			}
		}
	}/* while */

	return KERN_SUCCESS;
}/* vm_map_copy_overwrite_aligned */
11074 
11075 /*
11076  *	Routine: vm_map_copyin_kernel_buffer [internal use only]
11077  *
11078  *	Description:
11079  *		Copy in data to a kernel buffer from space in the
11080  *		source map. The original space may be optionally
11081  *		deallocated.
11082  *
11083  *		If successful, returns a new copy object.
11084  */
11085 static kern_return_t
vm_map_copyin_kernel_buffer(vm_map_t src_map,vm_map_offset_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)11086 vm_map_copyin_kernel_buffer(
11087 	vm_map_t        src_map,
11088 	vm_map_offset_t src_addr,
11089 	vm_map_size_t   len,
11090 	boolean_t       src_destroy,
11091 	vm_map_copy_t   *copy_result)
11092 {
11093 	kern_return_t kr;
11094 	vm_map_copy_t copy;
11095 	void *kdata;
11096 
11097 	if (len > msg_ool_size_small) {
11098 		return KERN_INVALID_ARGUMENT;
11099 	}
11100 
11101 	kdata = kalloc_data(len, Z_WAITOK);
11102 	if (kdata == NULL) {
11103 		return KERN_RESOURCE_SHORTAGE;
11104 	}
11105 	kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
11106 	if (kr != KERN_SUCCESS) {
11107 		kfree_data(kdata, len);
11108 		return kr;
11109 	}
11110 
11111 	copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11112 	copy->cpy_kdata = kdata;
11113 	copy->size = len;
11114 	copy->offset = 0;
11115 
11116 	if (src_destroy) {
11117 		vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11118 
11119 		if (src_map == kernel_map) {
11120 			flags |= VM_MAP_REMOVE_KUNWIRE;
11121 		}
11122 
11123 		(void)vm_map_remove_guard(src_map,
11124 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11125 		    vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11126 		    flags, KMEM_GUARD_NONE);
11127 	}
11128 
11129 	*copy_result = copy;
11130 	return KERN_SUCCESS;
11131 }
11132 
11133 /*
11134  *	Routine: vm_map_copyout_kernel_buffer	[internal use only]
11135  *
11136  *	Description:
11137  *		Copy out data from a kernel buffer into space in the
 *		destination map. The space may be optionally dynamically
11139  *		allocated.
11140  *
11141  *		If successful, consumes the copy object.
11142  *		Otherwise, the caller is responsible for it.
11143  *
11144  *		Callers of this function must call vm_map_copy_require on
11145  *		previously created vm_map_copy_t or pass a newly created
11146  *		one to ensure that it hasn't been forged.
11147  */
/* Debug counter: number of copyout() failures into a foreign map. */
static int vm_map_copyout_kernel_buffer_failures = 0;
static kern_return_t
vm_map_copyout_kernel_buffer(
	vm_map_t                map,
	vm_map_address_t        *addr,  /* IN/OUT */
	vm_map_copy_t           copy,
	vm_map_size_t           copy_size,
	boolean_t               overwrite,
	boolean_t               consume_on_success)
{
	kern_return_t kr = KERN_SUCCESS;
	thread_t thread = current_thread();

	assert(copy->size == copy_size);

	/*
	 * check for corrupted vm_map_copy structure
	 */
	if (copy_size > msg_ool_size_small || copy->offset) {
		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
		    (long long)copy->size, (long long)copy->offset);
	}

	if (!overwrite) {
		/*
		 * Allocate space in the target map for the data
		 */
		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();

		if (map == kernel_map) {
			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
		}

		*addr = 0;
		kr = vm_map_enter(map,
		    addr,
		    vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(map)),
		    (vm_map_offset_t) 0,
		    vmk_flags,
		    VM_OBJECT_NULL,
		    (vm_object_offset_t) 0,
		    FALSE,
		    VM_PROT_DEFAULT,
		    VM_PROT_ALL,
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
#if KASAN
		if (map->pmap == kernel_pmap) {
			/* tell KASan about the newly valid kernel range */
			kasan_notify_address(*addr, copy->size);
		}
#endif
	}

	/*
	 * Copyout the data from the kernel buffer to the target map.
	 */
	if (thread->map == map) {
		/*
		 * If the target map is the current map, just do
		 * the copy.
		 */
		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_t oldmap;

		/*
		 * If the target map is another map, assume the
		 * target's address space identity for the duration
		 * of the copy.
		 */
		vm_map_reference(map);
		oldmap = vm_map_switch(map);

		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			vm_map_copyout_kernel_buffer_failures++;
			kr = KERN_INVALID_ADDRESS;
		}

		/* restore the original address space and drop our ref */
		(void) vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}

	if (kr != KERN_SUCCESS) {
		/* the copy failed, clean up */
		if (!overwrite) {
			/*
			 * Deallocate the space we allocated in the target map.
			 */
			(void) vm_map_remove(map,
			    vm_map_trunc_page(*addr,
			    VM_MAP_PAGE_MASK(map)),
			    vm_map_round_page((*addr +
			    vm_map_round_page(copy_size,
			    VM_MAP_PAGE_MASK(map))),
			    VM_MAP_PAGE_MASK(map)));
			*addr = 0;
		}
	} else {
		/* copy was successful, discard the copy structure */
		if (consume_on_success) {
			kfree_data(copy->cpy_kdata, copy_size);
			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
		}
	}

	return kr;
}
11262 
11263 /*
11264  *	Routine:	vm_map_copy_insert      [internal use only]
11265  *
11266  *	Description:
11267  *		Link a copy chain ("copy") into a map at the
11268  *		specified location (after "where").
11269  *
11270  *		Callers of this function must call vm_map_copy_require on
11271  *		previously created vm_map_copy_t or pass a newly created
11272  *		one to ensure that it hasn't been forged.
11273  *	Side effects:
11274  *		The copy chain is destroyed.
11275  */
11276 static void
vm_map_copy_insert(vm_map_t map,vm_map_entry_t after_where,vm_map_copy_t copy)11277 vm_map_copy_insert(
11278 	vm_map_t        map,
11279 	vm_map_entry_t  after_where,
11280 	vm_map_copy_t   copy)
11281 {
11282 	vm_map_entry_t  entry;
11283 
11284 	while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11285 		entry = vm_map_copy_first_entry(copy);
11286 		vm_map_copy_entry_unlink(copy, entry);
11287 		vm_map_store_entry_link(map, after_where, entry,
11288 		    VM_MAP_KERNEL_FLAGS_NONE);
11289 		after_where = entry;
11290 	}
11291 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11292 }
11293 
11294 /*
11295  * Callers of this function must call vm_map_copy_require on
11296  * previously created vm_map_copy_t or pass a newly created
11297  * one to ensure that it hasn't been forged.
11298  */
11299 void
vm_map_copy_remap(vm_map_t map,vm_map_entry_t where,vm_map_copy_t copy,vm_map_offset_t adjustment,vm_prot_t cur_prot,vm_prot_t max_prot,vm_inherit_t inheritance)11300 vm_map_copy_remap(
11301 	vm_map_t        map,
11302 	vm_map_entry_t  where,
11303 	vm_map_copy_t   copy,
11304 	vm_map_offset_t adjustment,
11305 	vm_prot_t       cur_prot,
11306 	vm_prot_t       max_prot,
11307 	vm_inherit_t    inheritance)
11308 {
11309 	vm_map_entry_t  copy_entry, new_entry;
11310 
11311 	for (copy_entry = vm_map_copy_first_entry(copy);
11312 	    copy_entry != vm_map_copy_to_entry(copy);
11313 	    copy_entry = copy_entry->vme_next) {
11314 		/* get a new VM map entry for the map */
11315 		new_entry = vm_map_entry_create(map);
11316 		/* copy the "copy entry" to the new entry */
11317 		vm_map_entry_copy(map, new_entry, copy_entry);
11318 		/* adjust "start" and "end" */
11319 		new_entry->vme_start += adjustment;
11320 		new_entry->vme_end += adjustment;
11321 		/* clear some attributes */
11322 		new_entry->inheritance = inheritance;
11323 		new_entry->protection = cur_prot;
11324 		new_entry->max_protection = max_prot;
11325 		new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11326 		/* take an extra reference on the entry's "object" */
11327 		if (new_entry->is_sub_map) {
11328 			assert(!new_entry->use_pmap); /* not nested */
11329 			vm_map_reference(VME_SUBMAP(new_entry));
11330 		} else {
11331 			vm_object_reference(VME_OBJECT(new_entry));
11332 		}
11333 		/* insert the new entry in the map */
11334 		vm_map_store_entry_link(map, where, new_entry,
11335 		    VM_MAP_KERNEL_FLAGS_NONE);
11336 		/* continue inserting the "copy entries" after the new entry */
11337 		where = new_entry;
11338 	}
11339 }
11340 
11341 
11342 /*
11343  * Returns true if *size matches (or is in the range of) copy->size.
11344  * Upon returning true, the *size field is updated with the actual size of the
11345  * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11346  */
11347 boolean_t
vm_map_copy_validate_size(vm_map_t dst_map,vm_map_copy_t copy,vm_map_size_t * size)11348 vm_map_copy_validate_size(
11349 	vm_map_t                dst_map,
11350 	vm_map_copy_t           copy,
11351 	vm_map_size_t           *size)
11352 {
11353 	if (copy == VM_MAP_COPY_NULL) {
11354 		return FALSE;
11355 	}
11356 
11357 	/*
11358 	 * Assert that the vm_map_copy is coming from the right
11359 	 * zone and hasn't been forged
11360 	 */
11361 	vm_map_copy_require(copy);
11362 
11363 	vm_map_size_t copy_sz = copy->size;
11364 	vm_map_size_t sz = *size;
11365 	switch (copy->type) {
11366 	case VM_MAP_COPY_KERNEL_BUFFER:
11367 		if (sz == copy_sz) {
11368 			return TRUE;
11369 		}
11370 		break;
11371 	case VM_MAP_COPY_ENTRY_LIST:
11372 		/*
11373 		 * potential page-size rounding prevents us from exactly
11374 		 * validating this flavor of vm_map_copy, but we can at least
11375 		 * assert that it's within a range.
11376 		 */
11377 		if (copy_sz >= sz &&
11378 		    copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11379 			*size = copy_sz;
11380 			return TRUE;
11381 		}
11382 		break;
11383 	default:
11384 		break;
11385 	}
11386 	return FALSE;
11387 }
11388 
11389 /*
11390  *	Routine:	vm_map_copyout_size
11391  *
11392  *	Description:
11393  *		Copy out a copy chain ("copy") into newly-allocated
11394  *		space in the destination map. Uses a prevalidated
11395  *		size for the copy object (vm_map_copy_validate_size).
11396  *
11397  *		If successful, consumes the copy object.
11398  *		Otherwise, the caller is responsible for it.
11399  */
11400 kern_return_t
vm_map_copyout_size(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_t copy_size)11401 vm_map_copyout_size(
11402 	vm_map_t                dst_map,
11403 	vm_map_address_t        *dst_addr,      /* OUT */
11404 	vm_map_copy_t           copy,
11405 	vm_map_size_t           copy_size)
11406 {
11407 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11408 	           TRUE,                     /* consume_on_success */
11409 	           VM_PROT_DEFAULT,
11410 	           VM_PROT_ALL,
11411 	           VM_INHERIT_DEFAULT);
11412 }
11413 
11414 /*
11415  *	Routine:	vm_map_copyout
11416  *
11417  *	Description:
11418  *		Copy out a copy chain ("copy") into newly-allocated
11419  *		space in the destination map.
11420  *
11421  *		If successful, consumes the copy object.
11422  *		Otherwise, the caller is responsible for it.
11423  */
11424 kern_return_t
vm_map_copyout(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy)11425 vm_map_copyout(
11426 	vm_map_t                dst_map,
11427 	vm_map_address_t        *dst_addr,      /* OUT */
11428 	vm_map_copy_t           copy)
11429 {
11430 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11431 	           TRUE,                     /* consume_on_success */
11432 	           VM_PROT_DEFAULT,
11433 	           VM_PROT_ALL,
11434 	           VM_INHERIT_DEFAULT);
11435 }
11436 
11437 kern_return_t
vm_map_copyout_internal(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_t copy_size,boolean_t consume_on_success,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)11438 vm_map_copyout_internal(
11439 	vm_map_t                dst_map,
11440 	vm_map_address_t        *dst_addr,      /* OUT */
11441 	vm_map_copy_t           copy,
11442 	vm_map_size_t           copy_size,
11443 	boolean_t               consume_on_success,
11444 	vm_prot_t               cur_protection,
11445 	vm_prot_t               max_protection,
11446 	vm_inherit_t            inheritance)
11447 {
11448 	vm_map_size_t           size;
11449 	vm_map_size_t           adjustment;
11450 	vm_map_offset_t         start;
11451 	vm_object_offset_t      vm_copy_start;
11452 	vm_map_entry_t          last;
11453 	vm_map_entry_t          entry;
11454 	vm_map_copy_t           original_copy;
11455 	kern_return_t           kr;
11456 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11457 
11458 	/*
11459 	 *	Check for null copy object.
11460 	 */
11461 
11462 	if (copy == VM_MAP_COPY_NULL) {
11463 		*dst_addr = 0;
11464 		return KERN_SUCCESS;
11465 	}
11466 
11467 	/*
11468 	 * Assert that the vm_map_copy is coming from the right
11469 	 * zone and hasn't been forged
11470 	 */
11471 	vm_map_copy_require(copy);
11472 
11473 	if (copy->size != copy_size) {
11474 		*dst_addr = 0;
11475 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR), KERN_FAILURE /* arg */);
11476 		return KERN_FAILURE;
11477 	}
11478 
11479 	/*
11480 	 *	Check for special kernel buffer allocated
11481 	 *	by new_ipc_kmsg_copyin.
11482 	 */
11483 
11484 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11485 		kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11486 		    copy, copy_size, FALSE,
11487 		    consume_on_success);
11488 		if (kr) {
11489 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
11490 		}
11491 		return kr;
11492 	}
11493 
11494 	original_copy = copy;
11495 	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11496 		vm_map_copy_t target_copy;
11497 		vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11498 
11499 		target_copy = VM_MAP_COPY_NULL;
11500 		DEBUG4K_ADJUST("adjusting...\n");
11501 		kr = vm_map_copy_adjust_to_target(
11502 			copy,
11503 			0, /* offset */
11504 			copy->size, /* size */
11505 			dst_map,
11506 			TRUE, /* copy */
11507 			&target_copy,
11508 			&overmap_start,
11509 			&overmap_end,
11510 			&trimmed_start);
11511 		if (kr != KERN_SUCCESS) {
11512 			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11513 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
11514 			return kr;
11515 		}
11516 		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11517 		if (target_copy != copy) {
11518 			copy = target_copy;
11519 		}
11520 		copy_size = copy->size;
11521 	}
11522 
11523 	/*
11524 	 *	Find space for the data
11525 	 */
11526 
11527 	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11528 	    VM_MAP_COPY_PAGE_MASK(copy));
11529 	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11530 	    VM_MAP_COPY_PAGE_MASK(copy))
11531 	    - vm_copy_start;
11532 
11533 	vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map);
11534 
11535 	vm_map_lock(dst_map);
11536 	kr = vm_map_locate_space(dst_map, size, 0, vmk_flags,
11537 	    &start, &last);
11538 	if (kr != KERN_SUCCESS) {
11539 		vm_map_unlock(dst_map);
11540 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
11541 		return kr;
11542 	}
11543 
11544 	adjustment = start - vm_copy_start;
11545 	if (!consume_on_success) {
11546 		/*
11547 		 * We're not allowed to consume "copy", so we'll have to
11548 		 * copy its map entries into the destination map below.
11549 		 * No need to re-allocate map entries from the correct
11550 		 * (pageable or not) zone, since we'll get new map entries
11551 		 * during the transfer.
		 * We'll also adjust the map entries' "start" and "end"
11553 		 * during the transfer, to keep "copy"'s entries consistent
11554 		 * with its "offset".
11555 		 */
11556 		goto after_adjustments;
11557 	}
11558 
11559 	/*
11560 	 *	Since we're going to just drop the map
11561 	 *	entries from the copy into the destination
11562 	 *	map, they must come from the same pool.
11563 	 */
11564 
11565 	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11566 		/*
11567 		 * Mismatches occur when dealing with the default
11568 		 * pager.
11569 		 */
11570 		vm_map_entry_t  next, new;
11571 
11572 		/*
11573 		 * Find the zone that the copies were allocated from
11574 		 */
11575 
11576 		entry = vm_map_copy_first_entry(copy);
11577 
11578 		/*
11579 		 * Reinitialize the copy so that vm_map_copy_entry_link
11580 		 * will work.
11581 		 */
11582 		vm_map_store_copy_reset(copy, entry);
11583 		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11584 
11585 		/*
11586 		 * Copy each entry.
11587 		 */
11588 		while (entry != vm_map_copy_to_entry(copy)) {
11589 			new = vm_map_copy_entry_create(copy);
11590 			vm_map_entry_copy_full(new, entry);
11591 			new->vme_no_copy_on_read = FALSE;
11592 			assert(!new->iokit_acct);
11593 			if (new->is_sub_map) {
11594 				/* clr address space specifics */
11595 				new->use_pmap = FALSE;
11596 			}
11597 			vm_map_copy_entry_link(copy,
11598 			    vm_map_copy_last_entry(copy),
11599 			    new);
11600 			next = entry->vme_next;
11601 			vm_map_entry_dispose(entry);
11602 			entry = next;
11603 		}
11604 	}
11605 
11606 	/*
11607 	 *	Adjust the addresses in the copy chain, and
11608 	 *	reset the region attributes.
11609 	 */
11610 
11611 	for (entry = vm_map_copy_first_entry(copy);
11612 	    entry != vm_map_copy_to_entry(copy);
11613 	    entry = entry->vme_next) {
11614 		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11615 			/*
11616 			 * We're injecting this copy entry into a map that
11617 			 * has the standard page alignment, so clear
11618 			 * "map_aligned" (which might have been inherited
11619 			 * from the original map entry).
11620 			 */
11621 			entry->map_aligned = FALSE;
11622 		}
11623 
11624 		entry->vme_start += adjustment;
11625 		entry->vme_end += adjustment;
11626 
11627 		if (entry->map_aligned) {
11628 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11629 			    VM_MAP_PAGE_MASK(dst_map)));
11630 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11631 			    VM_MAP_PAGE_MASK(dst_map)));
11632 		}
11633 
11634 		entry->inheritance = VM_INHERIT_DEFAULT;
11635 		entry->protection = VM_PROT_DEFAULT;
11636 		entry->max_protection = VM_PROT_ALL;
11637 		entry->behavior = VM_BEHAVIOR_DEFAULT;
11638 
11639 		/*
11640 		 * If the entry is now wired,
11641 		 * map the pages into the destination map.
11642 		 */
11643 		if (entry->wired_count != 0) {
11644 			vm_map_offset_t va;
11645 			vm_object_offset_t       offset;
11646 			vm_object_t object;
11647 			vm_prot_t prot;
11648 			int     type_of_fault;
11649 			uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
11650 
11651 			/* TODO4K would need to use actual page size */
11652 			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11653 
11654 			object = VME_OBJECT(entry);
11655 			offset = VME_OFFSET(entry);
11656 			va = entry->vme_start;
11657 
11658 			pmap_pageable(dst_map->pmap,
11659 			    entry->vme_start,
11660 			    entry->vme_end,
11661 			    TRUE);
11662 
11663 			while (va < entry->vme_end) {
11664 				vm_page_t       m;
11665 				struct vm_object_fault_info fault_info = {};
11666 
11667 				/*
11668 				 * Look up the page in the object.
11669 				 * Assert that the page will be found in the
11670 				 * top object:
11671 				 * either
11672 				 *	the object was newly created by
11673 				 *	vm_object_copy_slowly, and has
11674 				 *	copies of all of the pages from
11675 				 *	the source object
11676 				 * or
11677 				 *	the object was moved from the old
11678 				 *	map entry; because the old map
11679 				 *	entry was wired, all of the pages
11680 				 *	were in the top-level object.
11681 				 *	(XXX not true if we wire pages for
11682 				 *	 reading)
11683 				 */
11684 				vm_object_lock(object);
11685 
11686 				m = vm_page_lookup(object, offset);
11687 				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11688 				    m->vmp_absent) {
11689 					panic("vm_map_copyout: wiring %p", m);
11690 				}
11691 
11692 				prot = entry->protection;
11693 
11694 				if (override_nx(dst_map, VME_ALIAS(entry)) &&
11695 				    prot) {
11696 					prot |= VM_PROT_EXECUTE;
11697 				}
11698 
11699 				type_of_fault = DBG_CACHE_HIT_FAULT;
11700 
11701 				fault_info.user_tag = VME_ALIAS(entry);
11702 				fault_info.pmap_options = 0;
11703 				if (entry->iokit_acct ||
11704 				    (!entry->is_sub_map && !entry->use_pmap)) {
11705 					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11706 				}
11707 				if (entry->vme_xnu_user_debug &&
11708 				    !VM_PAGE_OBJECT(m)->code_signed) {
11709 					/*
11710 					 * Modified code-signed executable
11711 					 * region: this page does not belong
11712 					 * to a code-signed VM object, so it
11713 					 * must have been copied and should
11714 					 * therefore be typed XNU_USER_DEBUG
11715 					 * rather than XNU_USER_EXEC.
11716 					 */
11717 					fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
11718 				}
11719 
11720 				vm_fault_enter(m,
11721 				    dst_map->pmap,
11722 				    va,
11723 				    PAGE_SIZE, 0,
11724 				    prot,
11725 				    prot,
11726 				    VM_PAGE_WIRED(m),
11727 				    FALSE,            /* change_wiring */
11728 				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
11729 				    &fault_info,
11730 				    NULL,             /* need_retry */
11731 				    &type_of_fault,
11732 				    &object_lock_type); /*Exclusive mode lock. Will remain unchanged.*/
11733 
11734 				vm_object_unlock(object);
11735 
11736 				offset += PAGE_SIZE_64;
11737 				va += PAGE_SIZE;
11738 			}
11739 		}
11740 	}
11741 
11742 after_adjustments:
11743 
11744 	/*
11745 	 *	Correct the page alignment for the result
11746 	 */
11747 
11748 	*dst_addr = start + (copy->offset - vm_copy_start);
11749 
11750 #if KASAN
11751 	kasan_notify_address(*dst_addr, size);
11752 #endif
11753 
11754 	/*
11755 	 *	Update the hints and the map size
11756 	 */
11757 
11758 	if (consume_on_success) {
11759 		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11760 	} else {
11761 		SAVE_HINT_MAP_WRITE(dst_map, last);
11762 	}
11763 
11764 	dst_map->size += size;
11765 
11766 	/*
11767 	 *	Link in the copy
11768 	 */
11769 
11770 	if (consume_on_success) {
11771 		vm_map_copy_insert(dst_map, last, copy);
11772 		if (copy != original_copy) {
11773 			vm_map_copy_discard(original_copy);
11774 			original_copy = VM_MAP_COPY_NULL;
11775 		}
11776 	} else {
11777 		vm_map_copy_remap(dst_map, last, copy, adjustment,
11778 		    cur_protection, max_protection,
11779 		    inheritance);
11780 		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11781 			vm_map_copy_discard(copy);
11782 			copy = original_copy;
11783 		}
11784 	}
11785 
11786 
11787 	vm_map_unlock(dst_map);
11788 
11789 	/*
11790 	 * XXX	If wiring_required, call vm_map_pageable
11791 	 */
11792 
11793 	return KERN_SUCCESS;
11794 }
11795 
11796 /*
11797  *	Routine:	vm_map_copyin
11798  *
11799  *	Description:
11800  *		see vm_map_copyin_common.  Exported via Unsupported.exports.
11801  *
11802  */
11803 
11804 #undef vm_map_copyin
11805 
11806 kern_return_t
vm_map_copyin(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)11807 vm_map_copyin(
11808 	vm_map_t                        src_map,
11809 	vm_map_address_t        src_addr,
11810 	vm_map_size_t           len,
11811 	boolean_t                       src_destroy,
11812 	vm_map_copy_t           *copy_result)   /* OUT */
11813 {
11814 	return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11815 	           FALSE, copy_result, FALSE);
11816 }
11817 
11818 /*
11819  *	Routine:	vm_map_copyin_common
11820  *
11821  *	Description:
11822  *		Copy the specified region (src_addr, len) from the
11823  *		source address space (src_map), possibly removing
11824  *		the region from the source address space (src_destroy).
11825  *
11826  *	Returns:
11827  *		A vm_map_copy_t object (copy_result), suitable for
11828  *		insertion into another address space (using vm_map_copyout),
11829  *		copying over another address space region (using
11830  *		vm_map_copy_overwrite).  If the copy is unused, it
11831  *		should be destroyed (using vm_map_copy_discard).
11832  *
11833  *	In/out conditions:
11834  *		The source map should not be locked on entry.
11835  */
11836 
/*
 * One level of a stack of enclosing maps, built up by
 * vm_map_copyin_internal() as it descends through nested submaps,
 * so that the traversal can be unwound (and the per-level
 * references released) on completion or error.
 */
typedef struct submap_map {
	vm_map_t        parent_map;     /* map we descended from */
	vm_map_offset_t base_start;     /* saved "src_start" in parent_map */
	vm_map_offset_t base_end;       /* saved "src_end" in parent_map */
	vm_map_size_t   base_len;       /* length of the range covered via the submap */
	struct submap_map *next;        /* next (outer) level, or NULL at the base */
} submap_map_t;
11844 
11845 kern_return_t
vm_map_copyin_common(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,__unused boolean_t src_volatile,vm_map_copy_t * copy_result,boolean_t use_maxprot)11846 vm_map_copyin_common(
11847 	vm_map_t        src_map,
11848 	vm_map_address_t src_addr,
11849 	vm_map_size_t   len,
11850 	boolean_t       src_destroy,
11851 	__unused boolean_t      src_volatile,
11852 	vm_map_copy_t   *copy_result,   /* OUT */
11853 	boolean_t       use_maxprot)
11854 {
11855 	int flags;
11856 
11857 	flags = 0;
11858 	if (src_destroy) {
11859 		flags |= VM_MAP_COPYIN_SRC_DESTROY;
11860 	}
11861 	if (use_maxprot) {
11862 		flags |= VM_MAP_COPYIN_USE_MAXPROT;
11863 	}
11864 	return vm_map_copyin_internal(src_map,
11865 	           src_addr,
11866 	           len,
11867 	           flags,
11868 	           copy_result);
11869 }
11870 kern_return_t
vm_map_copyin_internal(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,int flags,vm_map_copy_t * copy_result)11871 vm_map_copyin_internal(
11872 	vm_map_t        src_map,
11873 	vm_map_address_t src_addr,
11874 	vm_map_size_t   len,
11875 	int             flags,
11876 	vm_map_copy_t   *copy_result)   /* OUT */
11877 {
11878 	vm_map_entry_t  tmp_entry;      /* Result of last map lookup --
11879 	                                 * in multi-level lookup, this
11880 	                                 * entry contains the actual
11881 	                                 * vm_object/offset.
11882 	                                 */
11883 	vm_map_entry_t  new_entry = VM_MAP_ENTRY_NULL;  /* Map entry for copy */
11884 
11885 	vm_map_offset_t src_start;      /* Start of current entry --
11886 	                                 * where copy is taking place now
11887 	                                 */
11888 	vm_map_offset_t src_end;        /* End of entire region to be
11889 	                                 * copied */
11890 	vm_map_offset_t src_base;
11891 	vm_map_t        base_map = src_map;
11892 	boolean_t       map_share = FALSE;
11893 	submap_map_t    *parent_maps = NULL;
11894 
11895 	vm_map_copy_t   copy;           /* Resulting copy */
11896 	vm_map_address_t copy_addr;
11897 	vm_map_size_t   copy_size;
11898 	boolean_t       src_destroy;
11899 	boolean_t       use_maxprot;
11900 	boolean_t       preserve_purgeable;
11901 	boolean_t       entry_was_shared;
11902 	vm_map_entry_t  saved_src_entry;
11903 
11904 	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11905 		return KERN_INVALID_ARGUMENT;
11906 	}
11907 
11908 #if CONFIG_KERNEL_TAGGING
11909 	if (src_map->pmap == kernel_pmap) {
11910 		src_addr = vm_memtag_canonicalize_address(src_addr);
11911 	}
11912 #endif /* CONFIG_KERNEL_TAGGING */
11913 
11914 	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
11915 	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
11916 	preserve_purgeable =
11917 	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
11918 
11919 	/*
11920 	 *	Check for copies of zero bytes.
11921 	 */
11922 
11923 	if (len == 0) {
11924 		*copy_result = VM_MAP_COPY_NULL;
11925 		return KERN_SUCCESS;
11926 	}
11927 
11928 	/*
11929 	 *	Check that the end address doesn't overflow
11930 	 */
11931 	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
11932 		return KERN_INVALID_ADDRESS;
11933 	}
11934 	src_end = src_addr + len;
11935 	if (src_end < src_addr) {
11936 		return KERN_INVALID_ADDRESS;
11937 	}
11938 
11939 	/*
11940 	 *	Compute (page aligned) start and end of region
11941 	 */
11942 	src_start = vm_map_trunc_page(src_addr,
11943 	    VM_MAP_PAGE_MASK(src_map));
11944 	src_end = vm_map_round_page(src_end,
11945 	    VM_MAP_PAGE_MASK(src_map));
11946 	if (src_end < src_addr) {
11947 		return KERN_INVALID_ADDRESS;
11948 	}
11949 
11950 	/*
11951 	 * If the copy is sufficiently small, use a kernel buffer instead
11952 	 * of making a virtual copy.  The theory being that the cost of
11953 	 * setting up VM (and taking C-O-W faults) dominates the copy costs
11954 	 * for small regions.
11955 	 */
11956 	if ((len <= msg_ool_size_small) &&
11957 	    !use_maxprot &&
11958 	    !preserve_purgeable &&
11959 	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
11960 	    /*
11961 	     * Since the "msg_ool_size_small" threshold was increased and
11962 	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
11963 	     * address space limits, we revert to doing a virtual copy if the
11964 	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
11965 	     * of the commpage would now fail when it used to work.
11966 	     */
11967 	    (src_start >= vm_map_min(src_map) &&
11968 	    src_start < vm_map_max(src_map) &&
11969 	    src_end >= vm_map_min(src_map) &&
11970 	    src_end < vm_map_max(src_map))) {
11971 		return vm_map_copyin_kernel_buffer(src_map, src_addr, len,
11972 		           src_destroy, copy_result);
11973 	}
11974 
11975 	/*
11976 	 *	Allocate a header element for the list.
11977 	 *
11978 	 *	Use the start and end in the header to
11979 	 *	remember the endpoints prior to rounding.
11980 	 */
11981 
11982 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
11983 	copy->cpy_hdr.entries_pageable = TRUE;
11984 	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
11985 	copy->offset = src_addr;
11986 	copy->size = len;
11987 
11988 	new_entry = vm_map_copy_entry_create(copy);
11989 
11990 #define RETURN(x)                                               \
11991 	MACRO_BEGIN                                             \
11992 	vm_map_unlock(src_map);                                 \
11993 	if(src_map != base_map)                                 \
11994 	        vm_map_deallocate(src_map);                     \
11995 	if (new_entry != VM_MAP_ENTRY_NULL)                     \
11996 	        vm_map_copy_entry_dispose(new_entry);           \
11997 	vm_map_copy_discard(copy);                              \
11998 	{                                                       \
11999 	        submap_map_t	*_ptr;                          \
12000                                                                 \
12001 	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
12002 	                parent_maps=parent_maps->next;          \
12003 	                if (_ptr->parent_map != base_map)       \
12004 	                        vm_map_deallocate(_ptr->parent_map);    \
12005 	                kfree_type(submap_map_t, _ptr);         \
12006 	        }                                               \
12007 	}                                                       \
12008 	MACRO_RETURN(x);                                        \
12009 	MACRO_END
12010 
12011 	/*
12012 	 *	Find the beginning of the region.
12013 	 */
12014 
12015 	vm_map_lock(src_map);
12016 
12017 	/*
12018 	 * Lookup the original "src_addr" rather than the truncated
12019 	 * "src_start", in case "src_start" falls in a non-map-aligned
12020 	 * map entry *before* the map entry that contains "src_addr"...
12021 	 */
12022 	if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) {
12023 		RETURN(KERN_INVALID_ADDRESS);
12024 	}
12025 	if (!tmp_entry->is_sub_map) {
12026 		/*
12027 		 * ... but clip to the map-rounded "src_start" rather than
12028 		 * "src_addr" to preserve map-alignment.  We'll adjust the
12029 		 * first copy entry at the end, if needed.
12030 		 */
12031 		vm_map_clip_start(src_map, tmp_entry, src_start);
12032 	}
12033 	if (src_start < tmp_entry->vme_start) {
12034 		/*
12035 		 * Move "src_start" up to the start of the
12036 		 * first map entry to copy.
12037 		 */
12038 		src_start = tmp_entry->vme_start;
12039 	}
12040 	/* set for later submap fix-up */
12041 	copy_addr = src_start;
12042 
12043 	/*
12044 	 *	Go through entries until we get to the end.
12045 	 */
12046 
12047 	while (TRUE) {
12048 		vm_map_entry_t  src_entry = tmp_entry;  /* Top-level entry */
12049 		vm_map_size_t   src_size;               /* Size of source
12050 		                                         * map entry (in both
12051 		                                         * maps)
12052 		                                         */
12053 
12054 		vm_object_t             src_object;     /* Object to copy */
12055 		vm_object_offset_t      src_offset;
12056 
12057 		vm_object_t             new_copy_object;/* vm_object_copy_* result */
12058 
12059 		boolean_t       src_needs_copy;         /* Should source map
12060 		                                         * be made read-only
12061 		                                         * for copy-on-write?
12062 		                                         */
12063 
12064 		boolean_t       new_entry_needs_copy;   /* Will new entry be COW? */
12065 
12066 		boolean_t       was_wired;              /* Was source wired? */
12067 		boolean_t       saved_used_for_jit;     /* Saved used_for_jit. */
12068 		vm_map_version_t version;               /* Version before locks
12069 		                                         * dropped to make copy
12070 		                                         */
12071 		kern_return_t   result;                 /* Return value from
12072 		                                         * copy_strategically.
12073 		                                         */
12074 		while (tmp_entry->is_sub_map) {
12075 			vm_map_size_t submap_len;
12076 			submap_map_t *ptr;
12077 
12078 			ptr = kalloc_type(submap_map_t, Z_WAITOK);
12079 			ptr->next = parent_maps;
12080 			parent_maps = ptr;
12081 			ptr->parent_map = src_map;
12082 			ptr->base_start = src_start;
12083 			ptr->base_end = src_end;
12084 			submap_len = tmp_entry->vme_end - src_start;
12085 			if (submap_len > (src_end - src_start)) {
12086 				submap_len = src_end - src_start;
12087 			}
12088 			ptr->base_len = submap_len;
12089 
12090 			src_start -= tmp_entry->vme_start;
12091 			src_start += VME_OFFSET(tmp_entry);
12092 			src_end = src_start + submap_len;
12093 			src_map = VME_SUBMAP(tmp_entry);
12094 			vm_map_lock(src_map);
12095 			/* keep an outstanding reference for all maps in */
12096 			/* the parents tree except the base map */
12097 			vm_map_reference(src_map);
12098 			vm_map_unlock(ptr->parent_map);
12099 			if (!vm_map_lookup_entry(
12100 				    src_map, src_start, &tmp_entry)) {
12101 				RETURN(KERN_INVALID_ADDRESS);
12102 			}
12103 			map_share = TRUE;
12104 			if (!tmp_entry->is_sub_map) {
12105 				vm_map_clip_start(src_map, tmp_entry, src_start);
12106 			}
12107 			src_entry = tmp_entry;
12108 		}
12109 		/* we are now in the lowest level submap... */
12110 
12111 		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
12112 		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
		/* This is not supported for now. In future */
12114 			/* we will need to detect the phys_contig   */
12115 			/* condition and then upgrade copy_slowly   */
12116 			/* to do physical copy from the device mem  */
12117 			/* based object. We can piggy-back off of   */
12118 			/* the was wired boolean to set-up the      */
12119 			/* proper handling */
12120 			RETURN(KERN_PROTECTION_FAILURE);
12121 		}
12122 		/*
12123 		 *	Create a new address map entry to hold the result.
12124 		 *	Fill in the fields from the appropriate source entries.
12125 		 *	We must unlock the source map to do this if we need
12126 		 *	to allocate a map entry.
12127 		 */
12128 		if (new_entry == VM_MAP_ENTRY_NULL) {
12129 			version.main_timestamp = src_map->timestamp;
12130 			vm_map_unlock(src_map);
12131 
12132 			new_entry = vm_map_copy_entry_create(copy);
12133 
12134 			vm_map_lock(src_map);
12135 			if ((version.main_timestamp + 1) != src_map->timestamp) {
12136 				if (!vm_map_lookup_entry(src_map, src_start,
12137 				    &tmp_entry)) {
12138 					RETURN(KERN_INVALID_ADDRESS);
12139 				}
12140 				if (!tmp_entry->is_sub_map) {
12141 					vm_map_clip_start(src_map, tmp_entry, src_start);
12142 				}
12143 				continue; /* restart w/ new tmp_entry */
12144 			}
12145 		}
12146 
12147 		/*
12148 		 *	Verify that the region can be read.
12149 		 */
12150 		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
12151 		    !use_maxprot) ||
12152 		    (src_entry->max_protection & VM_PROT_READ) == 0) {
12153 			RETURN(KERN_PROTECTION_FAILURE);
12154 		}
12155 
12156 		/*
12157 		 *	Clip against the endpoints of the entire region.
12158 		 */
12159 
12160 		vm_map_clip_end(src_map, src_entry, src_end);
12161 
12162 		src_size = src_entry->vme_end - src_start;
12163 		src_object = VME_OBJECT(src_entry);
12164 		src_offset = VME_OFFSET(src_entry);
12165 		was_wired = (src_entry->wired_count != 0);
12166 
12167 		vm_map_entry_copy(src_map, new_entry, src_entry);
12168 		if (new_entry->is_sub_map) {
12169 			/* clr address space specifics */
12170 			new_entry->use_pmap = FALSE;
12171 		} else {
12172 			/*
12173 			 * We're dealing with a copy-on-write operation,
12174 			 * so the resulting mapping should not inherit the
12175 			 * original mapping's accounting settings.
12176 			 * "iokit_acct" should have been cleared in
12177 			 * vm_map_entry_copy().
12178 			 * "use_pmap" should be reset to its default (TRUE)
12179 			 * so that the new mapping gets accounted for in
12180 			 * the task's memory footprint.
12181 			 */
12182 			assert(!new_entry->iokit_acct);
12183 			new_entry->use_pmap = TRUE;
12184 		}
12185 
12186 		/*
12187 		 *	Attempt non-blocking copy-on-write optimizations.
12188 		 */
12189 
12190 		/*
12191 		 * If we are destroying the source, and the object
12192 		 * is internal, we could move the object reference
12193 		 * from the source to the copy.  The copy is
12194 		 * copy-on-write only if the source is.
12195 		 * We make another reference to the object, because
12196 		 * destroying the source entry will deallocate it.
12197 		 *
12198 		 * This memory transfer has to be atomic, (to prevent
12199 		 * the VM object from being shared or copied while
12200 		 * it's being moved here), so we could only do this
12201 		 * if we won't have to unlock the VM map until the
12202 		 * original mapping has been fully removed.
12203 		 */
12204 
12205 RestartCopy:
12206 		if ((src_object == VM_OBJECT_NULL ||
12207 		    (!was_wired && !map_share && !tmp_entry->is_shared
12208 		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
12209 		    vm_object_copy_quickly(
12210 			    VME_OBJECT(new_entry),
12211 			    src_offset,
12212 			    src_size,
12213 			    &src_needs_copy,
12214 			    &new_entry_needs_copy)) {
12215 			new_entry->needs_copy = new_entry_needs_copy;
12216 
12217 			/*
12218 			 *	Handle copy-on-write obligations
12219 			 */
12220 
12221 			if (src_needs_copy && !tmp_entry->needs_copy) {
12222 				vm_prot_t prot;
12223 
12224 				prot = src_entry->protection & ~VM_PROT_WRITE;
12225 
12226 				if (override_nx(src_map, VME_ALIAS(src_entry))
12227 				    && prot) {
12228 					prot |= VM_PROT_EXECUTE;
12229 				}
12230 
12231 				vm_object_pmap_protect(
12232 					src_object,
12233 					src_offset,
12234 					src_size,
12235 					(src_entry->is_shared ?
12236 					PMAP_NULL
12237 					: src_map->pmap),
12238 					VM_MAP_PAGE_SIZE(src_map),
12239 					src_entry->vme_start,
12240 					prot);
12241 
12242 				assert(tmp_entry->wired_count == 0);
12243 				tmp_entry->needs_copy = TRUE;
12244 			}
12245 
12246 			/*
12247 			 *	The map has never been unlocked, so it's safe
12248 			 *	to move to the next entry rather than doing
12249 			 *	another lookup.
12250 			 */
12251 
12252 			goto CopySuccessful;
12253 		}
12254 
12255 		entry_was_shared = tmp_entry->is_shared;
12256 
12257 		/*
12258 		 *	Take an object reference, so that we may
12259 		 *	release the map lock(s).
12260 		 */
12261 
12262 		assert(src_object != VM_OBJECT_NULL);
12263 		vm_object_reference(src_object);
12264 
12265 		/*
12266 		 *	Record the timestamp for later verification.
12267 		 *	Unlock the map.
12268 		 */
12269 
12270 		version.main_timestamp = src_map->timestamp;
12271 		vm_map_unlock(src_map); /* Increments timestamp once! */
12272 		saved_src_entry = src_entry;
12273 		tmp_entry = VM_MAP_ENTRY_NULL;
12274 		src_entry = VM_MAP_ENTRY_NULL;
12275 
12276 		/*
12277 		 *	Perform the copy
12278 		 */
12279 
12280 		if (was_wired ||
12281 		    (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
12282 		    !(flags & VM_MAP_COPYIN_FORK)) ||
12283 		    (debug4k_no_cow_copyin &&
12284 		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12285 CopySlowly:
12286 			vm_object_lock(src_object);
12287 			result = vm_object_copy_slowly(
12288 				src_object,
12289 				src_offset,
12290 				src_size,
12291 				THREAD_UNINT,
12292 				&new_copy_object);
12293 			/* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12294 			saved_used_for_jit = new_entry->used_for_jit;
12295 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12296 			new_entry->used_for_jit = saved_used_for_jit;
12297 			VME_OFFSET_SET(new_entry,
12298 			    src_offset - vm_object_trunc_page(src_offset));
12299 			new_entry->needs_copy = FALSE;
12300 		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12301 		    (entry_was_shared || map_share)) {
12302 			vm_object_t new_object;
12303 
12304 			vm_object_lock_shared(src_object);
12305 			new_object = vm_object_copy_delayed(
12306 				src_object,
12307 				src_offset,
12308 				src_size,
12309 				TRUE);
12310 			if (new_object == VM_OBJECT_NULL) {
12311 				goto CopySlowly;
12312 			}
12313 
12314 			VME_OBJECT_SET(new_entry, new_object, false, 0);
12315 			assert(new_entry->wired_count == 0);
12316 			new_entry->needs_copy = TRUE;
12317 			assert(!new_entry->iokit_acct);
12318 			assert(new_object->purgable == VM_PURGABLE_DENY);
12319 			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12320 			result = KERN_SUCCESS;
12321 		} else {
12322 			vm_object_offset_t new_offset;
12323 			new_offset = VME_OFFSET(new_entry);
12324 			result = vm_object_copy_strategically(src_object,
12325 			    src_offset,
12326 			    src_size,
12327 			    (flags & VM_MAP_COPYIN_FORK),
12328 			    &new_copy_object,
12329 			    &new_offset,
12330 			    &new_entry_needs_copy);
12331 			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12332 			saved_used_for_jit = new_entry->used_for_jit;
12333 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12334 			new_entry->used_for_jit = saved_used_for_jit;
12335 			if (new_offset != VME_OFFSET(new_entry)) {
12336 				VME_OFFSET_SET(new_entry, new_offset);
12337 			}
12338 
12339 			new_entry->needs_copy = new_entry_needs_copy;
12340 		}
12341 
12342 		if (result == KERN_SUCCESS &&
12343 		    ((preserve_purgeable &&
12344 		    src_object->purgable != VM_PURGABLE_DENY) ||
12345 		    new_entry->used_for_jit)) {
12346 			/*
12347 			 * Purgeable objects should be COPY_NONE, true share;
			 * this should be propagated to the copy.
12349 			 *
12350 			 * Also force mappings the pmap specially protects to
12351 			 * be COPY_NONE; trying to COW these mappings would
12352 			 * change the effective protections, which could have
12353 			 * side effects if the pmap layer relies on the
12354 			 * specified protections.
12355 			 */
12356 
12357 			vm_object_t     new_object;
12358 
12359 			new_object = VME_OBJECT(new_entry);
12360 			assert(new_object != src_object);
12361 			vm_object_lock(new_object);
12362 			assert(new_object->ref_count == 1);
12363 			assert(new_object->shadow == VM_OBJECT_NULL);
12364 			assert(new_object->vo_copy == VM_OBJECT_NULL);
12365 			assert(new_object->vo_owner == NULL);
12366 
12367 			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12368 
12369 			if (preserve_purgeable &&
12370 			    src_object->purgable != VM_PURGABLE_DENY) {
12371 				new_object->true_share = TRUE;
12372 
12373 				/* start as non-volatile with no owner... */
12374 				new_object->purgable = VM_PURGABLE_NONVOLATILE;
12375 				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12376 				/* ... and move to src_object's purgeable state */
12377 				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12378 					int state;
12379 					state = src_object->purgable;
12380 					vm_object_purgable_control(
12381 						new_object,
12382 						VM_PURGABLE_SET_STATE_FROM_KERNEL,
12383 						&state);
12384 				}
12385 				/* no pmap accounting for purgeable objects */
12386 				new_entry->use_pmap = FALSE;
12387 			}
12388 
12389 			vm_object_unlock(new_object);
12390 			new_object = VM_OBJECT_NULL;
12391 		}
12392 
12393 		if (result != KERN_SUCCESS &&
12394 		    result != KERN_MEMORY_RESTART_COPY) {
12395 			vm_map_lock(src_map);
12396 			RETURN(result);
12397 		}
12398 
12399 		/*
12400 		 *	Throw away the extra reference
12401 		 */
12402 
12403 		vm_object_deallocate(src_object);
12404 
12405 		/*
12406 		 *	Verify that the map has not substantially
12407 		 *	changed while the copy was being made.
12408 		 */
12409 
12410 		vm_map_lock(src_map);
12411 
12412 		if ((version.main_timestamp + 1) == src_map->timestamp) {
12413 			/* src_map hasn't changed: src_entry is still valid */
12414 			src_entry = saved_src_entry;
12415 			goto VerificationSuccessful;
12416 		}
12417 
12418 		/*
12419 		 *	Simple version comparison failed.
12420 		 *
12421 		 *	Retry the lookup and verify that the
12422 		 *	same object/offset are still present.
12423 		 *
12424 		 *	[Note: a memory manager that colludes with
12425 		 *	the calling task can detect that we have
12426 		 *	cheated.  While the map was unlocked, the
12427 		 *	mapping could have been changed and restored.]
12428 		 */
12429 
12430 		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12431 			if (result != KERN_MEMORY_RESTART_COPY) {
12432 				vm_object_deallocate(VME_OBJECT(new_entry));
12433 				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12434 				/* reset accounting state */
12435 				new_entry->iokit_acct = FALSE;
12436 				new_entry->use_pmap = TRUE;
12437 			}
12438 			RETURN(KERN_INVALID_ADDRESS);
12439 		}
12440 
12441 		src_entry = tmp_entry;
12442 		vm_map_clip_start(src_map, src_entry, src_start);
12443 
12444 		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12445 		    !use_maxprot) ||
12446 		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12447 			goto VerificationFailed;
12448 		}
12449 
12450 		if (src_entry->vme_end < new_entry->vme_end) {
12451 			/*
12452 			 * This entry might have been shortened
12453 			 * (vm_map_clip_end) or been replaced with
12454 			 * an entry that ends closer to "src_start"
12455 			 * than before.
12456 			 * Adjust "new_entry" accordingly; copying
12457 			 * less memory would be correct but we also
12458 			 * redo the copy (see below) if the new entry
12459 			 * no longer points at the same object/offset.
12460 			 */
12461 			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12462 			    VM_MAP_COPY_PAGE_MASK(copy)));
12463 			new_entry->vme_end = src_entry->vme_end;
12464 			src_size = new_entry->vme_end - src_start;
12465 		} else if (src_entry->vme_end > new_entry->vme_end) {
12466 			/*
12467 			 * This entry might have been extended
12468 			 * (vm_map_entry_simplify() or coalesce)
12469 			 * or been replaced with an entry that ends farther
12470 			 * from "src_start" than before.
12471 			 *
12472 			 * We've called vm_object_copy_*() only on
12473 			 * the previous <start:end> range, so we can't
12474 			 * just extend new_entry.  We have to re-do
12475 			 * the copy based on the new entry as if it was
12476 			 * pointing at a different object/offset (see
12477 			 * "Verification failed" below).
12478 			 */
12479 		}
12480 
12481 		if ((VME_OBJECT(src_entry) != src_object) ||
12482 		    (VME_OFFSET(src_entry) != src_offset) ||
12483 		    (src_entry->vme_end > new_entry->vme_end)) {
12484 			/*
12485 			 *	Verification failed.
12486 			 *
12487 			 *	Start over with this top-level entry.
12488 			 */
12489 
12490 VerificationFailed:     ;
12491 
12492 			vm_object_deallocate(VME_OBJECT(new_entry));
12493 			tmp_entry = src_entry;
12494 			continue;
12495 		}
12496 
12497 		/*
12498 		 *	Verification succeeded.
12499 		 */
12500 
12501 VerificationSuccessful:;
12502 
12503 		if (result == KERN_MEMORY_RESTART_COPY) {
12504 			goto RestartCopy;
12505 		}
12506 
12507 		/*
12508 		 *	Copy succeeded.
12509 		 */
12510 
12511 CopySuccessful: ;
12512 
12513 		/*
12514 		 *	Link in the new copy entry.
12515 		 */
12516 
12517 		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12518 		    new_entry);
12519 
12520 		/*
12521 		 *	Determine whether the entire region
12522 		 *	has been copied.
12523 		 */
12524 		src_base = src_start;
12525 		src_start = new_entry->vme_end;
12526 		new_entry = VM_MAP_ENTRY_NULL;
12527 		while ((src_start >= src_end) && (src_end != 0)) {
12528 			submap_map_t    *ptr;
12529 
12530 			if (src_map == base_map) {
12531 				/* back to the top */
12532 				break;
12533 			}
12534 
12535 			ptr = parent_maps;
12536 			assert(ptr != NULL);
12537 			parent_maps = parent_maps->next;
12538 
12539 			/* fix up the damage we did in that submap */
12540 			vm_map_simplify_range(src_map,
12541 			    src_base,
12542 			    src_end);
12543 
12544 			vm_map_unlock(src_map);
12545 			vm_map_deallocate(src_map);
12546 			vm_map_lock(ptr->parent_map);
12547 			src_map = ptr->parent_map;
12548 			src_base = ptr->base_start;
12549 			src_start = ptr->base_start + ptr->base_len;
12550 			src_end = ptr->base_end;
12551 			if (!vm_map_lookup_entry(src_map,
12552 			    src_start,
12553 			    &tmp_entry) &&
12554 			    (src_end > src_start)) {
12555 				RETURN(KERN_INVALID_ADDRESS);
12556 			}
12557 			kfree_type(submap_map_t, ptr);
12558 			if (parent_maps == NULL) {
12559 				map_share = FALSE;
12560 			}
12561 			src_entry = tmp_entry->vme_prev;
12562 		}
12563 
12564 		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12565 		    (src_start >= src_addr + len) &&
12566 		    (src_addr + len != 0)) {
12567 			/*
12568 			 * Stop copying now, even though we haven't reached
12569 			 * "src_end".  We'll adjust the end of the last copy
12570 			 * entry at the end, if needed.
12571 			 *
12572 			 * If src_map's aligment is different from the
12573 			 * system's page-alignment, there could be
12574 			 * extra non-map-aligned map entries between
12575 			 * the original (non-rounded) "src_addr + len"
12576 			 * and the rounded "src_end".
12577 			 * We do not want to copy those map entries since
12578 			 * they're not part of the copied range.
12579 			 */
12580 			break;
12581 		}
12582 
12583 		if ((src_start >= src_end) && (src_end != 0)) {
12584 			break;
12585 		}
12586 
12587 		/*
12588 		 *	Verify that there are no gaps in the region
12589 		 */
12590 
12591 		tmp_entry = src_entry->vme_next;
12592 		if ((tmp_entry->vme_start != src_start) ||
12593 		    (tmp_entry == vm_map_to_entry(src_map))) {
12594 			RETURN(KERN_INVALID_ADDRESS);
12595 		}
12596 	}
12597 
12598 	/*
12599 	 * If the source should be destroyed, do it now, since the
12600 	 * copy was successful.
12601 	 */
12602 	if (src_destroy) {
12603 		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12604 
12605 		if (src_map == kernel_map) {
12606 			remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12607 		}
12608 		(void)vm_map_remove_and_unlock(src_map,
12609 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
12610 		    src_end,
12611 		    remove_flags,
12612 		    KMEM_GUARD_NONE);
12613 	} else {
12614 		/* fix up the damage we did in the base map */
12615 		vm_map_simplify_range(
12616 			src_map,
12617 			vm_map_trunc_page(src_addr,
12618 			VM_MAP_PAGE_MASK(src_map)),
12619 			vm_map_round_page(src_end,
12620 			VM_MAP_PAGE_MASK(src_map)));
12621 		vm_map_unlock(src_map);
12622 	}
12623 
12624 	tmp_entry = VM_MAP_ENTRY_NULL;
12625 
12626 	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12627 	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12628 		vm_map_offset_t original_start, original_offset, original_end;
12629 
12630 		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12631 
12632 		/* adjust alignment of first copy_entry's "vme_start" */
12633 		tmp_entry = vm_map_copy_first_entry(copy);
12634 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12635 			vm_map_offset_t adjustment;
12636 
12637 			original_start = tmp_entry->vme_start;
12638 			original_offset = VME_OFFSET(tmp_entry);
12639 
12640 			/* map-align the start of the first copy entry... */
12641 			adjustment = (tmp_entry->vme_start -
12642 			    vm_map_trunc_page(
12643 				    tmp_entry->vme_start,
12644 				    VM_MAP_PAGE_MASK(src_map)));
12645 			tmp_entry->vme_start -= adjustment;
12646 			VME_OFFSET_SET(tmp_entry,
12647 			    VME_OFFSET(tmp_entry) - adjustment);
12648 			copy_addr -= adjustment;
12649 			assert(tmp_entry->vme_start < tmp_entry->vme_end);
12650 			/* ... adjust for mis-aligned start of copy range */
12651 			adjustment =
12652 			    (vm_map_trunc_page(copy->offset,
12653 			    PAGE_MASK) -
12654 			    vm_map_trunc_page(copy->offset,
12655 			    VM_MAP_PAGE_MASK(src_map)));
12656 			if (adjustment) {
12657 				assert(page_aligned(adjustment));
12658 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12659 				tmp_entry->vme_start += adjustment;
12660 				VME_OFFSET_SET(tmp_entry,
12661 				    (VME_OFFSET(tmp_entry) +
12662 				    adjustment));
12663 				copy_addr += adjustment;
12664 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12665 			}
12666 
12667 			/*
12668 			 * Assert that the adjustments haven't exposed
12669 			 * more than was originally copied...
12670 			 */
12671 			assert(tmp_entry->vme_start >= original_start);
12672 			assert(VME_OFFSET(tmp_entry) >= original_offset);
12673 			/*
12674 			 * ... and that it did not adjust outside of a
12675 			 * a single 16K page.
12676 			 */
12677 			assert(vm_map_trunc_page(tmp_entry->vme_start,
12678 			    VM_MAP_PAGE_MASK(src_map)) ==
12679 			    vm_map_trunc_page(original_start,
12680 			    VM_MAP_PAGE_MASK(src_map)));
12681 		}
12682 
12683 		/* adjust alignment of last copy_entry's "vme_end" */
12684 		tmp_entry = vm_map_copy_last_entry(copy);
12685 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12686 			vm_map_offset_t adjustment;
12687 
12688 			original_end = tmp_entry->vme_end;
12689 
12690 			/* map-align the end of the last copy entry... */
12691 			tmp_entry->vme_end =
12692 			    vm_map_round_page(tmp_entry->vme_end,
12693 			    VM_MAP_PAGE_MASK(src_map));
12694 			/* ... adjust for mis-aligned end of copy range */
12695 			adjustment =
12696 			    (vm_map_round_page((copy->offset +
12697 			    copy->size),
12698 			    VM_MAP_PAGE_MASK(src_map)) -
12699 			    vm_map_round_page((copy->offset +
12700 			    copy->size),
12701 			    PAGE_MASK));
12702 			if (adjustment) {
12703 				assert(page_aligned(adjustment));
12704 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12705 				tmp_entry->vme_end -= adjustment;
12706 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12707 			}
12708 
12709 			/*
12710 			 * Assert that the adjustments haven't exposed
12711 			 * more than was originally copied...
12712 			 */
12713 			assert(tmp_entry->vme_end <= original_end);
12714 			/*
12715 			 * ... and that it did not adjust outside of a
12716 			 * a single 16K page.
12717 			 */
12718 			assert(vm_map_round_page(tmp_entry->vme_end,
12719 			    VM_MAP_PAGE_MASK(src_map)) ==
12720 			    vm_map_round_page(original_end,
12721 			    VM_MAP_PAGE_MASK(src_map)));
12722 		}
12723 	}
12724 
12725 	/* Fix-up start and end points in copy.  This is necessary */
12726 	/* when the various entries in the copy object were picked */
12727 	/* up from different sub-maps */
12728 
12729 	tmp_entry = vm_map_copy_first_entry(copy);
12730 	copy_size = 0; /* compute actual size */
12731 	while (tmp_entry != vm_map_copy_to_entry(copy)) {
12732 		assert(VM_MAP_PAGE_ALIGNED(
12733 			    copy_addr + (tmp_entry->vme_end -
12734 			    tmp_entry->vme_start),
12735 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12736 		assert(VM_MAP_PAGE_ALIGNED(
12737 			    copy_addr,
12738 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12739 
12740 		/*
12741 		 * The copy_entries will be injected directly into the
12742 		 * destination map and might not be "map aligned" there...
12743 		 */
12744 		tmp_entry->map_aligned = FALSE;
12745 
12746 		tmp_entry->vme_end = copy_addr +
12747 		    (tmp_entry->vme_end - tmp_entry->vme_start);
12748 		tmp_entry->vme_start = copy_addr;
12749 		assert(tmp_entry->vme_start < tmp_entry->vme_end);
12750 		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12751 		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12752 		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12753 	}
12754 
12755 	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12756 	    copy_size < copy->size) {
12757 		/*
12758 		 * The actual size of the VM map copy is smaller than what
12759 		 * was requested by the caller.  This must be because some
12760 		 * PAGE_SIZE-sized pages are missing at the end of the last
12761 		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12762 		 * The caller might not have been aware of those missing
12763 		 * pages and might not want to be aware of it, which is
12764 		 * fine as long as they don't try to access (and crash on)
12765 		 * those missing pages.
12766 		 * Let's adjust the size of the "copy", to avoid failing
12767 		 * in vm_map_copyout() or vm_map_copy_overwrite().
12768 		 */
12769 		assert(vm_map_round_page(copy_size,
12770 		    VM_MAP_PAGE_MASK(src_map)) ==
12771 		    vm_map_round_page(copy->size,
12772 		    VM_MAP_PAGE_MASK(src_map)));
12773 		copy->size = copy_size;
12774 	}
12775 
12776 	*copy_result = copy;
12777 	return KERN_SUCCESS;
12778 
12779 #undef  RETURN
12780 }
12781 
12782 kern_return_t
vm_map_copy_extract(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t do_copy,vm_map_copy_t * copy_result,vm_prot_t * cur_prot,vm_prot_t * max_prot,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)12783 vm_map_copy_extract(
12784 	vm_map_t                src_map,
12785 	vm_map_address_t        src_addr,
12786 	vm_map_size_t           len,
12787 	boolean_t               do_copy,
12788 	vm_map_copy_t           *copy_result,   /* OUT */
12789 	vm_prot_t               *cur_prot,      /* IN/OUT */
12790 	vm_prot_t               *max_prot,      /* IN/OUT */
12791 	vm_inherit_t            inheritance,
12792 	vm_map_kernel_flags_t   vmk_flags)
12793 {
12794 	vm_map_copy_t   copy;
12795 	kern_return_t   kr;
12796 	vm_prot_t required_cur_prot, required_max_prot;
12797 
12798 	/*
12799 	 *	Check for copies of zero bytes.
12800 	 */
12801 
12802 	if (len == 0) {
12803 		*copy_result = VM_MAP_COPY_NULL;
12804 		return KERN_SUCCESS;
12805 	}
12806 
12807 	/*
12808 	 *	Check that the end address doesn't overflow
12809 	 */
12810 	if (src_addr + len < src_addr) {
12811 		return KERN_INVALID_ADDRESS;
12812 	}
12813 	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12814 		return KERN_INVALID_ADDRESS;
12815 	}
12816 
12817 	if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12818 		DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12819 	}
12820 
12821 	required_cur_prot = *cur_prot;
12822 	required_max_prot = *max_prot;
12823 
12824 	/*
12825 	 *	Allocate a header element for the list.
12826 	 *
12827 	 *	Use the start and end in the header to
12828 	 *	remember the endpoints prior to rounding.
12829 	 */
12830 
12831 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12832 	copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12833 	copy->offset = 0;
12834 	copy->size = len;
12835 
12836 	kr = vm_map_remap_extract(src_map,
12837 	    src_addr,
12838 	    len,
12839 	    do_copy,             /* copy */
12840 	    copy,
12841 	    cur_prot,            /* IN/OUT */
12842 	    max_prot,            /* IN/OUT */
12843 	    inheritance,
12844 	    vmk_flags);
12845 	if (kr != KERN_SUCCESS) {
12846 		vm_map_copy_discard(copy);
12847 		return kr;
12848 	}
12849 	if (required_cur_prot != VM_PROT_NONE) {
12850 		assert((*cur_prot & required_cur_prot) == required_cur_prot);
12851 		assert((*max_prot & required_max_prot) == required_max_prot);
12852 	}
12853 
12854 	*copy_result = copy;
12855 	return KERN_SUCCESS;
12856 }
12857 
/*
 *	vm_map_fork_share:
 *
 *	Handle one shared map entry during vm_map_fork(): clone
 *	"old_entry" from "old_map" into "new_map" so that both maps
 *	reference the same backing object (or submap), creating a
 *	shadow object first when sharing would break an existing or
 *	future symmetric copy-on-write arrangement.  Both the old and
 *	new entries are marked "is_shared" on return.
 *
 *	Both "old_map" and "new_map" are locked by the caller
 *	(vm_map_fork() takes both locks before walking the entries).
 */
static void
vm_map_fork_share(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_object_t     object;
	vm_map_entry_t  new_entry;

	/*
	 *	New sharing code.  New map entry
	 *	references original object.  Internal
	 *	objects use asynchronous copy algorithm for
	 *	future copies.  First make sure we have
	 *	the right object.  If we need a shadow,
	 *	or someone else already has one, then
	 *	make a new shadow and share it.
	 */

	/*
	 * NOTE: "object" is only initialized for non-submap entries;
	 * every later read of it is guarded by !is_sub_map.
	 */
	if (!old_entry->is_sub_map) {
		object = VME_OBJECT(old_entry);
	}

	if (old_entry->is_sub_map) {
		assert(old_entry->wired_count == 0);
#ifndef NO_NESTED_PMAP
#if !PMAP_FORK_NEST
		/* nest the submap's pmap into the child's pmap */
		if (old_entry->use_pmap) {
			kern_return_t   result;

			result = pmap_nest(new_map->pmap,
			    (VME_SUBMAP(old_entry))->pmap,
			    (addr64_t)old_entry->vme_start,
			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
			if (result) {
				panic("vm_map_fork_share: pmap_nest failed!");
			}
		}
#endif /* !PMAP_FORK_NEST */
#endif  /* NO_NESTED_PMAP */
	} else if (object == VM_OBJECT_NULL) {
		/* no backing object yet: create one covering the entry */
		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
		    old_entry->vme_start));
		VME_OFFSET_SET(old_entry, 0);
		VME_OBJECT_SET(old_entry, object, false, 0);
		old_entry->use_pmap = TRUE;
//		assert(!old_entry->needs_copy);
	} else if (object->copy_strategy !=
	    MEMORY_OBJECT_COPY_SYMMETRIC) {
		/*
		 *	We are already using an asymmetric
		 *	copy, and therefore we already have
		 *	the right object.
		 */

		assert(!old_entry->needs_copy);
	} else if (old_entry->needs_copy ||       /* case 1 */
	    object->shadowed ||                 /* case 2 */
	    (!object->true_share &&             /* case 3 */
	    !old_entry->is_shared &&
	    (object->vo_size >
	    (vm_map_size_t)(old_entry->vme_end -
	    old_entry->vme_start)))) {
		bool is_writable;

		/*
		 *	We need to create a shadow.
		 *	There are three cases here.
		 *	In the first case, we need to
		 *	complete a deferred symmetrical
		 *	copy that we participated in.
		 *	In the second and third cases,
		 *	we need to create the shadow so
		 *	that changes that we make to the
		 *	object do not interfere with
		 *	any symmetrical copies which
		 *	have occured (case 2) or which
		 *	might occur (case 3).
		 *
		 *	The first case is when we had
		 *	deferred shadow object creation
		 *	via the entry->needs_copy mechanism.
		 *	This mechanism only works when
		 *	only one entry points to the source
		 *	object, and we are about to create
		 *	a second entry pointing to the
		 *	same object. The problem is that
		 *	there is no way of mapping from
		 *	an object to the entries pointing
		 *	to it. (Deferred shadow creation
		 *	works with one entry because occurs
		 *	at fault time, and we walk from the
		 *	entry to the object when handling
		 *	the fault.)
		 *
		 *	The second case is when the object
		 *	to be shared has already been copied
		 *	with a symmetric copy, but we point
		 *	directly to the object without
		 *	needs_copy set in our entry. (This
		 *	can happen because different ranges
		 *	of an object can be pointed to by
		 *	different entries. In particular,
		 *	a single entry pointing to an object
		 *	can be split by a call to vm_inherit,
		 *	which, combined with task_create, can
		 *	result in the different entries
		 *	having different needs_copy values.)
		 *	The shadowed flag in the object allows
		 *	us to detect this case. The problem
		 *	with this case is that if this object
		 *	has or will have shadows, then we
		 *	must not perform an asymmetric copy
		 *	of this object, since such a copy
		 *	allows the object to be changed, which
		 *	will break the previous symmetrical
		 *	copies (which rely upon the object
		 *	not changing). In a sense, the shadowed
		 *	flag says "don't change this object".
		 *	We fix this by creating a shadow
		 *	object for this object, and sharing
		 *	that. This works because we are free
		 *	to change the shadow object (and thus
		 *	to use an asymmetric copy strategy);
		 *	this is also semantically correct,
		 *	since this object is temporary, and
		 *	therefore a copy of the object is
		 *	as good as the object itself. (This
		 *	is not true for permanent objects,
		 *	since the pager needs to see changes,
		 *	which won't happen if the changes
		 *	are made to a copy.)
		 *
		 *	The third case is when the object
		 *	to be shared has parts sticking
		 *	outside of the entry we're working
		 *	with, and thus may in the future
		 *	be subject to a symmetrical copy.
		 *	(This is a preemptive version of
		 *	case 2.)
		 */
		VME_OBJECT_SHADOW(old_entry,
		    (vm_map_size_t) (old_entry->vme_end -
		    old_entry->vme_start),
		    vm_map_always_shadow(old_map));

		/*
		 *	If we're making a shadow for other than
		 *	copy on write reasons, then we have
		 *	to remove write permission.
		 */

		is_writable = false;
		if (old_entry->protection & VM_PROT_WRITE) {
			is_writable = true;
#if __arm64e__
		} else if (old_entry->used_for_tpro) {
			/* TPRO mappings are writable through a separate
			 * mechanism even without VM_PROT_WRITE, so they
			 * must be downgraded too. */
			is_writable = true;
#endif /* __arm64e__ */
		}
		if (!old_entry->needs_copy && is_writable) {
			vm_prot_t prot;

			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));

			prot = old_entry->protection & ~VM_PROT_WRITE;

			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));

			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}


			if (old_map->mapped_in_other_pmaps) {
				/* downgrade every pmap mapping of the object */
				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					PMAP_NULL,
					PAGE_SIZE,
					old_entry->vme_start,
					prot);
			} else {
				pmap_protect(old_map->pmap,
				    old_entry->vme_start,
				    old_entry->vme_end,
				    prot);
			}
		}

		old_entry->needs_copy = FALSE;
		object = VME_OBJECT(old_entry);
	}


	/*
	 *	If object was using a symmetric copy strategy,
	 *	change its copy strategy to the default
	 *	asymmetric copy strategy, which is copy_delay
	 *	in the non-norma case and copy_call in the
	 *	norma case. Bump the reference count for the
	 *	new entry.
	 */

	if (old_entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(old_entry));
	} else {
		vm_object_lock(object);
		vm_object_reference_locked(object);
		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
		vm_object_unlock(object);
	}

	/*
	 *	Clone the entry, using object ref from above.
	 *	Mark both entries as shared.
	 */

	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
	vm_map_entry_copy(old_map, new_entry, old_entry);
	old_entry->is_shared = TRUE;
	new_entry->is_shared = TRUE;

	/*
	 * We're dealing with a shared mapping, so the resulting mapping
	 * should inherit some of the original mapping's accounting settings.
	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
	 * "use_pmap" should stay the same as before (if it hasn't been reset
	 * to TRUE when we cleared "iokit_acct").
	 */
	assert(!new_entry->iokit_acct);

	/*
	 *	If old entry's inheritence is VM_INHERIT_NONE,
	 *	the new entry is for corpse fork, remove the
	 *	write permission from the new entry.
	 */
	if (old_entry->inheritance == VM_INHERIT_NONE) {
		new_entry->protection &= ~VM_PROT_WRITE;
		new_entry->max_protection &= ~VM_PROT_WRITE;
	}

	/*
	 *	Insert the entry into the new map -- we
	 *	know we're inserting at the end of the new
	 *	map.
	 */

	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);

	/*
	 *	Update the physical map
	 */

	if (old_entry->is_sub_map) {
		/* Bill Angell pmap support goes here */
	} else {
		/* pre-populate the child's pmap with the parent's mappings */
		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
		    old_entry->vme_end - old_entry->vme_start,
		    old_entry->vme_start);
	}
}
13125 
/*
 *	vm_map_fork_copy:
 *
 *	Handle one copy-inherited map entry during vm_map_fork(): copy
 *	the entry's range out of "old_map" via vm_map_copyin_internal()
 *	(using maximum protections, since we care about whether the
 *	memory can ever be accessed) and insert the resulting copy into
 *	"new_map" after its current last entry.
 *
 *	"old_map" must be locked on entry; it is unlocked around the
 *	copyin and re-locked before returning.  On return, *old_entry_p
 *	points at the entry where the caller should resume traversal.
 *
 *	Returns TRUE if the copy was made and inserted, FALSE if the
 *	copyin failed (the caller skips this region and continues).
 */
static boolean_t
vm_map_fork_copy(
	vm_map_t        old_map,
	vm_map_entry_t  *old_entry_p,
	vm_map_t        new_map,
	int             vm_map_copyin_flags)
{
	vm_map_entry_t old_entry = *old_entry_p;
	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
	vm_map_offset_t start = old_entry->vme_start;
	vm_map_copy_t copy;
	/* remember the insertion point before dropping the lock */
	vm_map_entry_t last = vm_map_last_entry(new_map);

	vm_map_unlock(old_map);
	/*
	 *	Use maxprot version of copyin because we
	 *	care about whether this memory can ever
	 *	be accessed, not just whether it's accessible
	 *	right now.
	 */
	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
	if (vm_map_copyin_internal(old_map, start, entry_size,
	    vm_map_copyin_flags, &copy)
	    != KERN_SUCCESS) {
		/*
		 *	The map might have changed while it
		 *	was unlocked, check it again.  Skip
		 *	any blank space or permanently
		 *	unreadable region.
		 */
		vm_map_lock(old_map);
		if (!vm_map_lookup_entry(old_map, start, &last) ||
		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
			last = last->vme_next;
		}
		*old_entry_p = last;

		/*
		 * XXX	For some error returns, want to
		 * XXX	skip to the next element.  Note
		 *	that INVALID_ADDRESS and
		 *	PROTECTION_FAILURE are handled above.
		 */

		return FALSE;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 *	Insert the copy into the new map
	 */
	vm_map_copy_insert(new_map, last, copy);

	/*
	 *	Pick up the traversal at the end of
	 *	the copied region.
	 */

	vm_map_lock(old_map);
	start += entry_size;
	if (!vm_map_lookup_entry(old_map, start, &last)) {
		/* "start" is in a gap: resume at the next entry */
		last = last->vme_next;
	} else {
		if (last->vme_start == start) {
			/*
			 * No need to clip here and we don't
			 * want to cause any unnecessary
			 * unnesting...
			 */
		} else {
			vm_map_clip_start(old_map, last, start);
		}
	}
	*old_entry_p = last;

	return TRUE;
}
13208 
#if PMAP_FORK_NEST
#define PMAP_FORK_NEST_DEBUG 0
/*
 * Undo, in the child's pmap, the pre-nesting that overlaps the range
 * [start, end).  The intersection of [start, end) with the pre-nested
 * range [pre_nested_start, pre_nested_end) is expanded to the pmap's
 * minimum shared-region nesting granularity and unnested.  A no-op if
 * nothing was pre-nested or if the ranges do not intersect.
 */
static inline void
vm_map_fork_unnest(
	pmap_t new_pmap,
	vm_map_offset_t pre_nested_start,
	vm_map_offset_t pre_nested_end,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	kern_return_t kr;
	vm_map_offset_t align_mask, unnest_from, unnest_to;

	assertf(pre_nested_start <= pre_nested_end,
	    "pre_nested start 0x%llx end 0x%llx",
	    (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
	assertf(start <= end,
	    "start 0x%llx end 0x%llx",
	    (uint64_t) start, (uint64_t)end);

	/*
	 * Done if nothing was pre-nested, or if [start, end) lies
	 * entirely before or after the pre-nested range.
	 */
	if (pre_nested_start == pre_nested_end ||
	    end <= pre_nested_start ||
	    start >= pre_nested_end) {
		return;
	}

	/* clip the range to the pre-nested portion */
	if (start < pre_nested_start) {
		start = pre_nested_start;
	}
	if (end > pre_nested_end) {
		end = pre_nested_end;
	}

	/* widen to the pmap's nesting granularity before unnesting */
	align_mask = pmap_shared_region_size_min(new_pmap) - 1;
	unnest_from = start & ~align_mask;
	unnest_to = (end + align_mask) & ~align_mask;
	kr = pmap_unnest(new_pmap,
	    (addr64_t)unnest_from,
	    (uint64_t)(unnest_to - unnest_from));
#if PMAP_FORK_NEST_DEBUG
	printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)unnest_from, (uint64_t)unnest_to, kr);
#endif /* PMAP_FORK_NEST_DEBUG */
	assertf(kr == KERN_SUCCESS,
	    "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
	    (uint64_t)start, (uint64_t)end, new_pmap,
	    (uint64_t)unnest_from, (uint64_t)(unnest_to - unnest_from),
	    kr);
}
#endif /* PMAP_FORK_NEST */
13264 
13265 void
vm_map_inherit_limits(vm_map_t new_map,const struct _vm_map * old_map)13266 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13267 {
13268 	new_map->size_limit = old_map->size_limit;
13269 	new_map->data_limit = old_map->data_limit;
13270 	new_map->user_wire_limit = old_map->user_wire_limit;
13271 	new_map->reserved_regions = old_map->reserved_regions;
13272 }
13273 
13274 /*
13275  *	vm_map_fork:
13276  *
13277  *	Create and return a new map based on the old
13278  *	map, according to the inheritance values on the
13279  *	regions in that map and the options.
13280  *
13281  *	The source map must not be locked.
13282  */
13283 vm_map_t
vm_map_fork(ledger_t ledger,vm_map_t old_map,int options)13284 vm_map_fork(
13285 	ledger_t        ledger,
13286 	vm_map_t        old_map,
13287 	int             options)
13288 {
13289 	pmap_t          new_pmap;
13290 	vm_map_t        new_map;
13291 	vm_map_entry_t  old_entry;
13292 	vm_map_size_t   new_size = 0, entry_size;
13293 	vm_map_entry_t  new_entry;
13294 	boolean_t       src_needs_copy;
13295 	boolean_t       new_entry_needs_copy;
13296 	boolean_t       pmap_is64bit;
13297 	int             vm_map_copyin_flags;
13298 	vm_inherit_t    old_entry_inheritance;
13299 	int             map_create_options;
13300 	kern_return_t   footprint_collect_kr;
13301 
13302 	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13303 	    VM_MAP_FORK_PRESERVE_PURGEABLE |
13304 	    VM_MAP_FORK_CORPSE_FOOTPRINT)) {
13305 		/* unsupported option */
13306 		return VM_MAP_NULL;
13307 	}
13308 
13309 	pmap_is64bit =
13310 #if defined(__i386__) || defined(__x86_64__)
13311 	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13312 #elif defined(__arm64__)
13313 	    old_map->pmap->is_64bit;
13314 #else
13315 #error Unknown architecture.
13316 #endif
13317 
13318 	unsigned int pmap_flags = 0;
13319 	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13320 #if defined(HAS_APPLE_PAC)
13321 	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13322 #endif
13323 #if CONFIG_ROSETTA
13324 	pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13325 #endif
13326 #if PMAP_CREATE_FORCE_4K_PAGES
13327 	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13328 	    PAGE_SIZE != FOURK_PAGE_SIZE) {
13329 		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13330 	}
13331 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
13332 	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13333 	if (new_pmap == NULL) {
13334 		return VM_MAP_NULL;
13335 	}
13336 
13337 	vm_map_reference(old_map);
13338 	vm_map_lock(old_map);
13339 
13340 	map_create_options = 0;
13341 	if (old_map->hdr.entries_pageable) {
13342 		map_create_options |= VM_MAP_CREATE_PAGEABLE;
13343 	}
13344 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13345 		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13346 		footprint_collect_kr = KERN_SUCCESS;
13347 	}
13348 	new_map = vm_map_create_options(new_pmap,
13349 	    old_map->min_offset,
13350 	    old_map->max_offset,
13351 	    map_create_options);
13352 
13353 	/* inherit cs_enforcement */
13354 	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13355 
13356 	vm_map_lock(new_map);
13357 	vm_commit_pagezero_status(new_map);
13358 	/* inherit the parent map's page size */
13359 	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13360 
13361 	/* inherit the parent rlimits */
13362 	vm_map_inherit_limits(new_map, old_map);
13363 
13364 #if CONFIG_MAP_RANGES
13365 	/* inherit the parent map's VM ranges */
13366 	vm_map_range_fork(new_map, old_map);
13367 #endif
13368 
13369 #if CODE_SIGNING_MONITOR
13370 	/* Prepare the monitor for the fork */
13371 	csm_fork_prepare(old_map->pmap, new_pmap);
13372 #endif
13373 
13374 #if PMAP_FORK_NEST
13375 	/*
13376 	 * Pre-nest the shared region's pmap.
13377 	 */
13378 	vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13379 	pmap_fork_nest(old_map->pmap, new_pmap,
13380 	    &pre_nested_start, &pre_nested_end);
13381 #if PMAP_FORK_NEST_DEBUG
13382 	printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13383 #endif /* PMAP_FORK_NEST_DEBUG */
13384 #endif /* PMAP_FORK_NEST */
13385 
13386 	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13387 		/*
13388 		 * Abort any corpse collection if the system is shutting down.
13389 		 */
13390 		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13391 		    get_system_inshutdown()) {
13392 #if PMAP_FORK_NEST
13393 			new_entry = vm_map_last_entry(new_map);
13394 			if (new_entry == vm_map_to_entry(new_map)) {
13395 				/* unnest all that was pre-nested */
13396 				vm_map_fork_unnest(new_pmap,
13397 				    pre_nested_start, pre_nested_end,
13398 				    vm_map_min(new_map), vm_map_max(new_map));
13399 			} else if (new_entry->vme_end < vm_map_max(new_map)) {
13400 				/* unnest hole at the end, if pre-nested */
13401 				vm_map_fork_unnest(new_pmap,
13402 				    pre_nested_start, pre_nested_end,
13403 				    new_entry->vme_end, vm_map_max(new_map));
13404 			}
13405 #endif /* PMAP_FORK_NEST */
13406 			vm_map_corpse_footprint_collect_done(new_map);
13407 			vm_map_unlock(new_map);
13408 			vm_map_unlock(old_map);
13409 			vm_map_deallocate(new_map);
13410 			vm_map_deallocate(old_map);
13411 			printf("Aborting corpse map due to system shutdown\n");
13412 			return VM_MAP_NULL;
13413 		}
13414 
13415 		entry_size = old_entry->vme_end - old_entry->vme_start;
13416 
13417 #if PMAP_FORK_NEST
13418 		/*
13419 		 * Undo any unnecessary pre-nesting.
13420 		 */
13421 		vm_map_offset_t prev_end;
13422 		if (old_entry == vm_map_first_entry(old_map)) {
13423 			prev_end = vm_map_min(old_map);
13424 		} else {
13425 			prev_end = old_entry->vme_prev->vme_end;
13426 		}
13427 		if (prev_end < old_entry->vme_start) {
13428 			/* unnest hole before this entry, if pre-nested */
13429 			vm_map_fork_unnest(new_pmap,
13430 			    pre_nested_start, pre_nested_end,
13431 			    prev_end, old_entry->vme_start);
13432 		}
13433 		if (old_entry->is_sub_map && old_entry->use_pmap) {
13434 			/* keep this entry nested in the child */
13435 #if PMAP_FORK_NEST_DEBUG
13436 			printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13437 #endif /* PMAP_FORK_NEST_DEBUG */
13438 		} else {
13439 			/* undo nesting for this entry, if pre-nested */
13440 			vm_map_fork_unnest(new_pmap,
13441 			    pre_nested_start, pre_nested_end,
13442 			    old_entry->vme_start, old_entry->vme_end);
13443 		}
13444 #endif /* PMAP_FORK_NEST */
13445 
13446 		old_entry_inheritance = old_entry->inheritance;
13447 		/*
13448 		 * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option
13449 		 * share VM_INHERIT_NONE entries that are not backed by a
13450 		 * device pager.
13451 		 */
13452 		if (old_entry_inheritance == VM_INHERIT_NONE &&
13453 		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13454 		    (old_entry->protection & VM_PROT_READ) &&
13455 		    !(!old_entry->is_sub_map &&
13456 		    VME_OBJECT(old_entry) != NULL &&
13457 		    VME_OBJECT(old_entry)->pager != NULL &&
13458 		    is_device_pager_ops(
13459 			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13460 			old_entry_inheritance = VM_INHERIT_SHARE;
13461 		}
13462 
13463 		if (old_entry_inheritance != VM_INHERIT_NONE &&
13464 		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13465 		    footprint_collect_kr == KERN_SUCCESS) {
13466 			/*
13467 			 * The corpse won't have old_map->pmap to query
13468 			 * footprint information, so collect that data now
13469 			 * and store it in new_map->vmmap_corpse_footprint
13470 			 * for later autopsy.
13471 			 */
13472 			footprint_collect_kr =
13473 			    vm_map_corpse_footprint_collect(old_map,
13474 			    old_entry,
13475 			    new_map);
13476 		}
13477 
13478 		switch (old_entry_inheritance) {
13479 		case VM_INHERIT_NONE:
13480 			break;
13481 
13482 		case VM_INHERIT_SHARE:
13483 			vm_map_fork_share(old_map, old_entry, new_map);
13484 			new_size += entry_size;
13485 			break;
13486 
13487 		case VM_INHERIT_COPY:
13488 
13489 			/*
13490 			 *	Inline the copy_quickly case;
13491 			 *	upon failure, fall back on call
13492 			 *	to vm_map_fork_copy.
13493 			 */
13494 
13495 			if (old_entry->is_sub_map) {
13496 				break;
13497 			}
13498 			if ((old_entry->wired_count != 0) ||
13499 			    ((VME_OBJECT(old_entry) != NULL) &&
13500 			    (VME_OBJECT(old_entry)->true_share))) {
13501 				goto slow_vm_map_fork_copy;
13502 			}
13503 
13504 			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13505 			vm_map_entry_copy(old_map, new_entry, old_entry);
13506 			if (old_entry->vme_permanent) {
13507 				/* inherit "permanent" on fork() */
13508 				new_entry->vme_permanent = TRUE;
13509 			}
13510 
13511 			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13512 				new_map->jit_entry_exists = TRUE;
13513 			}
13514 
13515 			if (new_entry->is_sub_map) {
13516 				/* clear address space specifics */
13517 				new_entry->use_pmap = FALSE;
13518 			} else {
13519 				/*
13520 				 * We're dealing with a copy-on-write operation,
13521 				 * so the resulting mapping should not inherit
13522 				 * the original mapping's accounting settings.
13523 				 * "iokit_acct" should have been cleared in
13524 				 * vm_map_entry_copy().
13525 				 * "use_pmap" should be reset to its default
13526 				 * (TRUE) so that the new mapping gets
13527 				 * accounted for in the task's memory footprint.
13528 				 */
13529 				assert(!new_entry->iokit_acct);
13530 				new_entry->use_pmap = TRUE;
13531 			}
13532 
13533 			if (!vm_object_copy_quickly(
13534 				    VME_OBJECT(new_entry),
13535 				    VME_OFFSET(old_entry),
13536 				    (old_entry->vme_end -
13537 				    old_entry->vme_start),
13538 				    &src_needs_copy,
13539 				    &new_entry_needs_copy)) {
13540 				vm_map_entry_dispose(new_entry);
13541 				goto slow_vm_map_fork_copy;
13542 			}
13543 
13544 			/*
13545 			 *	Handle copy-on-write obligations
13546 			 */
13547 
13548 			if (src_needs_copy && !old_entry->needs_copy) {
13549 				vm_prot_t prot;
13550 
13551 				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));
13552 
13553 				prot = old_entry->protection & ~VM_PROT_WRITE;
13554 
13555 				if (override_nx(old_map, VME_ALIAS(old_entry))
13556 				    && prot) {
13557 					prot |= VM_PROT_EXECUTE;
13558 				}
13559 
13560 				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));
13561 
13562 				vm_object_pmap_protect(
13563 					VME_OBJECT(old_entry),
13564 					VME_OFFSET(old_entry),
13565 					(old_entry->vme_end -
13566 					old_entry->vme_start),
13567 					((old_entry->is_shared
13568 					|| old_map->mapped_in_other_pmaps)
13569 					? PMAP_NULL :
13570 					old_map->pmap),
13571 					VM_MAP_PAGE_SIZE(old_map),
13572 					old_entry->vme_start,
13573 					prot);
13574 
13575 				assert(old_entry->wired_count == 0);
13576 				old_entry->needs_copy = TRUE;
13577 			}
13578 			new_entry->needs_copy = new_entry_needs_copy;
13579 
13580 			/*
13581 			 *	Insert the entry at the end
13582 			 *	of the map.
13583 			 */
13584 
13585 			vm_map_store_entry_link(new_map,
13586 			    vm_map_last_entry(new_map),
13587 			    new_entry,
13588 			    VM_MAP_KERNEL_FLAGS_NONE);
13589 			new_size += entry_size;
13590 			break;
13591 
13592 slow_vm_map_fork_copy:
13593 			vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
13594 			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13595 				vm_map_copyin_flags |=
13596 				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13597 			}
13598 			if (vm_map_fork_copy(old_map,
13599 			    &old_entry,
13600 			    new_map,
13601 			    vm_map_copyin_flags)) {
13602 				new_size += entry_size;
13603 			}
13604 			continue;
13605 		}
13606 		old_entry = old_entry->vme_next;
13607 	}
13608 
13609 #if PMAP_FORK_NEST
13610 	new_entry = vm_map_last_entry(new_map);
13611 	if (new_entry == vm_map_to_entry(new_map)) {
13612 		/* unnest all that was pre-nested */
13613 		vm_map_fork_unnest(new_pmap,
13614 		    pre_nested_start, pre_nested_end,
13615 		    vm_map_min(new_map), vm_map_max(new_map));
13616 	} else if (new_entry->vme_end < vm_map_max(new_map)) {
13617 		/* unnest hole at the end, if pre-nested */
13618 		vm_map_fork_unnest(new_pmap,
13619 		    pre_nested_start, pre_nested_end,
13620 		    new_entry->vme_end, vm_map_max(new_map));
13621 	}
13622 #endif /* PMAP_FORK_NEST */
13623 
13624 #if defined(__arm64__)
13625 	pmap_insert_commpage(new_map->pmap);
13626 #endif /* __arm64__ */
13627 
13628 	new_map->size = new_size;
13629 
13630 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13631 		vm_map_corpse_footprint_collect_done(new_map);
13632 	}
13633 
13634 	/* Propagate JIT entitlement for the pmap layer. */
13635 	if (pmap_get_jit_entitled(old_map->pmap)) {
13636 		/* Tell the pmap that it supports JIT. */
13637 		pmap_set_jit_entitled(new_map->pmap);
13638 	}
13639 
13640 	/* Propagate TPRO settings for the pmap layer */
13641 	if (pmap_get_tpro(old_map->pmap)) {
13642 		/* Tell the pmap that it supports TPRO */
13643 		pmap_set_tpro(new_map->pmap);
13644 	}
13645 
13646 	vm_map_unlock(new_map);
13647 	vm_map_unlock(old_map);
13648 	vm_map_deallocate(old_map);
13649 
13650 	return new_map;
13651 }
13652 
13653 /*
13654  * vm_map_exec:
13655  *
13656  *      Setup the "new_map" with the proper execution environment according
13657  *	to the type of executable (platform, 64bit, chroot environment).
13658  *	Map the comm page and shared region, etc...
13659  */
13660 kern_return_t
vm_map_exec(vm_map_t new_map,task_t task,boolean_t is64bit,void * fsroot,cpu_type_t cpu,cpu_subtype_t cpu_subtype,boolean_t reslide,boolean_t is_driverkit,uint32_t rsr_version)13661 vm_map_exec(
13662 	vm_map_t        new_map,
13663 	task_t          task,
13664 	boolean_t       is64bit,
13665 	void            *fsroot,
13666 	cpu_type_t      cpu,
13667 	cpu_subtype_t   cpu_subtype,
13668 	boolean_t       reslide,
13669 	boolean_t       is_driverkit,
13670 	uint32_t        rsr_version)
13671 {
13672 	SHARED_REGION_TRACE_DEBUG(
13673 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13674 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13675 		(void *)VM_KERNEL_ADDRPERM(new_map),
13676 		(void *)VM_KERNEL_ADDRPERM(task),
13677 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13678 		cpu,
13679 		cpu_subtype));
13680 	(void) vm_commpage_enter(new_map, task, is64bit);
13681 
13682 	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13683 
13684 	SHARED_REGION_TRACE_DEBUG(
13685 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13686 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13687 		(void *)VM_KERNEL_ADDRPERM(new_map),
13688 		(void *)VM_KERNEL_ADDRPERM(task),
13689 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13690 		cpu,
13691 		cpu_subtype));
13692 
13693 	/*
13694 	 * Some devices have region(s) of memory that shouldn't get allocated by
13695 	 * user processes. The following code creates dummy vm_map_entry_t's for each
13696 	 * of the regions that needs to be reserved to prevent any allocations in
13697 	 * those regions.
13698 	 */
13699 	kern_return_t kr = KERN_FAILURE;
13700 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13701 	vmk_flags.vmkf_beyond_max = true;
13702 
13703 	const struct vm_reserved_region *regions = NULL;
13704 	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13705 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13706 
13707 	for (size_t i = 0; i < num_regions; ++i) {
13708 		vm_map_offset_t address = regions[i].vmrr_addr;
13709 
13710 		kr = vm_map_enter(
13711 			new_map,
13712 			&address,
13713 			regions[i].vmrr_size,
13714 			(vm_map_offset_t)0,
13715 			vmk_flags,
13716 			VM_OBJECT_NULL,
13717 			(vm_object_offset_t)0,
13718 			FALSE,
13719 			VM_PROT_NONE,
13720 			VM_PROT_NONE,
13721 			VM_INHERIT_COPY);
13722 
13723 		if (kr != KERN_SUCCESS) {
13724 			panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13725 		}
13726 	}
13727 
13728 	new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13729 
13730 	return KERN_SUCCESS;
13731 }
13732 
/*
 * Global statistics for vm_map_lookup_and_lock_object()'s submap
 * copy-on-write paths.  Updated without synchronization (diagnostic
 * counters only, so occasional lost increments are acceptable).
 */
/* vm_object_copy_slowly() path: invocations, bytes, largest single copy,
 * retries after the map changed underneath us, and hard failures. */
uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
/* vm_object_copy_strategically() path: same breakdown as above. */
uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
/* shadow-object (symmetric copy) path: cannot fail or restart, so only
 * invocation count, bytes, and largest single copy are tracked. */
uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13746 /*
13747  *	vm_map_lookup_and_lock_object:
13748  *
13749  *	Finds the VM object, offset, and
13750  *	protection for a given virtual address in the
13751  *	specified map, assuming a page fault of the
13752  *	type specified.
13753  *
13754  *	Returns the (object, offset, protection) for
13755  *	this address, whether it is wired down, and whether
13756  *	this map has the only reference to the data in question.
13757  *	In order to later verify this lookup, a "version"
13758  *	is returned.
13759  *	If contended != NULL, *contended will be set to
13760  *	true iff the thread had to spin or block to acquire
13761  *	an exclusive lock.
13762  *
13763  *	The map MUST be locked by the caller and WILL be
13764  *	locked on exit.  In order to guarantee the
13765  *	existence of the returned object, it is returned
13766  *	locked.
13767  *
13768  *	If a lookup is requested with "write protection"
13769  *	specified, the map may be changed to perform virtual
13770  *	copying operations, although the data referenced will
13771  *	remain the same.
13772  */
13773 kern_return_t
vm_map_lookup_and_lock_object(vm_map_t * var_map,vm_map_offset_t vaddr,vm_prot_t fault_type,int object_lock_type,vm_map_version_t * out_version,vm_object_t * object,vm_object_offset_t * offset,vm_prot_t * out_prot,boolean_t * wired,vm_object_fault_info_t fault_info,vm_map_t * real_map,bool * contended)13774 vm_map_lookup_and_lock_object(
13775 	vm_map_t                *var_map,       /* IN/OUT */
13776 	vm_map_offset_t         vaddr,
13777 	vm_prot_t               fault_type,
13778 	int                     object_lock_type,
13779 	vm_map_version_t        *out_version,   /* OUT */
13780 	vm_object_t             *object,        /* OUT */
13781 	vm_object_offset_t      *offset,        /* OUT */
13782 	vm_prot_t               *out_prot,      /* OUT */
13783 	boolean_t               *wired,         /* OUT */
13784 	vm_object_fault_info_t  fault_info,     /* OUT */
13785 	vm_map_t                *real_map,      /* OUT */
13786 	bool                    *contended)     /* OUT */
13787 {
13788 	vm_map_entry_t                  entry;
13789 	vm_map_t                        map = *var_map;
13790 	vm_map_t                        old_map = *var_map;
13791 	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
13792 	vm_map_offset_t                 cow_parent_vaddr = 0;
13793 	vm_map_offset_t                 old_start = 0;
13794 	vm_map_offset_t                 old_end = 0;
13795 	vm_prot_t                       prot;
13796 	boolean_t                       mask_protections;
13797 	boolean_t                       force_copy;
13798 	boolean_t                       no_force_copy_if_executable;
13799 	boolean_t                       submap_needed_copy;
13800 	vm_prot_t                       original_fault_type;
13801 	vm_map_size_t                   fault_page_mask;
13802 
13803 	/*
13804 	 * VM_PROT_MASK means that the caller wants us to use "fault_type"
13805 	 * as a mask against the mapping's actual protections, not as an
13806 	 * absolute value.
13807 	 */
13808 	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13809 	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13810 	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13811 	fault_type &= VM_PROT_ALL;
13812 	original_fault_type = fault_type;
13813 	if (contended) {
13814 		*contended = false;
13815 	}
13816 
13817 	*real_map = map;
13818 
13819 	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13820 	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13821 
13822 RetryLookup:
13823 	fault_type = original_fault_type;
13824 
13825 	/*
13826 	 *	If the map has an interesting hint, try it before calling
13827 	 *	full blown lookup routine.
13828 	 */
13829 	entry = map->hint;
13830 
13831 	if ((entry == vm_map_to_entry(map)) ||
13832 	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13833 		vm_map_entry_t  tmp_entry;
13834 
13835 		/*
13836 		 *	Entry was either not a valid hint, or the vaddr
13837 		 *	was not contained in the entry, so do a full lookup.
13838 		 */
13839 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13840 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13841 				vm_map_unlock(cow_sub_map_parent);
13842 			}
13843 			if ((*real_map != map)
13844 			    && (*real_map != cow_sub_map_parent)) {
13845 				vm_map_unlock(*real_map);
13846 			}
13847 			return KERN_INVALID_ADDRESS;
13848 		}
13849 
13850 		entry = tmp_entry;
13851 	}
13852 	if (map == old_map) {
13853 		old_start = entry->vme_start;
13854 		old_end = entry->vme_end;
13855 	}
13856 
13857 	/*
13858 	 *	Handle submaps.  Drop lock on upper map, submap is
13859 	 *	returned locked.
13860 	 */
13861 
13862 	submap_needed_copy = FALSE;
13863 submap_recurse:
13864 	if (entry->is_sub_map) {
13865 		vm_map_offset_t         local_vaddr;
13866 		vm_map_offset_t         end_delta;
13867 		vm_map_offset_t         start_delta;
13868 		vm_map_offset_t         top_entry_saved_start;
13869 		vm_object_offset_t      top_entry_saved_offset;
13870 		vm_map_entry_t          submap_entry, saved_submap_entry;
13871 		vm_object_offset_t      submap_entry_offset;
13872 		vm_object_size_t        submap_entry_size;
13873 		vm_prot_t               subentry_protection;
13874 		vm_prot_t               subentry_max_protection;
13875 		boolean_t               subentry_no_copy_on_read;
13876 		boolean_t               subentry_permanent;
13877 		boolean_t               subentry_csm_associated;
13878 #if __arm64e__
13879 		boolean_t               subentry_used_for_tpro;
13880 #endif /* __arm64e__ */
13881 		boolean_t               mapped_needs_copy = FALSE;
13882 		vm_map_version_t        version;
13883 
13884 		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13885 		    "map %p (%d) entry %p submap %p (%d)\n",
13886 		    map, VM_MAP_PAGE_SHIFT(map), entry,
13887 		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
13888 
13889 		local_vaddr = vaddr;
13890 		top_entry_saved_start = entry->vme_start;
13891 		top_entry_saved_offset = VME_OFFSET(entry);
13892 
13893 		if ((entry->use_pmap &&
13894 		    !((fault_type & VM_PROT_WRITE) ||
13895 		    force_copy))) {
13896 			/* if real_map equals map we unlock below */
13897 			if ((*real_map != map) &&
13898 			    (*real_map != cow_sub_map_parent)) {
13899 				vm_map_unlock(*real_map);
13900 			}
13901 			*real_map = VME_SUBMAP(entry);
13902 		}
13903 
13904 		if (entry->needs_copy &&
13905 		    ((fault_type & VM_PROT_WRITE) ||
13906 		    force_copy)) {
13907 			if (!mapped_needs_copy) {
13908 				if (vm_map_lock_read_to_write(map)) {
13909 					vm_map_lock_read(map);
13910 					*real_map = map;
13911 					goto RetryLookup;
13912 				}
13913 				vm_map_lock_read(VME_SUBMAP(entry));
13914 				*var_map = VME_SUBMAP(entry);
13915 				cow_sub_map_parent = map;
13916 				/* reset base to map before cow object */
13917 				/* this is the map which will accept   */
13918 				/* the new cow object */
13919 				old_start = entry->vme_start;
13920 				old_end = entry->vme_end;
13921 				cow_parent_vaddr = vaddr;
13922 				mapped_needs_copy = TRUE;
13923 			} else {
13924 				vm_map_lock_read(VME_SUBMAP(entry));
13925 				*var_map = VME_SUBMAP(entry);
13926 				if ((cow_sub_map_parent != map) &&
13927 				    (*real_map != map)) {
13928 					vm_map_unlock(map);
13929 				}
13930 			}
13931 		} else {
13932 			if (entry->needs_copy) {
13933 				submap_needed_copy = TRUE;
13934 			}
13935 			vm_map_lock_read(VME_SUBMAP(entry));
13936 			*var_map = VME_SUBMAP(entry);
13937 			/* leave map locked if it is a target */
13938 			/* cow sub_map above otherwise, just  */
13939 			/* follow the maps down to the object */
13940 			/* here we unlock knowing we are not  */
13941 			/* revisiting the map.  */
13942 			if ((*real_map != map) && (map != cow_sub_map_parent)) {
13943 				vm_map_unlock_read(map);
13944 			}
13945 		}
13946 
13947 		entry = NULL;
13948 		map = *var_map;
13949 
13950 		/* calculate the offset in the submap for vaddr */
13951 		local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
13952 		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
13953 		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
13954 		    (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
13955 
13956 RetrySubMap:
13957 		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
13958 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13959 				vm_map_unlock(cow_sub_map_parent);
13960 			}
13961 			if ((*real_map != map)
13962 			    && (*real_map != cow_sub_map_parent)) {
13963 				vm_map_unlock(*real_map);
13964 			}
13965 			*real_map = map;
13966 			return KERN_INVALID_ADDRESS;
13967 		}
13968 
13969 		/* find the attenuated shadow of the underlying object */
13970 		/* on our target map */
13971 
13972 		/* in english the submap object may extend beyond the     */
13973 		/* region mapped by the entry or, may only fill a portion */
13974 		/* of it.  For our purposes, we only care if the object   */
13975 		/* doesn't fill.  In this case the area which will        */
13976 		/* ultimately be clipped in the top map will only need    */
13977 		/* to be as big as the portion of the underlying entry    */
13978 		/* which is mapped */
13979 		start_delta = submap_entry->vme_start > top_entry_saved_offset ?
13980 		    submap_entry->vme_start - top_entry_saved_offset : 0;
13981 
13982 		end_delta =
13983 		    (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
13984 		    submap_entry->vme_end ?
13985 		    0 : (top_entry_saved_offset +
13986 		    (old_end - old_start))
13987 		    - submap_entry->vme_end;
13988 
13989 		old_start += start_delta;
13990 		old_end -= end_delta;
13991 
13992 		if (submap_entry->is_sub_map) {
13993 			entry = submap_entry;
13994 			vaddr = local_vaddr;
13995 			goto submap_recurse;
13996 		}
13997 
13998 		if (((fault_type & VM_PROT_WRITE) ||
13999 		    force_copy)
14000 		    && cow_sub_map_parent) {
14001 			vm_object_t     sub_object, copy_object;
14002 			vm_object_offset_t copy_offset;
14003 			vm_map_offset_t local_start;
14004 			vm_map_offset_t local_end;
14005 			boolean_t       object_copied = FALSE;
14006 			vm_object_offset_t object_copied_offset = 0;
14007 			boolean_t       object_copied_needs_copy = FALSE;
14008 			kern_return_t   kr = KERN_SUCCESS;
14009 
14010 			if (vm_map_lock_read_to_write(map)) {
14011 				vm_map_lock_read(map);
14012 				old_start -= start_delta;
14013 				old_end += end_delta;
14014 				goto RetrySubMap;
14015 			}
14016 
14017 
14018 			sub_object = VME_OBJECT(submap_entry);
14019 			if (sub_object == VM_OBJECT_NULL) {
14020 				sub_object =
14021 				    vm_object_allocate(
14022 					(vm_map_size_t)
14023 					(submap_entry->vme_end -
14024 					submap_entry->vme_start));
14025 				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
14026 				VME_OFFSET_SET(submap_entry, 0);
14027 				assert(!submap_entry->is_sub_map);
14028 				assert(submap_entry->use_pmap);
14029 			}
14030 			local_start =  local_vaddr -
14031 			    (cow_parent_vaddr - old_start);
14032 			local_end = local_vaddr +
14033 			    (old_end - cow_parent_vaddr);
14034 			vm_map_clip_start(map, submap_entry, local_start);
14035 			vm_map_clip_end(map, submap_entry, local_end);
14036 			if (submap_entry->is_sub_map) {
14037 				/* unnesting was done when clipping */
14038 				assert(!submap_entry->use_pmap);
14039 			}
14040 
14041 			/* This is the COW case, lets connect */
14042 			/* an entry in our space to the underlying */
14043 			/* object in the submap, bypassing the  */
14044 			/* submap. */
14045 			submap_entry_offset = VME_OFFSET(submap_entry);
14046 			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
14047 
14048 			if ((submap_entry->wired_count != 0 ||
14049 			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
14050 			    (submap_entry->protection & VM_PROT_EXECUTE) &&
14051 			    no_force_copy_if_executable) {
14052 //				printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
14053 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14054 					vm_map_unlock(cow_sub_map_parent);
14055 				}
14056 				if ((*real_map != map)
14057 				    && (*real_map != cow_sub_map_parent)) {
14058 					vm_map_unlock(*real_map);
14059 				}
14060 				*real_map = map;
14061 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
14062 				vm_map_lock_write_to_read(map);
14063 				kr = KERN_PROTECTION_FAILURE;
14064 				DTRACE_VM4(submap_no_copy_executable,
14065 				    vm_map_t, map,
14066 				    vm_object_offset_t, submap_entry_offset,
14067 				    vm_object_size_t, submap_entry_size,
14068 				    int, kr);
14069 				return kr;
14070 			}
14071 
14072 			if (submap_entry->wired_count != 0) {
14073 				vm_object_reference(sub_object);
14074 
14075 				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
14076 				    "submap_entry %p offset 0x%llx\n",
14077 				    submap_entry, VME_OFFSET(submap_entry));
14078 
14079 				DTRACE_VM6(submap_copy_slowly,
14080 				    vm_map_t, cow_sub_map_parent,
14081 				    vm_map_offset_t, vaddr,
14082 				    vm_map_t, map,
14083 				    vm_object_size_t, submap_entry_size,
14084 				    int, submap_entry->wired_count,
14085 				    int, sub_object->copy_strategy);
14086 
14087 				saved_submap_entry = submap_entry;
14088 				version.main_timestamp = map->timestamp;
14089 				vm_map_unlock(map); /* Increments timestamp by 1 */
14090 				submap_entry = VM_MAP_ENTRY_NULL;
14091 
14092 				vm_object_lock(sub_object);
14093 				kr = vm_object_copy_slowly(sub_object,
14094 				    submap_entry_offset,
14095 				    submap_entry_size,
14096 				    FALSE,
14097 				    &copy_object);
14098 				object_copied = TRUE;
14099 				object_copied_offset = 0;
14100 				/* 4k: account for extra offset in physical page */
14101 				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
14102 				object_copied_needs_copy = FALSE;
14103 				vm_object_deallocate(sub_object);
14104 
14105 				vm_map_lock(map);
14106 
14107 				if (kr != KERN_SUCCESS &&
14108 				    kr != KERN_MEMORY_RESTART_COPY) {
14109 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14110 						vm_map_unlock(cow_sub_map_parent);
14111 					}
14112 					if ((*real_map != map)
14113 					    && (*real_map != cow_sub_map_parent)) {
14114 						vm_map_unlock(*real_map);
14115 					}
14116 					*real_map = map;
14117 					vm_object_deallocate(copy_object);
14118 					copy_object = VM_OBJECT_NULL;
14119 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
14120 					vm_map_lock_write_to_read(map);
14121 					DTRACE_VM4(submap_copy_error_slowly,
14122 					    vm_object_t, sub_object,
14123 					    vm_object_offset_t, submap_entry_offset,
14124 					    vm_object_size_t, submap_entry_size,
14125 					    int, kr);
14126 					vm_map_lookup_and_lock_object_copy_slowly_error++;
14127 					return kr;
14128 				}
14129 
14130 				if ((kr == KERN_SUCCESS) &&
14131 				    (version.main_timestamp + 1) == map->timestamp) {
14132 					submap_entry = saved_submap_entry;
14133 				} else {
14134 					saved_submap_entry = NULL;
14135 					old_start -= start_delta;
14136 					old_end += end_delta;
14137 					vm_object_deallocate(copy_object);
14138 					copy_object = VM_OBJECT_NULL;
14139 					vm_map_lock_write_to_read(map);
14140 					vm_map_lookup_and_lock_object_copy_slowly_restart++;
14141 					goto RetrySubMap;
14142 				}
14143 				vm_map_lookup_and_lock_object_copy_slowly_count++;
14144 				vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
14145 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
14146 					vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
14147 				}
14148 			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
14149 				submap_entry_offset = VME_OFFSET(submap_entry);
14150 				copy_object = VM_OBJECT_NULL;
14151 				object_copied_offset = submap_entry_offset;
14152 				object_copied_needs_copy = FALSE;
14153 				DTRACE_VM6(submap_copy_strategically,
14154 				    vm_map_t, cow_sub_map_parent,
14155 				    vm_map_offset_t, vaddr,
14156 				    vm_map_t, map,
14157 				    vm_object_size_t, submap_entry_size,
14158 				    int, submap_entry->wired_count,
14159 				    int, sub_object->copy_strategy);
14160 				kr = vm_object_copy_strategically(
14161 					sub_object,
14162 					submap_entry_offset,
14163 					submap_entry->vme_end - submap_entry->vme_start,
14164 					false, /* forking */
14165 					&copy_object,
14166 					&object_copied_offset,
14167 					&object_copied_needs_copy);
14168 				if (kr == KERN_MEMORY_RESTART_COPY) {
14169 					old_start -= start_delta;
14170 					old_end += end_delta;
14171 					vm_object_deallocate(copy_object);
14172 					copy_object = VM_OBJECT_NULL;
14173 					vm_map_lock_write_to_read(map);
14174 					vm_map_lookup_and_lock_object_copy_strategically_restart++;
14175 					goto RetrySubMap;
14176 				}
14177 				if (kr != KERN_SUCCESS) {
14178 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14179 						vm_map_unlock(cow_sub_map_parent);
14180 					}
14181 					if ((*real_map != map)
14182 					    && (*real_map != cow_sub_map_parent)) {
14183 						vm_map_unlock(*real_map);
14184 					}
14185 					*real_map = map;
14186 					vm_object_deallocate(copy_object);
14187 					copy_object = VM_OBJECT_NULL;
14188 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14189 					vm_map_lock_write_to_read(map);
14190 					DTRACE_VM4(submap_copy_error_strategically,
14191 					    vm_object_t, sub_object,
14192 					    vm_object_offset_t, submap_entry_offset,
14193 					    vm_object_size_t, submap_entry_size,
14194 					    int, kr);
14195 					vm_map_lookup_and_lock_object_copy_strategically_error++;
14196 					return kr;
14197 				}
14198 				assert(copy_object != VM_OBJECT_NULL);
14199 				assert(copy_object != sub_object);
14200 				object_copied = TRUE;
14201 				vm_map_lookup_and_lock_object_copy_strategically_count++;
14202 				vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14203 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14204 					vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14205 				}
14206 			} else {
14207 				/* set up shadow object */
14208 				object_copied = FALSE;
14209 				copy_object = sub_object;
14210 				vm_object_lock(sub_object);
14211 				vm_object_reference_locked(sub_object);
14212 				sub_object->shadowed = TRUE;
14213 				vm_object_unlock(sub_object);
14214 
14215 				assert(submap_entry->wired_count == 0);
14216 				submap_entry->needs_copy = TRUE;
14217 
14218 				prot = submap_entry->protection;
14219 				assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
14220 				prot = prot & ~VM_PROT_WRITE;
14221 				assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
14222 
14223 				if (override_nx(old_map,
14224 				    VME_ALIAS(submap_entry))
14225 				    && prot) {
14226 					prot |= VM_PROT_EXECUTE;
14227 				}
14228 
14229 				vm_object_pmap_protect(
14230 					sub_object,
14231 					VME_OFFSET(submap_entry),
14232 					submap_entry->vme_end -
14233 					submap_entry->vme_start,
14234 					(submap_entry->is_shared
14235 					|| map->mapped_in_other_pmaps) ?
14236 					PMAP_NULL : map->pmap,
14237 					VM_MAP_PAGE_SIZE(map),
14238 					submap_entry->vme_start,
14239 					prot);
14240 				vm_map_lookup_and_lock_object_copy_shadow_count++;
14241 				vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14242 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14243 					vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14244 				}
14245 			}
14246 
14247 			/*
14248 			 * Adjust the fault offset to the submap entry.
14249 			 */
14250 			copy_offset = (local_vaddr -
14251 			    submap_entry->vme_start +
14252 			    VME_OFFSET(submap_entry));
14253 
14254 			/* This works diffently than the   */
14255 			/* normal submap case. We go back  */
14256 			/* to the parent of the cow map and*/
14257 			/* clip out the target portion of  */
14258 			/* the sub_map, substituting the   */
14259 			/* new copy object,                */
14260 
14261 			subentry_protection = submap_entry->protection;
14262 			subentry_max_protection = submap_entry->max_protection;
14263 			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14264 			subentry_permanent = submap_entry->vme_permanent;
14265 			subentry_csm_associated = submap_entry->csm_associated;
14266 #if __arm64e__
14267 			subentry_used_for_tpro = submap_entry->used_for_tpro;
14268 #endif // __arm64e__
14269 			vm_map_unlock(map);
14270 			submap_entry = NULL; /* not valid after map unlock */
14271 
14272 			local_start = old_start;
14273 			local_end = old_end;
14274 			map = cow_sub_map_parent;
14275 			*var_map = cow_sub_map_parent;
14276 			vaddr = cow_parent_vaddr;
14277 			cow_sub_map_parent = NULL;
14278 
14279 			if (!vm_map_lookup_entry(map,
14280 			    vaddr, &entry)) {
14281 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14282 					vm_map_unlock(cow_sub_map_parent);
14283 				}
14284 				if ((*real_map != map)
14285 				    && (*real_map != cow_sub_map_parent)) {
14286 					vm_map_unlock(*real_map);
14287 				}
14288 				*real_map = map;
14289 				vm_object_deallocate(
14290 					copy_object);
14291 				copy_object = VM_OBJECT_NULL;
14292 				vm_map_lock_write_to_read(map);
14293 				DTRACE_VM4(submap_lookup_post_unlock,
14294 				    uint64_t, (uint64_t)entry->vme_start,
14295 				    uint64_t, (uint64_t)entry->vme_end,
14296 				    vm_map_offset_t, vaddr,
14297 				    int, object_copied);
14298 				return KERN_INVALID_ADDRESS;
14299 			}
14300 
14301 			/* clip out the portion of space */
14302 			/* mapped by the sub map which   */
14303 			/* corresponds to the underlying */
14304 			/* object */
14305 
14306 			/*
14307 			 * Clip (and unnest) the smallest nested chunk
14308 			 * possible around the faulting address...
14309 			 */
14310 			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14311 			local_end = local_start + pmap_shared_region_size_min(map->pmap);
14312 			/*
14313 			 * ... but don't go beyond the "old_start" to "old_end"
14314 			 * range, to avoid spanning over another VM region
14315 			 * with a possibly different VM object and/or offset.
14316 			 */
14317 			if (local_start < old_start) {
14318 				local_start = old_start;
14319 			}
14320 			if (local_end > old_end) {
14321 				local_end = old_end;
14322 			}
14323 			/*
14324 			 * Adjust copy_offset to the start of the range.
14325 			 */
14326 			copy_offset -= (vaddr - local_start);
14327 
14328 			vm_map_clip_start(map, entry, local_start);
14329 			vm_map_clip_end(map, entry, local_end);
14330 			if (entry->is_sub_map) {
14331 				/* unnesting was done when clipping */
14332 				assert(!entry->use_pmap);
14333 			}
14334 
14335 			/* substitute copy object for */
14336 			/* shared map entry           */
14337 			vm_map_deallocate(VME_SUBMAP(entry));
14338 			assert(!entry->iokit_acct);
14339 			entry->use_pmap = TRUE;
14340 			VME_OBJECT_SET(entry, copy_object, false, 0);
14341 
14342 			/* propagate the submap entry's protections */
14343 			if (entry->protection != VM_PROT_READ) {
14344 				/*
14345 				 * Someone has already altered the top entry's
14346 				 * protections via vm_protect(VM_PROT_COPY).
14347 				 * Respect these new values and ignore the
14348 				 * submap entry's protections.
14349 				 */
14350 			} else {
14351 				/*
14352 				 * Regular copy-on-write: propagate the submap
14353 				 * entry's protections to the top map entry.
14354 				 */
14355 				entry->protection |= subentry_protection;
14356 			}
14357 			entry->max_protection |= subentry_max_protection;
14358 			/* propagate some attributes from subentry */
14359 			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14360 			entry->vme_permanent = subentry_permanent;
14361 			entry->csm_associated = subentry_csm_associated;
14362 #if __arm64e__
14363 			/* propagate TPRO iff the destination map has TPRO enabled */
14364 			if (subentry_used_for_tpro && vm_map_tpro(map)) {
14365 				entry->used_for_tpro = subentry_used_for_tpro;
14366 			}
14367 #endif /* __arm64e */
14368 			if ((entry->protection & VM_PROT_WRITE) &&
14369 			    (entry->protection & VM_PROT_EXECUTE) &&
14370 #if XNU_TARGET_OS_OSX
14371 			    map->pmap != kernel_pmap &&
14372 			    (vm_map_cs_enforcement(map)
14373 #if __arm64__
14374 			    || !VM_MAP_IS_EXOTIC(map)
14375 #endif /* __arm64__ */
14376 			    ) &&
14377 #endif /* XNU_TARGET_OS_OSX */
14378 #if CODE_SIGNING_MONITOR
14379 			    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14380 #endif
14381 			    !(entry->used_for_jit) &&
14382 			    VM_MAP_POLICY_WX_STRIP_X(map)) {
14383 				DTRACE_VM3(cs_wx,
14384 				    uint64_t, (uint64_t)entry->vme_start,
14385 				    uint64_t, (uint64_t)entry->vme_end,
14386 				    vm_prot_t, entry->protection);
14387 				printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14388 				    proc_selfpid(),
14389 				    (get_bsdtask_info(current_task())
14390 				    ? proc_name_address(get_bsdtask_info(current_task()))
14391 				    : "?"),
14392 				    __FUNCTION__, __LINE__,
14393 #if DEVELOPMENT || DEBUG
14394 				    (uint64_t)entry->vme_start,
14395 				    (uint64_t)entry->vme_end,
14396 #else /* DEVELOPMENT || DEBUG */
14397 				    (uint64_t)0,
14398 				    (uint64_t)0,
14399 #endif /* DEVELOPMENT || DEBUG */
14400 				    entry->protection);
14401 				entry->protection &= ~VM_PROT_EXECUTE;
14402 			}
14403 
14404 			if (object_copied) {
14405 				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14406 				entry->needs_copy = object_copied_needs_copy;
14407 				entry->is_shared = FALSE;
14408 			} else {
14409 				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14410 				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14411 				assert(entry->wired_count == 0);
14412 				VME_OFFSET_SET(entry, copy_offset);
14413 				entry->needs_copy = TRUE;
14414 				if (map != old_map) {
14415 					entry->is_shared = TRUE;
14416 				}
14417 			}
14418 			if (entry->inheritance == VM_INHERIT_SHARE) {
14419 				entry->inheritance = VM_INHERIT_COPY;
14420 			}
14421 
14422 			vm_map_lock_write_to_read(map);
14423 		} else {
14424 			if ((cow_sub_map_parent)
14425 			    && (cow_sub_map_parent != *real_map)
14426 			    && (cow_sub_map_parent != map)) {
14427 				vm_map_unlock(cow_sub_map_parent);
14428 			}
14429 			entry = submap_entry;
14430 			vaddr = local_vaddr;
14431 		}
14432 	}
14433 
14434 	/*
14435 	 *	Check whether this task is allowed to have
14436 	 *	this page.
14437 	 */
14438 
14439 	prot = entry->protection;
14440 
14441 	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14442 		/*
14443 		 * HACK -- if not a stack, then allow execution
14444 		 */
14445 		prot |= VM_PROT_EXECUTE;
14446 	}
14447 
14448 #if __arm64e__
14449 	/*
14450 	 * If the entry we're dealing with is TPRO and we have a write
14451 	 * fault, inject VM_PROT_WRITE into protections. This allows us
14452 	 * to maintain RO permissions when not marked as TPRO.
14453 	 */
14454 	if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14455 		prot |= VM_PROT_WRITE;
14456 	}
14457 #endif /* __arm64e__ */
14458 	if (mask_protections) {
14459 		fault_type &= prot;
14460 		if (fault_type == VM_PROT_NONE) {
14461 			goto protection_failure;
14462 		}
14463 	}
14464 	if (((fault_type & prot) != fault_type)
14465 #if __arm64__
14466 	    /* prefetch abort in execute-only page */
14467 	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14468 #elif defined(__x86_64__)
14469 	    /* Consider the UEXEC bit when handling an EXECUTE fault */
14470 	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14471 #endif
14472 	    ) {
14473 protection_failure:
14474 		if (*real_map != map) {
14475 			vm_map_unlock(*real_map);
14476 		}
14477 		*real_map = map;
14478 
14479 		if ((fault_type & VM_PROT_EXECUTE) && prot) {
14480 			log_stack_execution_failure((addr64_t)vaddr, prot);
14481 		}
14482 
14483 		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14484 		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14485 		/*
14486 		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14487 		 *
14488 		 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14489 		 */
14490 		return KERN_PROTECTION_FAILURE;
14491 	}
14492 
14493 	/*
14494 	 *	If this page is not pageable, we have to get
14495 	 *	it for all possible accesses.
14496 	 */
14497 
14498 	*wired = (entry->wired_count != 0);
14499 	if (*wired) {
14500 		fault_type = prot;
14501 	}
14502 
14503 	/*
14504 	 *	If the entry was copy-on-write, we either ...
14505 	 */
14506 
14507 	if (entry->needs_copy) {
14508 		/*
14509 		 *	If we want to write the page, we may as well
14510 		 *	handle that now since we've got the map locked.
14511 		 *
14512 		 *	If we don't need to write the page, we just
14513 		 *	demote the permissions allowed.
14514 		 */
14515 
14516 		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14517 			/*
14518 			 *	Make a new object, and place it in the
14519 			 *	object chain.  Note that no new references
14520 			 *	have appeared -- one just moved from the
14521 			 *	map to the new object.
14522 			 */
14523 
14524 			if (vm_map_lock_read_to_write(map)) {
14525 				vm_map_lock_read(map);
14526 				goto RetryLookup;
14527 			}
14528 
14529 			if (VME_OBJECT(entry)->shadowed == FALSE) {
14530 				vm_object_lock(VME_OBJECT(entry));
14531 				VME_OBJECT(entry)->shadowed = TRUE;
14532 				vm_object_unlock(VME_OBJECT(entry));
14533 			}
14534 			VME_OBJECT_SHADOW(entry,
14535 			    (vm_map_size_t) (entry->vme_end -
14536 			    entry->vme_start),
14537 			    vm_map_always_shadow(map));
14538 			entry->needs_copy = FALSE;
14539 
14540 			vm_map_lock_write_to_read(map);
14541 		}
14542 		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14543 			/*
14544 			 *	We're attempting to read a copy-on-write
14545 			 *	page -- don't allow writes.
14546 			 */
14547 
14548 			prot &= (~VM_PROT_WRITE);
14549 		}
14550 	}
14551 
14552 	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14553 		/*
14554 		 * We went through a "needs_copy" submap without triggering
14555 		 * a copy, so granting write access to the page would bypass
14556 		 * that submap's "needs_copy".
14557 		 */
14558 		assert(!(fault_type & VM_PROT_WRITE));
14559 		assert(!*wired);
14560 		assert(!force_copy);
14561 		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14562 		prot &= ~VM_PROT_WRITE;
14563 	}
14564 
14565 	/*
14566 	 *	Create an object if necessary.
14567 	 */
14568 	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14569 		if (vm_map_lock_read_to_write(map)) {
14570 			vm_map_lock_read(map);
14571 			goto RetryLookup;
14572 		}
14573 
14574 		VME_OBJECT_SET(entry,
14575 		    vm_object_allocate(
14576 			    (vm_map_size_t)(entry->vme_end -
14577 			    entry->vme_start)), false, 0);
14578 		VME_OFFSET_SET(entry, 0);
14579 		assert(entry->use_pmap);
14580 		vm_map_lock_write_to_read(map);
14581 	}
14582 
14583 	/*
14584 	 *	Return the object/offset from this entry.  If the entry
14585 	 *	was copy-on-write or empty, it has been fixed up.  Also
14586 	 *	return the protection.
14587 	 */
14588 
14589 	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14590 	*object = VME_OBJECT(entry);
14591 	*out_prot = prot;
14592 	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14593 
14594 	if (fault_info) {
14595 		fault_info->interruptible = THREAD_UNINT; /* for now... */
14596 		/* ... the caller will change "interruptible" if needed */
14597 		fault_info->cluster_size = 0;
14598 		fault_info->user_tag = VME_ALIAS(entry);
14599 		fault_info->pmap_options = 0;
14600 		if (entry->iokit_acct ||
14601 		    (!entry->is_sub_map && !entry->use_pmap)) {
14602 			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14603 		}
14604 		fault_info->behavior = entry->behavior;
14605 		fault_info->lo_offset = VME_OFFSET(entry);
14606 		fault_info->hi_offset =
14607 		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14608 		fault_info->no_cache  = entry->no_cache;
14609 		fault_info->stealth = FALSE;
14610 		fault_info->io_sync = FALSE;
14611 		if (entry->used_for_jit ||
14612 #if CODE_SIGNING_MONITOR
14613 		    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14614 #endif
14615 		    entry->vme_resilient_codesign) {
14616 			fault_info->cs_bypass = TRUE;
14617 		} else {
14618 			fault_info->cs_bypass = FALSE;
14619 		}
14620 		fault_info->csm_associated = FALSE;
14621 #if CODE_SIGNING_MONITOR
14622 		if (entry->csm_associated) {
14623 			/*
14624 			 * The pmap layer will validate this page
14625 			 * before allowing it to be executed from.
14626 			 */
14627 			fault_info->csm_associated = TRUE;
14628 		}
14629 #endif
14630 		fault_info->mark_zf_absent = FALSE;
14631 		fault_info->batch_pmap_op = FALSE;
14632 		fault_info->resilient_media = entry->vme_resilient_media;
14633 		fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14634 		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14635 #if __arm64e__
14636 		fault_info->fi_used_for_tpro = entry->used_for_tpro;
14637 #else /* __arm64e__ */
14638 		fault_info->fi_used_for_tpro = FALSE;
14639 #endif
14640 		if (entry->translated_allow_execute) {
14641 			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14642 		}
14643 	}
14644 
14645 	/*
14646 	 *	Lock the object to prevent it from disappearing
14647 	 */
14648 	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14649 		if (contended == NULL) {
14650 			vm_object_lock(*object);
14651 		} else {
14652 			*contended = vm_object_lock_check_contended(*object);
14653 		}
14654 	} else {
14655 		vm_object_lock_shared(*object);
14656 	}
14657 
14658 	/*
14659 	 *	Save the version number
14660 	 */
14661 
14662 	out_version->main_timestamp = map->timestamp;
14663 
14664 	return KERN_SUCCESS;
14665 }
14666 
14667 
14668 /*
14669  *	vm_map_verify:
14670  *
14671  *	Verifies that the map in question has not changed
14672  *	since the given version. The map has to be locked
14673  *	("shared" mode is fine) before calling this function
14674  *	and it will be returned locked too.
14675  */
14676 boolean_t
vm_map_verify(vm_map_t map,vm_map_version_t * version)14677 vm_map_verify(
14678 	vm_map_t                map,
14679 	vm_map_version_t        *version)       /* REF */
14680 {
14681 	boolean_t       result;
14682 
14683 	vm_map_lock_assert_held(map);
14684 	result = (map->timestamp == version->main_timestamp);
14685 
14686 	return result;
14687 }
14688 
14689 /*
14690  *	TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14691  *	Goes away after regular vm_region_recurse function migrates to
14692  *	64 bits
14693  *	vm_region_recurse: A form of vm_region which follows the
14694  *	submaps in a target map
14695  *
14696  */
14697 
14698 kern_return_t
vm_map_region_recurse_64(vm_map_t map,vm_map_offset_t * address,vm_map_size_t * size,natural_t * nesting_depth,vm_region_submap_info_64_t submap_info,mach_msg_type_number_t * count)14699 vm_map_region_recurse_64(
14700 	vm_map_t                 map,
14701 	vm_map_offset_t *address,               /* IN/OUT */
14702 	vm_map_size_t           *size,                  /* OUT */
14703 	natural_t               *nesting_depth, /* IN/OUT */
14704 	vm_region_submap_info_64_t      submap_info,    /* IN/OUT */
14705 	mach_msg_type_number_t  *count) /* IN/OUT */
14706 {
14707 	mach_msg_type_number_t  original_count;
14708 	vm_region_extended_info_data_t  extended;
14709 	vm_map_entry_t                  tmp_entry;
14710 	vm_map_offset_t                 user_address;
14711 	unsigned int                    user_max_depth;
14712 
14713 	/*
14714 	 * "curr_entry" is the VM map entry preceding or including the
14715 	 * address we're looking for.
14716 	 * "curr_map" is the map or sub-map containing "curr_entry".
14717 	 * "curr_address" is the equivalent of the top map's "user_address"
14718 	 * in the current map.
14719 	 * "curr_offset" is the cumulated offset of "curr_map" in the
14720 	 * target task's address space.
14721 	 * "curr_depth" is the depth of "curr_map" in the chain of
14722 	 * sub-maps.
14723 	 *
14724 	 * "curr_max_below" and "curr_max_above" limit the range (around
14725 	 * "curr_address") we should take into account in the current (sub)map.
14726 	 * They limit the range to what's visible through the map entries
14727 	 * we've traversed from the top map to the current map.
14728 	 *
14729 	 */
14730 	vm_map_entry_t                  curr_entry;
14731 	vm_map_address_t                curr_address;
14732 	vm_map_offset_t                 curr_offset;
14733 	vm_map_t                        curr_map;
14734 	unsigned int                    curr_depth;
14735 	vm_map_offset_t                 curr_max_below, curr_max_above;
14736 	vm_map_offset_t                 curr_skip;
14737 
14738 	/*
14739 	 * "next_" is the same as "curr_" but for the VM region immediately
14740 	 * after the address we're looking for.  We need to keep track of this
14741 	 * too because we want to return info about that region if the
14742 	 * address we're looking for is not mapped.
14743 	 */
14744 	vm_map_entry_t                  next_entry;
14745 	vm_map_offset_t                 next_offset;
14746 	vm_map_offset_t                 next_address;
14747 	vm_map_t                        next_map;
14748 	unsigned int                    next_depth;
14749 	vm_map_offset_t                 next_max_below, next_max_above;
14750 	vm_map_offset_t                 next_skip;
14751 
14752 	boolean_t                       look_for_pages;
14753 	vm_region_submap_short_info_64_t short_info;
14754 	boolean_t                       do_region_footprint;
14755 	int                             effective_page_size, effective_page_shift;
14756 	boolean_t                       submap_needed_copy;
14757 
14758 	if (map == VM_MAP_NULL) {
14759 		/* no address space to work on */
14760 		return KERN_INVALID_ARGUMENT;
14761 	}
14762 
14763 	effective_page_shift = vm_self_region_page_shift(map);
14764 	effective_page_size = (1 << effective_page_shift);
14765 
14766 	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
14767 		/*
14768 		 * "info" structure is not big enough and
14769 		 * would overflow
14770 		 */
14771 		return KERN_INVALID_ARGUMENT;
14772 	}
14773 
14774 	do_region_footprint = task_self_region_footprint();
14775 	original_count = *count;
14776 
14777 	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
14778 		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
14779 		look_for_pages = FALSE;
14780 		short_info = (vm_region_submap_short_info_64_t) submap_info;
14781 		submap_info = NULL;
14782 	} else {
14783 		look_for_pages = TRUE;
14784 		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
14785 		short_info = NULL;
14786 
14787 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
14788 			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
14789 		}
14790 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
14791 			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
14792 		}
14793 	}
14794 
14795 	user_address = *address;
14796 	user_max_depth = *nesting_depth;
14797 	submap_needed_copy = FALSE;
14798 
14799 	if (not_in_kdp) {
14800 		vm_map_lock_read(map);
14801 	}
14802 
14803 recurse_again:
14804 	curr_entry = NULL;
14805 	curr_map = map;
14806 	curr_address = user_address;
14807 	curr_offset = 0;
14808 	curr_skip = 0;
14809 	curr_depth = 0;
14810 	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
14811 	curr_max_below = curr_address;
14812 
14813 	next_entry = NULL;
14814 	next_map = NULL;
14815 	next_address = 0;
14816 	next_offset = 0;
14817 	next_skip = 0;
14818 	next_depth = 0;
14819 	next_max_above = (vm_map_offset_t) -1;
14820 	next_max_below = (vm_map_offset_t) -1;
14821 
14822 	for (;;) {
14823 		if (vm_map_lookup_entry(curr_map,
14824 		    curr_address,
14825 		    &tmp_entry)) {
14826 			/* tmp_entry contains the address we're looking for */
14827 			curr_entry = tmp_entry;
14828 		} else {
14829 			vm_map_offset_t skip;
14830 			/*
14831 			 * The address is not mapped.  "tmp_entry" is the
14832 			 * map entry preceding the address.  We want the next
14833 			 * one, if it exists.
14834 			 */
14835 			curr_entry = tmp_entry->vme_next;
14836 
14837 			if (curr_entry == vm_map_to_entry(curr_map) ||
14838 			    (curr_entry->vme_start >=
14839 			    curr_address + curr_max_above)) {
14840 				/* no next entry at this level: stop looking */
14841 				if (not_in_kdp) {
14842 					vm_map_unlock_read(curr_map);
14843 				}
14844 				curr_entry = NULL;
14845 				curr_map = NULL;
14846 				curr_skip = 0;
14847 				curr_offset = 0;
14848 				curr_depth = 0;
14849 				curr_max_above = 0;
14850 				curr_max_below = 0;
14851 				break;
14852 			}
14853 
14854 			/* adjust current address and offset */
14855 			skip = curr_entry->vme_start - curr_address;
14856 			curr_address = curr_entry->vme_start;
14857 			curr_skip += skip;
14858 			curr_offset += skip;
14859 			curr_max_above -= skip;
14860 			curr_max_below = 0;
14861 		}
14862 
14863 		/*
14864 		 * Is the next entry at this level closer to the address (or
14865 		 * deeper in the submap chain) than the one we had
14866 		 * so far ?
14867 		 */
14868 		tmp_entry = curr_entry->vme_next;
14869 		if (tmp_entry == vm_map_to_entry(curr_map)) {
14870 			/* no next entry at this level */
14871 		} else if (tmp_entry->vme_start >=
14872 		    curr_address + curr_max_above) {
14873 			/*
14874 			 * tmp_entry is beyond the scope of what we mapped of
14875 			 * this submap in the upper level: ignore it.
14876 			 */
14877 		} else if ((next_entry == NULL) ||
14878 		    (tmp_entry->vme_start + curr_offset <=
14879 		    next_entry->vme_start + next_offset)) {
14880 			/*
14881 			 * We didn't have a "next_entry" or this one is
14882 			 * closer to the address we're looking for:
14883 			 * use this "tmp_entry" as the new "next_entry".
14884 			 */
14885 			if (next_entry != NULL) {
14886 				/* unlock the last "next_map" */
14887 				if (next_map != curr_map && not_in_kdp) {
14888 					vm_map_unlock_read(next_map);
14889 				}
14890 			}
14891 			next_entry = tmp_entry;
14892 			next_map = curr_map;
14893 			next_depth = curr_depth;
14894 			next_address = next_entry->vme_start;
14895 			next_skip = curr_skip;
14896 			next_skip += (next_address - curr_address);
14897 			next_offset = curr_offset;
14898 			next_offset += (next_address - curr_address);
14899 			next_max_above = MIN(next_max_above, curr_max_above);
14900 			next_max_above = MIN(next_max_above,
14901 			    next_entry->vme_end - next_address);
14902 			next_max_below = MIN(next_max_below, curr_max_below);
14903 			next_max_below = MIN(next_max_below,
14904 			    next_address - next_entry->vme_start);
14905 		}
14906 
14907 		/*
14908 		 * "curr_max_{above,below}" allow us to keep track of the
14909 		 * portion of the submap that is actually mapped at this level:
14910 		 * the rest of that submap is irrelevant to us, since it's not
14911 		 * mapped here.
14912 		 * The relevant portion of the map starts at
14913 		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
14914 		 */
14915 		curr_max_above = MIN(curr_max_above,
14916 		    curr_entry->vme_end - curr_address);
14917 		curr_max_below = MIN(curr_max_below,
14918 		    curr_address - curr_entry->vme_start);
14919 
14920 		if (!curr_entry->is_sub_map ||
14921 		    curr_depth >= user_max_depth) {
14922 			/*
14923 			 * We hit a leaf map or we reached the maximum depth
14924 			 * we could, so stop looking.  Keep the current map
14925 			 * locked.
14926 			 */
14927 			break;
14928 		}
14929 
14930 		/*
14931 		 * Get down to the next submap level.
14932 		 */
14933 
14934 		if (curr_entry->needs_copy) {
14935 			/* everything below this is effectively copy-on-write */
14936 			submap_needed_copy = TRUE;
14937 		}
14938 
14939 		/*
14940 		 * Lock the next level and unlock the current level,
14941 		 * unless we need to keep it locked to access the "next_entry"
14942 		 * later.
14943 		 */
14944 		if (not_in_kdp) {
14945 			vm_map_lock_read(VME_SUBMAP(curr_entry));
14946 		}
14947 		if (curr_map == next_map) {
14948 			/* keep "next_map" locked in case we need it */
14949 		} else {
14950 			/* release this map */
14951 			if (not_in_kdp) {
14952 				vm_map_unlock_read(curr_map);
14953 			}
14954 		}
14955 
14956 		/*
14957 		 * Adjust the offset.  "curr_entry" maps the submap
14958 		 * at relative address "curr_entry->vme_start" in the
14959 		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
14960 		 * bytes of the submap.
14961 		 * "curr_offset" always represents the offset of a virtual
14962 		 * address in the curr_map relative to the absolute address
14963 		 * space (i.e. the top-level VM map).
14964 		 */
14965 		curr_offset +=
14966 		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
14967 		curr_address = user_address + curr_offset;
14968 		/* switch to the submap */
14969 		curr_map = VME_SUBMAP(curr_entry);
14970 		curr_depth++;
14971 		curr_entry = NULL;
14972 	}
14973 
14974 // LP64todo: all the current tools are 32bit, obviously never worked for 64b
14975 // so probably should be a real 32b ID vs. ptr.
14976 // Current users just check for equality
14977 
14978 	if (curr_entry == NULL) {
14979 		/* no VM region contains the address... */
14980 
14981 		if (do_region_footprint && /* we want footprint numbers */
14982 		    next_entry == NULL && /* & there are no more regions */
14983 		    /* & we haven't already provided our fake region: */
14984 		    user_address <= vm_map_last_entry(map)->vme_end) {
14985 			ledger_amount_t ledger_resident, ledger_compressed;
14986 
14987 			/*
14988 			 * Add a fake memory region to account for
14989 			 * purgeable and/or ledger-tagged memory that
14990 			 * counts towards this task's memory footprint,
14991 			 * i.e. the resident/compressed pages of non-volatile
14992 			 * objects owned by that task.
14993 			 */
14994 			task_ledgers_footprint(map->pmap->ledger,
14995 			    &ledger_resident,
14996 			    &ledger_compressed);
14997 			if (ledger_resident + ledger_compressed == 0) {
14998 				/* no purgeable memory usage to report */
14999 				return KERN_INVALID_ADDRESS;
15000 			}
15001 			/* fake region to show nonvolatile footprint */
15002 			if (look_for_pages) {
15003 				submap_info->protection = VM_PROT_DEFAULT;
15004 				submap_info->max_protection = VM_PROT_DEFAULT;
15005 				submap_info->inheritance = VM_INHERIT_DEFAULT;
15006 				submap_info->offset = 0;
15007 				submap_info->user_tag = -1;
15008 				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
15009 				submap_info->pages_shared_now_private = 0;
15010 				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
15011 				submap_info->pages_dirtied = submap_info->pages_resident;
15012 				submap_info->ref_count = 1;
15013 				submap_info->shadow_depth = 0;
15014 				submap_info->external_pager = 0;
15015 				submap_info->share_mode = SM_PRIVATE;
15016 				if (submap_needed_copy) {
15017 					submap_info->share_mode = SM_COW;
15018 				}
15019 				submap_info->is_submap = 0;
15020 				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
15021 				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15022 				submap_info->user_wired_count = 0;
15023 				submap_info->pages_reusable = 0;
15024 			} else {
15025 				short_info->user_tag = -1;
15026 				short_info->offset = 0;
15027 				short_info->protection = VM_PROT_DEFAULT;
15028 				short_info->inheritance = VM_INHERIT_DEFAULT;
15029 				short_info->max_protection = VM_PROT_DEFAULT;
15030 				short_info->behavior = VM_BEHAVIOR_DEFAULT;
15031 				short_info->user_wired_count = 0;
15032 				short_info->is_submap = 0;
15033 				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15034 				short_info->external_pager = 0;
15035 				short_info->shadow_depth = 0;
15036 				short_info->share_mode = SM_PRIVATE;
15037 				if (submap_needed_copy) {
15038 					short_info->share_mode = SM_COW;
15039 				}
15040 				short_info->ref_count = 1;
15041 			}
15042 			*nesting_depth = 0;
15043 			*size = (vm_map_size_t) (ledger_resident + ledger_compressed);
15044 //			*address = user_address;
15045 			*address = vm_map_last_entry(map)->vme_end;
15046 			return KERN_SUCCESS;
15047 		}
15048 
15049 		if (next_entry == NULL) {
15050 			/* ... and no VM region follows it either */
15051 			return KERN_INVALID_ADDRESS;
15052 		}
15053 		/* ... gather info about the next VM region */
15054 		curr_entry = next_entry;
15055 		curr_map = next_map;    /* still locked ... */
15056 		curr_address = next_address;
15057 		curr_skip = next_skip;
15058 		curr_offset = next_offset;
15059 		curr_depth = next_depth;
15060 		curr_max_above = next_max_above;
15061 		curr_max_below = next_max_below;
15062 	} else {
15063 		/* we won't need "next_entry" after all */
15064 		if (next_entry != NULL) {
15065 			/* release "next_map" */
15066 			if (next_map != curr_map && not_in_kdp) {
15067 				vm_map_unlock_read(next_map);
15068 			}
15069 		}
15070 	}
15071 	next_entry = NULL;
15072 	next_map = NULL;
15073 	next_offset = 0;
15074 	next_skip = 0;
15075 	next_depth = 0;
15076 	next_max_below = -1;
15077 	next_max_above = -1;
15078 
15079 	if (curr_entry->is_sub_map &&
15080 	    curr_depth < user_max_depth) {
15081 		/*
15082 		 * We're not as deep as we could be:  we must have
15083 		 * gone back up after not finding anything mapped
15084 		 * below the original top-level map entry's.
15085 		 * Let's move "curr_address" forward and recurse again.
15086 		 */
15087 		user_address = curr_address;
15088 		goto recurse_again;
15089 	}
15090 
15091 	*nesting_depth = curr_depth;
15092 	*size = curr_max_above + curr_max_below;
15093 	*address = user_address + curr_skip - curr_max_below;
15094 
15095 	if (look_for_pages) {
15096 		submap_info->user_tag = VME_ALIAS(curr_entry);
15097 		submap_info->offset = VME_OFFSET(curr_entry);
15098 		submap_info->protection = curr_entry->protection;
15099 		submap_info->inheritance = curr_entry->inheritance;
15100 		submap_info->max_protection = curr_entry->max_protection;
15101 		submap_info->behavior = curr_entry->behavior;
15102 		submap_info->user_wired_count = curr_entry->user_wired_count;
15103 		submap_info->is_submap = curr_entry->is_sub_map;
15104 		if (curr_entry->is_sub_map) {
15105 			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15106 		} else {
15107 			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15108 		}
15109 	} else {
15110 		short_info->user_tag = VME_ALIAS(curr_entry);
15111 		short_info->offset = VME_OFFSET(curr_entry);
15112 		short_info->protection = curr_entry->protection;
15113 		short_info->inheritance = curr_entry->inheritance;
15114 		short_info->max_protection = curr_entry->max_protection;
15115 		short_info->behavior = curr_entry->behavior;
15116 		short_info->user_wired_count = curr_entry->user_wired_count;
15117 		short_info->is_submap = curr_entry->is_sub_map;
15118 		if (curr_entry->is_sub_map) {
15119 			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15120 		} else {
15121 			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15122 		}
15123 	}
15124 
15125 	extended.pages_resident = 0;
15126 	extended.pages_swapped_out = 0;
15127 	extended.pages_shared_now_private = 0;
15128 	extended.pages_dirtied = 0;
15129 	extended.pages_reusable = 0;
15130 	extended.external_pager = 0;
15131 	extended.shadow_depth = 0;
15132 	extended.share_mode = SM_EMPTY;
15133 	extended.ref_count = 0;
15134 
15135 	if (not_in_kdp) {
15136 		if (!curr_entry->is_sub_map) {
15137 			vm_map_offset_t range_start, range_end;
15138 			range_start = MAX((curr_address - curr_max_below),
15139 			    curr_entry->vme_start);
15140 			range_end = MIN((curr_address + curr_max_above),
15141 			    curr_entry->vme_end);
15142 			vm_map_region_walk(curr_map,
15143 			    range_start,
15144 			    curr_entry,
15145 			    (VME_OFFSET(curr_entry) +
15146 			    (range_start -
15147 			    curr_entry->vme_start)),
15148 			    range_end - range_start,
15149 			    &extended,
15150 			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
15151 			if (extended.external_pager &&
15152 			    extended.ref_count == 2 &&
15153 			    extended.share_mode == SM_SHARED) {
15154 				extended.share_mode = SM_PRIVATE;
15155 			}
15156 			if (submap_needed_copy) {
15157 				extended.share_mode = SM_COW;
15158 			}
15159 		} else {
15160 			if (curr_entry->use_pmap) {
15161 				extended.share_mode = SM_TRUESHARED;
15162 			} else {
15163 				extended.share_mode = SM_PRIVATE;
15164 			}
15165 			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
15166 		}
15167 	}
15168 
15169 	if (look_for_pages) {
15170 		submap_info->pages_resident = extended.pages_resident;
15171 		submap_info->pages_swapped_out = extended.pages_swapped_out;
15172 		submap_info->pages_shared_now_private =
15173 		    extended.pages_shared_now_private;
15174 		submap_info->pages_dirtied = extended.pages_dirtied;
15175 		submap_info->external_pager = extended.external_pager;
15176 		submap_info->shadow_depth = extended.shadow_depth;
15177 		submap_info->share_mode = extended.share_mode;
15178 		submap_info->ref_count = extended.ref_count;
15179 
15180 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15181 			submap_info->pages_reusable = extended.pages_reusable;
15182 		}
15183 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15184 			if (curr_entry->is_sub_map) {
15185 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_SUBMAP(curr_entry));
15186 			} else if (VME_OBJECT(curr_entry)) {
15187 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_OBJECT(curr_entry));
15188 			} else {
15189 				submap_info->object_id_full = 0ull;
15190 			}
15191 		}
15192 	} else {
15193 		short_info->external_pager = extended.external_pager;
15194 		short_info->shadow_depth = extended.shadow_depth;
15195 		short_info->share_mode = extended.share_mode;
15196 		short_info->ref_count = extended.ref_count;
15197 	}
15198 
15199 	if (not_in_kdp) {
15200 		vm_map_unlock_read(curr_map);
15201 	}
15202 
15203 	return KERN_SUCCESS;
15204 }
15205 
15206 /*
15207  *	vm_region:
15208  *
15209  *	User call to obtain information about a region in
 *	a task's address map. Several flavors are supported
 *	(basic, basic 64-bit, extended, and top info).
15212  *
15213  *	XXX The reserved and behavior fields cannot be filled
15214  *	    in until the vm merge from the IK is completed, and
15215  *	    vm_reserve is implemented.
15216  */
15217 
/*
 *	vm_map_region:
 *
 *	Look up the VM map entry containing "*address" (or, if "*address"
 *	falls in a hole, the next entry) and report information about it
 *	in the format selected by "flavor".
 *
 *	In/out parameters:
 *	  address - in: address to look up; out: start of the region found.
 *	  count   - in: size of the caller's info buffer, in natural_t units;
 *	            out: amount actually filled in.
 *	Out parameters:
 *	  size        - size of the region found.
 *	  info        - flavor-specific info structure, filled in.
 *	  object_name - when non-NULL, always set to IP_NULL (no memory
 *	                object port is returned by this interface).
 *
 *	Returns KERN_SUCCESS, KERN_INVALID_ARGUMENT (null map, unknown
 *	flavor, or caller's buffer too small), or KERN_INVALID_ADDRESS
 *	(no mapping at or above "*address").
 */
kern_return_t
vm_map_region(
	vm_map_t                 map,
	vm_map_offset_t *address,               /* IN/OUT */
	vm_map_size_t           *size,                  /* OUT */
	vm_region_flavor_t       flavor,                /* IN */
	vm_region_info_t         info,                  /* OUT */
	mach_msg_type_number_t  *count, /* IN/OUT */
	mach_port_t             *object_name)           /* OUT */
{
	vm_map_entry_t          tmp_entry;
	vm_map_entry_t          entry;
	vm_map_offset_t         start;

	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	switch (flavor) {
	case VM_REGION_BASIC_INFO:
		/* legacy for old 32-bit objects info */
	{
		vm_region_basic_info_t  basic;

		if (*count < VM_REGION_BASIC_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		basic = (vm_region_basic_info_t) info;
		*count = VM_REGION_BASIC_INFO_COUNT;

		vm_map_lock_read(map);

		start = *address;
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			/* "start" is in a hole: use the next entry, if any */
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}

		start = entry->vme_start;

		/* NOTE: offset is truncated to 32 bits for this legacy flavor */
		basic->offset = (uint32_t)VME_OFFSET(entry);
		basic->protection = entry->protection;
		basic->inheritance = entry->inheritance;
		basic->max_protection = entry->max_protection;
		basic->behavior = entry->behavior;
		basic->user_wired_count = entry->user_wired_count;
		basic->reserved = entry->is_sub_map;
		*address = start;
		*size = (entry->vme_end - start);

		if (object_name) {
			*object_name = IP_NULL;
		}
		if (entry->is_sub_map) {
			basic->shared = FALSE;
		} else {
			basic->shared = entry->is_shared;
		}

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}

	case VM_REGION_BASIC_INFO_64:
	{
		vm_region_basic_info_64_t       basic;

		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
			return KERN_INVALID_ARGUMENT;
		}

		basic = (vm_region_basic_info_64_t) info;
		*count = VM_REGION_BASIC_INFO_COUNT_64;

		vm_map_lock_read(map);

		start = *address;
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			/* "start" is in a hole: use the next entry, if any */
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}

		start = entry->vme_start;

		/* same as VM_REGION_BASIC_INFO but with a full 64-bit offset */
		basic->offset = VME_OFFSET(entry);
		basic->protection = entry->protection;
		basic->inheritance = entry->inheritance;
		basic->max_protection = entry->max_protection;
		basic->behavior = entry->behavior;
		basic->user_wired_count = entry->user_wired_count;
		basic->reserved = entry->is_sub_map;
		*address = start;
		*size = (entry->vme_end - start);

		if (object_name) {
			*object_name = IP_NULL;
		}
		if (entry->is_sub_map) {
			basic->shared = FALSE;
		} else {
			basic->shared = entry->is_shared;
		}

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}
	case VM_REGION_EXTENDED_INFO:
		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}
		/* falls into the legacy case; its smaller count check passes too */
		OS_FALLTHROUGH;
	case VM_REGION_EXTENDED_INFO__legacy:
		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
			return KERN_INVALID_ARGUMENT;
		}

		{
			vm_region_extended_info_t       extended;
			mach_msg_type_number_t original_count;
			int effective_page_size, effective_page_shift;

			extended = (vm_region_extended_info_t) info;

			effective_page_shift = vm_self_region_page_shift(map);
			effective_page_size = (1 << effective_page_shift);

			vm_map_lock_read(map);

			start = *address;
			if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
				/* "start" is in a hole: use the next entry, if any */
				if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
					vm_map_unlock_read(map);
					return KERN_INVALID_ADDRESS;
				}
			} else {
				entry = tmp_entry;
			}
			start = entry->vme_start;

			extended->protection = entry->protection;
			extended->user_tag = VME_ALIAS(entry);
			extended->pages_resident = 0;
			extended->pages_swapped_out = 0;
			extended->pages_shared_now_private = 0;
			extended->pages_dirtied = 0;
			extended->external_pager = 0;
			extended->shadow_depth = 0;

			original_count = *count;
			if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
				*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
			} else {
				/* "pages_reusable" only exists in the non-legacy layout */
				extended->pages_reusable = 0;
				*count = VM_REGION_EXTENDED_INFO_COUNT;
			}

			/* walk the object chain to fill in the page counters */
			vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);

			if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
				extended->share_mode = SM_PRIVATE;
			}

			if (object_name) {
				*object_name = IP_NULL;
			}
			*address = start;
			*size = (entry->vme_end - start);

			vm_map_unlock_read(map);
			return KERN_SUCCESS;
		}
	case VM_REGION_TOP_INFO:
	{
		vm_region_top_info_t    top;

		if (*count < VM_REGION_TOP_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		top = (vm_region_top_info_t) info;
		*count = VM_REGION_TOP_INFO_COUNT;

		vm_map_lock_read(map);

		start = *address;
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			/* "start" is in a hole: use the next entry, if any */
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}
		start = entry->vme_start;

		top->private_pages_resident = 0;
		top->shared_pages_resident = 0;

		vm_map_region_top_walk(entry, top);

		if (object_name) {
			*object_name = IP_NULL;
		}
		*address = start;
		*size = (entry->vme_end - start);

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}
	default:
		return KERN_INVALID_ARGUMENT;
	}
}
15440 
/*
 * Resident pages of "obj" attributable to a mapping spanning
 * "entry_size" pages: reusable pages are excluded (or, when the whole
 * object is marked all_reusable, only wired pages count), and the
 * result is capped at the size of the mapping.
 */
#define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
	MIN((entry_size),                                               \
	    ((obj)->all_reusable ?                                      \
	     (obj)->wired_page_count :                                  \
	     (obj)->resident_page_count - (obj)->reusable_page_count))
15446 
/*
 *	vm_map_region_top_walk:
 *
 *	Fill in "top" (VM_REGION_TOP_INFO) for the mapping described by
 *	"entry": private/shared resident page counts, share mode, reference
 *	count, and an obfuscated (and, on 64-bit kernels, truncated) object
 *	id for the entry's top-level VM object.
 */
void
vm_map_region_top_walk(
	vm_map_entry_t             entry,
	vm_region_top_info_t       top)
{
	/* submaps and unbacked entries have nothing to report */
	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
		top->share_mode = SM_EMPTY;
		top->ref_count = 0;
		top->obj_id = 0;
		return;
	}

	{
		struct  vm_object *obj, *tmp_obj;
		int             ref_count;
		uint32_t        entry_size;

		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);

		obj = VME_OBJECT(entry);

		vm_object_lock(obj);

		/* don't count a paging reference against the object */
		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}

		assert(obj->reusable_page_count <= obj->resident_page_count);
		if (obj->shadow) {
			/*
			 * Shadowed object: the top object's pages are private
			 * if we are its only reference, shared otherwise, and
			 * everything further down the chain counts as shared.
			 */
			if (ref_count == 1) {
				top->private_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			}
			top->ref_count  = ref_count;
			top->share_mode = SM_COW;

			/* walk the shadow chain, locking hand-over-hand */
			while ((tmp_obj = obj->shadow)) {
				vm_object_lock(tmp_obj);
				vm_object_unlock(obj);
				obj = tmp_obj;

				if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
					ref_count--;
				}

				assert(obj->reusable_page_count <= obj->resident_page_count);
				top->shared_pages_resident +=
				    OBJ_RESIDENT_COUNT(obj, entry_size);
				top->ref_count += ref_count - 1;
			}
		} else {
			if (entry->superpage_size) {
				top->share_mode = SM_LARGE_PAGE;
				top->shared_pages_resident = 0;
				top->private_pages_resident = entry_size;
			} else if (entry->needs_copy) {
				top->share_mode = SM_COW;
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				/*
				 * A "named" object holds one extra reference,
				 * so ref_count == 2 can still mean private.
				 */
				if (ref_count == 1 ||
				    (ref_count == 2 && obj->named)) {
					top->share_mode = SM_PRIVATE;
					top->private_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				} else {
					top->share_mode = SM_SHARED;
					top->shared_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				}
			}
			top->ref_count = ref_count;
		}
		/* XXX K64: obj_id will be truncated */
		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRPERM(obj);

		vm_object_unlock(obj);
	}
}
15531 
15532 void
vm_map_region_walk(vm_map_t map,vm_map_offset_t va,vm_map_entry_t entry,vm_object_offset_t offset,vm_object_size_t range,vm_region_extended_info_t extended,boolean_t look_for_pages,mach_msg_type_number_t count)15533 vm_map_region_walk(
15534 	vm_map_t                        map,
15535 	vm_map_offset_t                 va,
15536 	vm_map_entry_t                  entry,
15537 	vm_object_offset_t              offset,
15538 	vm_object_size_t                range,
15539 	vm_region_extended_info_t       extended,
15540 	boolean_t                       look_for_pages,
15541 	mach_msg_type_number_t count)
15542 {
15543 	struct vm_object *obj, *tmp_obj;
15544 	vm_map_offset_t       last_offset;
15545 	int               i;
15546 	int               ref_count;
15547 	struct vm_object        *shadow_object;
15548 	unsigned short          shadow_depth;
15549 	boolean_t         do_region_footprint;
15550 	int                     effective_page_size, effective_page_shift;
15551 	vm_map_offset_t         effective_page_mask;
15552 
15553 	do_region_footprint = task_self_region_footprint();
15554 
15555 	if ((entry->is_sub_map) ||
15556 	    (VME_OBJECT(entry) == 0) ||
15557 	    (VME_OBJECT(entry)->phys_contiguous &&
15558 	    !entry->superpage_size)) {
15559 		extended->share_mode = SM_EMPTY;
15560 		extended->ref_count = 0;
15561 		return;
15562 	}
15563 
15564 	if (entry->superpage_size) {
15565 		extended->shadow_depth = 0;
15566 		extended->share_mode = SM_LARGE_PAGE;
15567 		extended->ref_count = 1;
15568 		extended->external_pager = 0;
15569 
15570 		/* TODO4K: Superpage in 4k mode? */
15571 		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15572 		extended->shadow_depth = 0;
15573 		return;
15574 	}
15575 
15576 	effective_page_shift = vm_self_region_page_shift(map);
15577 	effective_page_size = (1 << effective_page_shift);
15578 	effective_page_mask = effective_page_size - 1;
15579 
15580 	offset = vm_map_trunc_page(offset, effective_page_mask);
15581 
15582 	obj = VME_OBJECT(entry);
15583 
15584 	vm_object_lock(obj);
15585 
15586 	if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15587 		ref_count--;
15588 	}
15589 
15590 	if (look_for_pages) {
15591 		for (last_offset = offset + range;
15592 		    offset < last_offset;
15593 		    offset += effective_page_size, va += effective_page_size) {
15594 			if (do_region_footprint) {
15595 				int disp;
15596 
15597 				disp = 0;
15598 				if (map->has_corpse_footprint) {
15599 					/*
15600 					 * Query the page info data we saved
15601 					 * while forking the corpse.
15602 					 */
15603 					vm_map_corpse_footprint_query_page_info(
15604 						map,
15605 						va,
15606 						&disp);
15607 				} else {
15608 					/*
15609 					 * Query the pmap.
15610 					 */
15611 					vm_map_footprint_query_page_info(
15612 						map,
15613 						entry,
15614 						va,
15615 						&disp);
15616 				}
15617 				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15618 					extended->pages_resident++;
15619 				}
15620 				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15621 					extended->pages_reusable++;
15622 				}
15623 				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15624 					extended->pages_dirtied++;
15625 				}
15626 				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15627 					extended->pages_swapped_out++;
15628 				}
15629 				continue;
15630 			}
15631 
15632 			vm_map_region_look_for_page(map, va, obj,
15633 			    vm_object_trunc_page(offset), ref_count,
15634 			    0, extended, count);
15635 		}
15636 
15637 		if (do_region_footprint) {
15638 			goto collect_object_info;
15639 		}
15640 	} else {
15641 collect_object_info:
15642 		shadow_object = obj->shadow;
15643 		shadow_depth = 0;
15644 
15645 		if (!(obj->internal)) {
15646 			extended->external_pager = 1;
15647 		}
15648 
15649 		if (shadow_object != VM_OBJECT_NULL) {
15650 			vm_object_lock(shadow_object);
15651 			for (;
15652 			    shadow_object != VM_OBJECT_NULL;
15653 			    shadow_depth++) {
15654 				vm_object_t     next_shadow;
15655 
15656 				if (!(shadow_object->internal)) {
15657 					extended->external_pager = 1;
15658 				}
15659 
15660 				next_shadow = shadow_object->shadow;
15661 				if (next_shadow) {
15662 					vm_object_lock(next_shadow);
15663 				}
15664 				vm_object_unlock(shadow_object);
15665 				shadow_object = next_shadow;
15666 			}
15667 		}
15668 		extended->shadow_depth = shadow_depth;
15669 	}
15670 
15671 	if (extended->shadow_depth || entry->needs_copy) {
15672 		extended->share_mode = SM_COW;
15673 	} else {
15674 		if (ref_count == 1) {
15675 			extended->share_mode = SM_PRIVATE;
15676 		} else {
15677 			if (obj->true_share) {
15678 				extended->share_mode = SM_TRUESHARED;
15679 			} else {
15680 				extended->share_mode = SM_SHARED;
15681 			}
15682 		}
15683 	}
15684 	extended->ref_count = ref_count - extended->shadow_depth;
15685 
15686 	for (i = 0; i < extended->shadow_depth; i++) {
15687 		if ((tmp_obj = obj->shadow) == 0) {
15688 			break;
15689 		}
15690 		vm_object_lock(tmp_obj);
15691 		vm_object_unlock(obj);
15692 
15693 		if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
15694 			ref_count--;
15695 		}
15696 
15697 		extended->ref_count += ref_count;
15698 		obj = tmp_obj;
15699 	}
15700 	vm_object_unlock(obj);
15701 
15702 	if (extended->share_mode == SM_SHARED) {
15703 		vm_map_entry_t       cur;
15704 		vm_map_entry_t       last;
15705 		int      my_refs;
15706 
15707 		obj = VME_OBJECT(entry);
15708 		last = vm_map_to_entry(map);
15709 		my_refs = 0;
15710 
15711 		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15712 			ref_count--;
15713 		}
15714 		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
15715 			my_refs += vm_map_region_count_obj_refs(cur, obj);
15716 		}
15717 
15718 		if (my_refs == ref_count) {
15719 			extended->share_mode = SM_PRIVATE_ALIASED;
15720 		} else if (my_refs > 1) {
15721 			extended->share_mode = SM_SHARED_ALIASED;
15722 		}
15723 	}
15724 }
15725 
15726 
15727 /* object is locked on entry and locked on return */
15728 
15729 
/*
 *	vm_map_region_look_for_page:
 *
 *	Search "object" and its shadow chain for the page at "offset" and
 *	update the page counters in "extended": pages_resident and
 *	pages_dirtied/pages_reusable when a page is found, or
 *	pages_swapped_out when the compressor holds it.  Also tracks the
 *	deepest shadow level visited in extended->shadow_depth.
 *
 *	"object" is locked on entry and still locked on return; shadow
 *	objects are locked/unlocked hand-over-hand while descending.
 */
static void
vm_map_region_look_for_page(
	__unused vm_map_t               map,
	__unused vm_map_offset_t        va,
	vm_object_t                     object,
	vm_object_offset_t              offset,
	int                             max_refcnt,
	unsigned short                  depth,
	vm_region_extended_info_t       extended,
	mach_msg_type_number_t count)
{
	vm_page_t       p;
	vm_object_t     shadow;
	int             ref_count;
	vm_object_t     caller_object;

	shadow = object->shadow;
	caller_object = object;


	while (TRUE) {
		if (!(object->internal)) {
			extended->external_pager = 1;
		}

		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
			/*
			 * A shadowed page with only one reference so far
			 * counts as "shared now private".
			 */
			if (shadow && (max_refcnt == 1)) {
				extended->pages_shared_now_private++;
			}

			if (!p->vmp_fictitious &&
			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
				extended->pages_dirtied++;
			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
				/* only the non-legacy layout has pages_reusable */
				if (p->vmp_reusable || object->all_reusable) {
					extended->pages_reusable++;
				}
			}

			extended->pages_resident++;

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			return;
		}
		if (object->internal &&
		    object->alive &&
		    !object->terminating &&
		    object->pager_ready) {
			if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
			    == VM_EXTERNAL_STATE_EXISTS) {
				/* the pager has that page */
				extended->pages_swapped_out++;
				if (object != caller_object) {
					vm_object_unlock(object);
				}
				return;
			}
		}

		if (shadow) {
			/* descend one level, hand-over-hand locking */
			vm_object_lock(shadow);

			if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
				ref_count--;
			}

			if (++depth > extended->shadow_depth) {
				extended->shadow_depth = depth;
			}

			if (ref_count > max_refcnt) {
				max_refcnt = ref_count;
			}

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			offset = offset + object->vo_shadow_offset;
			object = shadow;
			shadow = object->shadow;
			continue;
		}
		/* bottom of the chain: page is nowhere to be found */
		if (object != caller_object) {
			vm_object_unlock(object);
		}
		break;
	}
}
15822 
15823 static int
vm_map_region_count_obj_refs(vm_map_entry_t entry,vm_object_t object)15824 vm_map_region_count_obj_refs(
15825 	vm_map_entry_t    entry,
15826 	vm_object_t       object)
15827 {
15828 	int ref_count;
15829 	vm_object_t chk_obj;
15830 	vm_object_t tmp_obj;
15831 
15832 	if (entry->is_sub_map || VME_OBJECT(entry) == VM_OBJECT_NULL) {
15833 		return 0;
15834 	}
15835 
15836 	ref_count = 0;
15837 	chk_obj = VME_OBJECT(entry);
15838 	vm_object_lock(chk_obj);
15839 
15840 	while (chk_obj) {
15841 		if (chk_obj == object) {
15842 			ref_count++;
15843 		}
15844 		tmp_obj = chk_obj->shadow;
15845 		if (tmp_obj) {
15846 			vm_object_lock(tmp_obj);
15847 		}
15848 		vm_object_unlock(chk_obj);
15849 
15850 		chk_obj = tmp_obj;
15851 	}
15852 
15853 	return ref_count;
15854 }
15855 
15856 
15857 /*
15858  *	Routine:	vm_map_simplify
15859  *
15860  *	Description:
15861  *		Attempt to simplify the map representation in
15862  *		the vicinity of the given starting address.
15863  *	Note:
15864  *		This routine is intended primarily to keep the
15865  *		kernel maps more compact -- they generally don't
15866  *		benefit from the "expand a map entry" technology
15867  *		at allocation time because the adjacent entry
15868  *		is often wired down.
15869  */
/*
 *	vm_map_simplify_entry:
 *
 *	Try to coalesce "this_entry" with the entry immediately preceding
 *	it.  The merge happens only if the two entries are virtually
 *	contiguous, map adjacent offsets of the same object, and agree on
 *	every attribute below.  On success the previous entry is unlinked
 *	and disposed of, and "this_entry" is extended downward to cover
 *	both ranges.  Caller must hold the map lock for writing.
 */
void
vm_map_simplify_entry(
	vm_map_t        map,
	vm_map_entry_t  this_entry)
{
	vm_map_entry_t  prev_entry;

	prev_entry = this_entry->vme_prev;

	if ((this_entry != vm_map_to_entry(map)) &&
	    (prev_entry != vm_map_to_entry(map)) &&

	    /* virtually adjacent... */
	    (prev_entry->vme_end == this_entry->vme_start) &&

	    /* ...backed by the same object at contiguous offsets... */
	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
	    prev_entry->vme_start))
	    == VME_OFFSET(this_entry)) &&

	    /* ...with identical attributes and flags... */
	    (prev_entry->behavior == this_entry->behavior) &&
	    (prev_entry->needs_copy == this_entry->needs_copy) &&
	    (prev_entry->protection == this_entry->protection) &&
	    (prev_entry->max_protection == this_entry->max_protection) &&
	    (prev_entry->inheritance == this_entry->inheritance) &&
	    (prev_entry->use_pmap == this_entry->use_pmap) &&
	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
	    (prev_entry->no_cache == this_entry->no_cache) &&
	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
	    (prev_entry->map_aligned == this_entry->map_aligned) &&
	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
#if __arm64e__
	    (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
#endif
	    (prev_entry->csm_associated == this_entry->csm_associated) &&
	    (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
	    (prev_entry->vme_resilient_codesign ==
	    this_entry->vme_resilient_codesign) &&
	    (prev_entry->vme_resilient_media ==
	    this_entry->vme_resilient_media) &&
	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
	    (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&

	    /* ...matching wiring... */
	    (prev_entry->wired_count == this_entry->wired_count) &&
	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&

	    /* ...and neither entry in a state that forbids merging. */
	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
	    (prev_entry->in_transition == FALSE) &&
	    (this_entry->in_transition == FALSE) &&
	    (prev_entry->needs_wakeup == FALSE) &&
	    (this_entry->needs_wakeup == FALSE) &&
	    (prev_entry->is_shared == this_entry->is_shared) &&
	    (prev_entry->superpage_size == FALSE) &&
	    (this_entry->superpage_size == FALSE)
	    ) {
		/* clear vme_permanent so the unlink below is allowed */
		if (prev_entry->vme_permanent) {
			assert(this_entry->vme_permanent);
			prev_entry->vme_permanent = false;
		}
		vm_map_store_entry_unlink(map, prev_entry, true);
		assert(prev_entry->vme_start < this_entry->vme_end);
		if (prev_entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
			    VM_MAP_PAGE_MASK(map)));
		}
		/* grow "this_entry" downward over the merged range */
		this_entry->vme_start = prev_entry->vme_start;
		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));

		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, this_entry, TRUE);
		}

		/* drop the merged entry's reference on its backing */
		if (prev_entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(prev_entry));
		} else {
			vm_object_deallocate(VME_OBJECT(prev_entry));
		}
		vm_map_entry_dispose(prev_entry);
		SAVE_HINT_MAP_WRITE(map, this_entry);
	}
}
15954 
15955 void
vm_map_simplify(vm_map_t map,vm_map_offset_t start)15956 vm_map_simplify(
15957 	vm_map_t        map,
15958 	vm_map_offset_t start)
15959 {
15960 	vm_map_entry_t  this_entry;
15961 
15962 	vm_map_lock(map);
15963 	if (vm_map_lookup_entry(map, start, &this_entry)) {
15964 		vm_map_simplify_entry(map, this_entry);
15965 		vm_map_simplify_entry(map, this_entry->vme_next);
15966 	}
15967 	vm_map_unlock(map);
15968 }
15969 
/*
 *	Routine:	vm_map_simplify_range
 *	Purpose:
 *		Attempt to coalesce every map entry overlapping the
 *		(page-rounded) range [start, end).  Unlike
 *		vm_map_simplify(), no locking is done here: the caller
 *		must already hold the map lock for writing.
 */
15970 static void
vm_map_simplify_range(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15971 vm_map_simplify_range(
15972 	vm_map_t        map,
15973 	vm_map_offset_t start,
15974 	vm_map_offset_t end)
15975 {
15976 	vm_map_entry_t  entry;
15977 
15978 	/*
15979 	 * The map should be locked (for "write") by the caller.
15980 	 */
15981 
15982 	if (start >= end) {
15983 		/* invalid address range */
15984 		return;
15985 	}
15986 
	/* widen the range to whole map pages */
15987 	start = vm_map_trunc_page(start,
15988 	    VM_MAP_PAGE_MASK(map));
15989 	end = vm_map_round_page(end,
15990 	    VM_MAP_PAGE_MASK(map));
15991 
15992 	if (!vm_map_lookup_entry(map, start, &entry)) {
15993 		/* "start" is not mapped and "entry" ends before "start" */
15994 		if (entry == vm_map_to_entry(map)) {
15995 			/* start with first entry in the map */
15996 			entry = vm_map_first_entry(map);
15997 		} else {
15998 			/* start with next entry */
15999 			entry = entry->vme_next;
16000 		}
16001 	}
16002 
	/*
	 * "<=" (rather than "<") so the entry starting exactly at "end"
	 * also gets a chance to coalesce with its predecessor.
	 */
16003 	while (entry != vm_map_to_entry(map) &&
16004 	    entry->vme_start <= end) {
16005 		/* try and coalesce "entry" with its previous entry */
16006 		vm_map_simplify_entry(map, entry);
16007 		entry = entry->vme_next;
16008 	}
16009 }
16010 
16011 
16012 /*
16013  *	Routine:	vm_map_machine_attribute
16014  *	Purpose:
16015  *		Provide machine-specific attributes to mappings,
16016  *		such as cachability etc. for machines that provide
16017  *		them.  NUMA architectures and machines with big/strange
16018  *		caches will use this.
16019  *	Note:
16020  *		Responsibilities for locking and checking are handled here,
16021  *		everything else in the pmap module. If any non-volatile
16022  *		information must be kept, the pmap module should handle
16023  *		it itself. [This assumes that attributes do not
16024  *		need to be inherited, which seems ok to me]
16025  */
16026 kern_return_t
vm_map_machine_attribute(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_machine_attribute_t attribute,vm_machine_attribute_val_t * value)16027 vm_map_machine_attribute(
16028 	vm_map_t                        map,
16029 	vm_map_offset_t         start,
16030 	vm_map_offset_t         end,
16031 	vm_machine_attribute_t  attribute,
16032 	vm_machine_attribute_val_t* value)              /* IN/OUT */
16033 {
16034 	kern_return_t   ret;
16035 	vm_map_size_t sync_size;
16036 	vm_map_entry_t entry;
16037 
16038 	if (start < vm_map_min(map) || end > vm_map_max(map)) {
16039 		return KERN_INVALID_ADDRESS;
16040 	}
16041 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16042 		return KERN_INVALID_ADDRESS;
16043 	}
16044 
16045 	/* Figure how much memory we need to flush (in page increments) */
16046 	sync_size = end - start;
16047 
16048 	vm_map_lock(map);
16049 
16050 	if (attribute != MATTR_CACHE) {
16051 		/* If we don't have to find physical addresses, we */
16052 		/* don't have to do an explicit traversal here.    */
16053 		ret = pmap_attribute(map->pmap, start, end - start,
16054 		    attribute, value);
16055 		vm_map_unlock(map);
16056 		return ret;
16057 	}
16058 
	/*
	 * MATTR_CACHE: walk the range entry by entry so the cache-sync
	 * can be applied to each resident physical page.
	 */
16059 	ret = KERN_SUCCESS;                                                                             /* Assume it all worked */
16060 
16061 	while (sync_size) {
16062 		if (vm_map_lookup_entry(map, start, &entry)) {
16063 			vm_map_size_t   sub_size;
			/* clamp this iteration to the end of the entry or the range */
16064 			if ((entry->vme_end - start) > sync_size) {
16065 				sub_size = sync_size;
16066 				sync_size = 0;
16067 			} else {
16068 				sub_size = entry->vme_end - start;
16069 				sync_size -= sub_size;
16070 			}
16071 			if (entry->is_sub_map) {
16072 				vm_map_offset_t sub_start;
16073 				vm_map_offset_t sub_end;
16074 
16075 				sub_start = (start - entry->vme_start)
16076 				    + VME_OFFSET(entry);
16077 				sub_end = sub_start + sub_size;
				/*
				 * NOTE(review): the recursion's return value is
				 * discarded here — confirm this is intentional.
				 */
16078 				vm_map_machine_attribute(
16079 					VME_SUBMAP(entry),
16080 					sub_start,
16081 					sub_end,
16082 					attribute, value);
16083 			} else if (VME_OBJECT(entry)) {
16084 				vm_page_t               m;
16085 				vm_object_t             object;
16086 				vm_object_t             base_object;
16087 				vm_object_t             last_object;
16088 				vm_object_offset_t      offset;
16089 				vm_object_offset_t      base_offset;
16090 				vm_map_size_t           range;
16091 				range = sub_size;
16092 				offset = (start - entry->vme_start)
16093 				    + VME_OFFSET(entry);
16094 				offset = vm_object_trunc_page(offset);
16095 				base_offset = offset;
16096 				object = VME_OBJECT(entry);
16097 				base_object = object;
16098 				last_object = NULL;
16099 
16100 				vm_object_lock(object);
16101 
16102 				while (range) {
16103 					m = vm_page_lookup(
16104 						object, offset);
16105 
16106 					if (m && !m->vmp_fictitious) {
16107 						ret =
16108 						    pmap_attribute_cache_sync(
16109 							VM_PAGE_GET_PHYS_PAGE(m),
16110 							PAGE_SIZE,
16111 							attribute, value);
16112 					} else if (object->shadow) {
						/*
						 * Page not resident in this object:
						 * descend the shadow chain, taking
						 * the shadow's lock before dropping
						 * the current one (lock coupling).
						 */
16113 						offset = offset + object->vo_shadow_offset;
16114 						last_object = object;
16115 						object = object->shadow;
16116 						vm_object_lock(last_object->shadow);
16117 						vm_object_unlock(last_object);
16118 						continue;
16119 					}
16120 					if (range < PAGE_SIZE) {
16121 						range = 0;
16122 					} else {
16123 						range -= PAGE_SIZE;
16124 					}
16125 
					/*
					 * Climb back to the top of the shadow
					 * chain before moving to the next page.
					 */
16126 					if (base_object != object) {
16127 						vm_object_unlock(object);
16128 						vm_object_lock(base_object);
16129 						object = base_object;
16130 					}
16131 					/* Bump to the next page */
16132 					base_offset += PAGE_SIZE;
16133 					offset = base_offset;
16134 				}
16135 				vm_object_unlock(object);
16136 			}
16137 			start += sub_size;
16138 		} else {
			/* hole in the address range */
16139 			vm_map_unlock(map);
16140 			return KERN_FAILURE;
16141 		}
16142 	}
16143 
16144 	vm_map_unlock(map);
16145 
16146 	return ret;
16147 }
16148 
16149 /*
16150  *	vm_map_behavior_set:
16151  *
16152  *	Sets the paging reference behavior of the specified address
16153  *	range in the target map.  Paging reference behavior affects
16154  *	how pagein operations resulting from faults on the map will be
16155  *	clustered.
16156  */
16157 kern_return_t
vm_map_behavior_set(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_behavior_t new_behavior)16158 vm_map_behavior_set(
16159 	vm_map_t        map,
16160 	vm_map_offset_t start,
16161 	vm_map_offset_t end,
16162 	vm_behavior_t   new_behavior)
16163 {
16164 	vm_map_entry_t  entry;
16165 	vm_map_entry_t  temp_entry;
16166 
	/* note: a bad range here reports KERN_NO_SPACE, not KERN_INVALID_ADDRESS */
16167 	if (start > end ||
16168 	    start < vm_map_min(map) ||
16169 	    end > vm_map_max(map)) {
16170 		return KERN_NO_SPACE;
16171 	}
16172 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16173 		return KERN_INVALID_ADDRESS;
16174 	}
16175 
16176 	switch (new_behavior) {
16177 	/*
16178 	 * This first block of behaviors all set a persistent state on the specified
16179 	 * memory range.  All we have to do here is to record the desired behavior
16180 	 * in the vm_map_entry_t's.
16181 	 */
16182 
16183 	case VM_BEHAVIOR_DEFAULT:
16184 	case VM_BEHAVIOR_RANDOM:
16185 	case VM_BEHAVIOR_SEQUENTIAL:
16186 	case VM_BEHAVIOR_RSEQNTL:
16187 	case VM_BEHAVIOR_ZERO_WIRED_PAGES:
16188 		vm_map_lock(map);
16189 
16190 		/*
16191 		 *	The entire address range must be valid for the map.
16192 		 *      Note that vm_map_range_check() does a
16193 		 *	vm_map_lookup_entry() internally and returns the
16194 		 *	entry containing the start of the address range if
16195 		 *	the entire range is valid.
16196 		 */
16197 		if (vm_map_range_check(map, start, end, &temp_entry)) {
16198 			entry = temp_entry;
16199 			vm_map_clip_start(map, entry, start);
16200 		} else {
16201 			vm_map_unlock(map);
16202 			return KERN_INVALID_ADDRESS;
16203 		}
16204 
		/* clip entries so the state applies to exactly [start, end) */
16205 		while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
16206 			vm_map_clip_end(map, entry, end);
16207 			if (entry->is_sub_map) {
16208 				assert(!entry->use_pmap);
16209 			}
16210 
16211 			if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16212 				entry->zero_wired_pages = TRUE;
16213 			} else {
16214 				entry->behavior = new_behavior;
16215 			}
16216 			entry = entry->vme_next;
16217 		}
16218 
16219 		vm_map_unlock(map);
16220 		break;
16221 
16222 	/*
16223 	 * The rest of these are different from the above in that they cause
16224 	 * an immediate action to take place as opposed to setting a behavior that
16225 	 * affects future actions.
16226 	 */
16227 
16228 	case VM_BEHAVIOR_WILLNEED:
16229 		return vm_map_willneed(map, start, end);
16230 
16231 	case VM_BEHAVIOR_DONTNEED:
16232 		return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16233 
16234 	case VM_BEHAVIOR_FREE:
16235 		return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16236 
16237 	case VM_BEHAVIOR_REUSABLE:
16238 		return vm_map_reusable_pages(map, start, end);
16239 
16240 	case VM_BEHAVIOR_REUSE:
16241 		return vm_map_reuse_pages(map, start, end);
16242 
16243 	case VM_BEHAVIOR_CAN_REUSE:
16244 		return vm_map_can_reuse(map, start, end);
16245 
16246 #if MACH_ASSERT
16247 	case VM_BEHAVIOR_PAGEOUT:
16248 		return vm_map_pageout(map, start, end);
16249 #endif /* MACH_ASSERT */
16250 
16251 	default:
16252 		return KERN_INVALID_ARGUMENT;
16253 	}
16254 
16255 	return KERN_SUCCESS;
16256 }
16257 
16258 
16259 /*
16260  * Internals for madvise(MADV_WILLNEED) system call.
16261  *
16262  * The implementation is to do:-
16263  * a) read-ahead if the mapping corresponds to a mapped regular file
16264  * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping
16265  */
16266 
16267 
16268 static kern_return_t
vm_map_willneed(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16269 vm_map_willneed(
16270 	vm_map_t        map,
16271 	vm_map_offset_t start,
16272 	vm_map_offset_t end
16273 	)
16274 {
16275 	vm_map_entry_t                  entry;
16276 	vm_object_t                     object;
16277 	memory_object_t                 pager;
16278 	struct vm_object_fault_info     fault_info = {};
16279 	kern_return_t                   kr;
16280 	vm_object_size_t                len;
16281 	vm_object_offset_t              offset;
16282 
16283 	fault_info.interruptible = THREAD_UNINT;        /* ignored value */
16284 	fault_info.behavior      = VM_BEHAVIOR_SEQUENTIAL;
16285 	fault_info.stealth       = TRUE;
16286 
16287 	/*
16288 	 * The MADV_WILLNEED operation doesn't require any changes to the
16289 	 * vm_map_entry_t's, so the read lock is sufficient.
16290 	 */
16291 
16292 	vm_map_lock_read(map);
16293 
16294 	/*
16295 	 * The madvise semantics require that the address range be fully
16296 	 * allocated with no holes.  Otherwise, we're required to return
16297 	 * an error.
16298 	 */
16299 
16300 	if (!vm_map_range_check(map, start, end, &entry)) {
16301 		vm_map_unlock_read(map);
16302 		return KERN_INVALID_ADDRESS;
16303 	}
16304 
16305 	/*
16306 	 * Examine each vm_map_entry_t in the range.
16307 	 */
16308 	for (; entry != vm_map_to_entry(map) && start < end;) {
16309 		/*
16310 		 * The first time through, the start address could be anywhere
16311 		 * within the vm_map_entry we found.  So adjust the offset to
16312 		 * correspond.  After that, the offset will always be zero to
16313 		 * correspond to the beginning of the current vm_map_entry.
16314 		 */
16315 		offset = (start - entry->vme_start) + VME_OFFSET(entry);
16316 
16317 		/*
16318 		 * Set the length so we don't go beyond the end of the
16319 		 * map_entry or beyond the end of the range we were given.
16320 		 * This range could span also multiple map entries all of which
16321 		 * map different files, so make sure we only do the right amount
16322 		 * of I/O for each object.  Note that it's possible for there
16323 		 * to be multiple map entries all referring to the same object
16324 		 * but with different page permissions, but it's not worth
16325 		 * trying to optimize that case.
16326 		 */
16327 		len = MIN(entry->vme_end - start, end - start);
16328 
16329 		if ((vm_size_t) len != len) {
16330 			/* 32-bit overflow */
16331 			len = (vm_size_t) (0 - PAGE_SIZE);
16332 		}
16333 		fault_info.cluster_size = (vm_size_t) len;
16334 		fault_info.lo_offset    = offset;
16335 		fault_info.hi_offset    = offset + len;
16336 		fault_info.user_tag     = VME_ALIAS(entry);
16337 		fault_info.pmap_options = 0;
16338 		if (entry->iokit_acct ||
16339 		    (!entry->is_sub_map && !entry->use_pmap)) {
16340 			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
16341 		}
16342 		fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug;
16343 
16344 		/*
16345 		 * If the entry is a submap OR there's no read permission
16346 		 * to this mapping, then just skip it.
16347 		 */
16348 		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
16349 			entry = entry->vme_next;
			/*
			 * NOTE(review): if this was the last map entry, "entry"
			 * is now the map header and the loop terminates at the
			 * next check; confirm reading the header's vme_start
			 * here is benign.
			 */
16350 			start = entry->vme_start;
16351 			continue;
16352 		}
16353 
16354 		object = VME_OBJECT(entry);
16355 
16356 		if (object == NULL ||
16357 		    (object && object->internal)) {
16358 			/*
16359 			 * Memory range backed by anonymous memory.
16360 			 */
16361 			vm_size_t region_size = 0, effective_page_size = 0;
16362 			vm_map_offset_t addr = 0, effective_page_mask = 0;
16363 
16364 			region_size = len;
16365 			addr = start;
16366 
16367 			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
16368 			effective_page_size = effective_page_mask + 1;
16369 
			/* drop the map lock: vm_pre_fault() faults pages in without it */
16370 			vm_map_unlock_read(map);
16371 
16372 			while (region_size) {
16373 				vm_pre_fault(
16374 					vm_map_trunc_page(addr, effective_page_mask),
16375 					VM_PROT_READ | VM_PROT_WRITE);
16376 
16377 				region_size -= effective_page_size;
16378 				addr += effective_page_size;
16379 			}
16380 		} else {
16381 			/*
16382 			 * Find the file object backing this map entry.  If there is
16383 			 * none, then we simply ignore the "will need" advice for this
16384 			 * entry and go on to the next one.
16385 			 */
16386 			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
16387 				entry = entry->vme_next;
16388 				start = entry->vme_start;
16389 				continue;
16390 			}
16391 
16392 			vm_object_paging_begin(object);
16393 			pager = object->pager;
16394 			vm_object_unlock(object);
16395 
16396 			/*
16397 			 * The data_request() could take a long time, so let's
16398 			 * release the map lock to avoid blocking other threads.
16399 			 */
16400 			vm_map_unlock_read(map);
16401 
16402 			/*
16403 			 * Get the data from the object asynchronously.
16404 			 *
16405 			 * Note that memory_object_data_request() places limits on the
16406 			 * amount of I/O it will do.  Regardless of the len we
16407 			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
16408 			 * silently truncates the len to that size.  This isn't
16409 			 * necessarily bad since madvise shouldn't really be used to
16410 			 * page in unlimited amounts of data.  Other Unix variants
16411 			 * limit the willneed case as well.  If this turns out to be an
16412 			 * issue for developers, then we can always adjust the policy
16413 			 * here and still be backwards compatible since this is all
16414 			 * just "advice".
16415 			 */
16416 			kr = memory_object_data_request(
16417 				pager,
16418 				vm_object_trunc_page(offset) + object->paging_offset,
16419 				0,      /* ignored */
16420 				VM_PROT_READ,
16421 				(memory_object_fault_info_t)&fault_info);
16422 
16423 			vm_object_lock(object);
16424 			vm_object_paging_end(object);
16425 			vm_object_unlock(object);
16426 
16427 			/*
16428 			 * If we couldn't do the I/O for some reason, just give up on
16429 			 * the madvise.  We still return success to the user since
16430 			 * madvise isn't supposed to fail when the advice can't be
16431 			 * taken.
16432 			 */
16433 
16434 			if (kr != KERN_SUCCESS) {
16435 				return KERN_SUCCESS;
16436 			}
16437 		}
16438 
		/* the map lock was dropped above; re-take it for the next entry */
16439 		start += len;
16440 		if (start >= end) {
16441 			/* done */
16442 			return KERN_SUCCESS;
16443 		}
16444 
16445 		/* look up next entry */
16446 		vm_map_lock_read(map);
16447 		if (!vm_map_lookup_entry(map, start, &entry)) {
16448 			/*
16449 			 * There's a new hole in the address range.
16450 			 */
16451 			vm_map_unlock_read(map);
16452 			return KERN_INVALID_ADDRESS;
16453 		}
16454 	}
16455 
16456 	vm_map_unlock_read(map);
16457 	return KERN_SUCCESS;
16458 }
16459 
/*
 *	Routine:	vm_map_entry_is_reusable
 *	Purpose:
 *		Decide whether it is safe to apply the "reusable"/"reuse"
 *		madvise-style operations below to this entry.  Regions not
 *		tagged as malloc memory are left to the caller's judgement
 *		(return TRUE); malloc regions must still be in their
 *		original unwired, default-protection state, and their
 *		backing object (if any) must be an unshared, internal,
 *		non-purgeable, non-code-signed object.
 */
16460 static boolean_t
vm_map_entry_is_reusable(vm_map_entry_t entry)16461 vm_map_entry_is_reusable(
16462 	vm_map_entry_t entry)
16463 {
16464 	/* Only user map entries */
16465 
16466 	vm_object_t object;
16467 
16468 	if (entry->is_sub_map) {
16469 		return FALSE;
16470 	}
16471 
16472 	switch (VME_ALIAS(entry)) {
16473 	case VM_MEMORY_MALLOC:
16474 	case VM_MEMORY_MALLOC_SMALL:
16475 	case VM_MEMORY_MALLOC_LARGE:
16476 	case VM_MEMORY_REALLOC:
16477 	case VM_MEMORY_MALLOC_TINY:
16478 	case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16479 	case VM_MEMORY_MALLOC_LARGE_REUSED:
16480 		/*
16481 		 * This is a malloc() memory region: check if it's still
16482 		 * in its original state and can be re-used for more
16483 		 * malloc() allocations.
16484 		 */
16485 		break;
16486 	default:
16487 		/*
16488 		 * Not a malloc() memory region: let the caller decide if
16489 		 * it's re-usable.
16490 		 */
16491 		return TRUE;
16492 	}
16493 
16494 	if (/*entry->is_shared ||*/
16495 		entry->is_sub_map ||
16496 		entry->in_transition ||
16497 		entry->protection != VM_PROT_DEFAULT ||
16498 		entry->max_protection != VM_PROT_ALL ||
16499 		entry->inheritance != VM_INHERIT_DEFAULT ||
16500 		entry->no_cache ||
16501 		entry->vme_permanent ||
16502 		entry->superpage_size != FALSE ||
16503 		entry->zero_wired_pages ||
16504 		entry->wired_count != 0 ||
16505 		entry->user_wired_count != 0) {
16506 		return FALSE;
16507 	}
16508 
16509 	object = VME_OBJECT(entry);
16510 	if (object == VM_OBJECT_NULL) {
16511 		return TRUE;
16512 	}
16513 	if (
16514 #if 0
16515 		/*
16516 		 * Let's proceed even if the VM object is potentially
16517 		 * shared.
16518 		 * We check for this later when processing the actual
16519 		 * VM pages, so the contents will be safe if shared.
16520 		 *
16521 		 * But we can still mark this memory region as "reusable" to
16522 		 * acknowledge that the caller did let us know that the memory
16523 		 * could be re-used and should not be penalized for holding
16524 		 * on to it.  This allows its "resident size" to not include
16525 		 * the reusable range.
16526 		 */
16527 		object->ref_count == 1 &&
16528 #endif
16529 		object->vo_copy == VM_OBJECT_NULL &&
16530 		object->shadow == VM_OBJECT_NULL &&
16531 		object->internal &&
16532 		object->purgable == VM_PURGABLE_DENY &&
16533 		object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16534 		!object->code_signed) {
16535 		return TRUE;
16536 	}
16537 	return FALSE;
16538 }
16539 
/*
 *	Routine:	vm_map_reuse_pages
 *	Purpose:
 *		Back end for vm_map_behavior_set(VM_BEHAVIOR_REUSE): mark
 *		the pages backing [start, end) as re-used, undoing a prior
 *		"reusable" marking, and retag MALLOC_LARGE_REUSABLE entries
 *		as MALLOC_LARGE_REUSED.  Fails with KERN_INVALID_ADDRESS
 *		if the range has holes or contains a non-reusable entry.
 */
16540 static kern_return_t
vm_map_reuse_pages(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16541 vm_map_reuse_pages(
16542 	vm_map_t        map,
16543 	vm_map_offset_t start,
16544 	vm_map_offset_t end)
16545 {
16546 	vm_map_entry_t                  entry;
16547 	vm_object_t                     object;
16548 	vm_object_offset_t              start_offset, end_offset;
16549 
16550 	/*
16551 	 * The MADV_REUSE operation doesn't require any changes to the
16552 	 * vm_map_entry_t's, so the read lock is sufficient.
16553 	 */
16554 
16555 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16556 		/*
16557 		 * XXX TODO4K
16558 		 * need to figure out what reusable means for a
16559 		 * portion of a native page.
16560 		 */
16561 		return KERN_SUCCESS;
16562 	}
16563 
16564 	vm_map_lock_read(map);
16565 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16566 
16567 	/*
16568 	 * The madvise semantics require that the address range be fully
16569 	 * allocated with no holes.  Otherwise, we're required to return
16570 	 * an error.
16571 	 */
16572 
16573 	if (!vm_map_range_check(map, start, end, &entry)) {
16574 		vm_map_unlock_read(map);
16575 		vm_page_stats_reusable.reuse_pages_failure++;
16576 		return KERN_INVALID_ADDRESS;
16577 	}
16578 
16579 	/*
16580 	 * Examine each vm_map_entry_t in the range.
16581 	 */
16582 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16583 	    entry = entry->vme_next) {
16584 		/*
16585 		 * Sanity check on the VM map entry.
16586 		 */
16587 		if (!vm_map_entry_is_reusable(entry)) {
16588 			vm_map_unlock_read(map);
16589 			vm_page_stats_reusable.reuse_pages_failure++;
16590 			return KERN_INVALID_ADDRESS;
16591 		}
16592 
16593 		/*
16594 		 * The first time through, the start address could be anywhere
16595 		 * within the vm_map_entry we found.  So adjust the offset to
16596 		 * correspond.
16597 		 */
16598 		if (entry->vme_start < start) {
16599 			start_offset = start - entry->vme_start;
16600 		} else {
16601 			start_offset = 0;
16602 		}
		/* convert map offsets to offsets within the backing object */
16603 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16604 		start_offset += VME_OFFSET(entry);
16605 		end_offset += VME_OFFSET(entry);
16606 
16607 		object = VME_OBJECT(entry);
16608 		if (object != VM_OBJECT_NULL) {
16609 			vm_object_lock(object);
16610 			vm_object_reuse_pages(object, start_offset, end_offset,
16611 			    TRUE);
16612 			vm_object_unlock(object);
16613 		}
16614 
16615 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
16616 			/*
16617 			 * XXX
16618 			 * We do not hold the VM map exclusively here.
16619 			 * The "alias" field is not that critical, so it's
16620 			 * safe to update it here, as long as it is the only
16621 			 * one that can be modified while holding the VM map
16622 			 * "shared".
16623 			 */
16624 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
16625 		}
16626 	}
16627 
16628 	vm_map_unlock_read(map);
16629 	vm_page_stats_reusable.reuse_pages_success++;
16630 	return KERN_SUCCESS;
16631 }
16632 
16633 
/*
 *	Routine:	vm_map_reusable_pages
 *	Purpose:
 *		Back end for vm_map_behavior_set(VM_BEHAVIOR_REUSABLE):
 *		mark the pages backing [start, end) as reusable (their
 *		contents may be discarded), and retag MALLOC_LARGE[_REUSED]
 *		entries as MALLOC_LARGE_REUSABLE.  The range must be fully
 *		mapped, writable (or JIT/TPRO), and pass the
 *		vm_map_entry_is_reusable() checks.
 */
16634 static kern_return_t
vm_map_reusable_pages(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16635 vm_map_reusable_pages(
16636 	vm_map_t        map,
16637 	vm_map_offset_t start,
16638 	vm_map_offset_t end)
16639 {
16640 	vm_map_entry_t                  entry;
16641 	vm_object_t                     object;
16642 	vm_object_offset_t              start_offset, end_offset;
16643 	vm_map_offset_t                 pmap_offset;
16644 
16645 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16646 		/*
16647 		 * XXX TODO4K
16648 		 * need to figure out what reusable means for a portion
16649 		 * of a native page.
16650 		 */
16651 		return KERN_SUCCESS;
16652 	}
16653 
16654 	/*
16655 	 * The MADV_REUSABLE operation doesn't require any changes to the
16656 	 * vm_map_entry_t's, so the read lock is sufficient.
16657 	 */
16658 
16659 	vm_map_lock_read(map);
16660 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16661 
16662 	/*
16663 	 * The madvise semantics require that the address range be fully
16664 	 * allocated with no holes.  Otherwise, we're required to return
16665 	 * an error.
16666 	 */
16667 
16668 	if (!vm_map_range_check(map, start, end, &entry)) {
16669 		vm_map_unlock_read(map);
16670 		vm_page_stats_reusable.reusable_pages_failure++;
16671 		return KERN_INVALID_ADDRESS;
16672 	}
16673 
16674 	/*
16675 	 * Examine each vm_map_entry_t in the range.
16676 	 */
16677 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16678 	    entry = entry->vme_next) {
		/* kill_pages: 1 = safe to discard, -1 = shared object, skip */
16679 		int kill_pages = 0;
16680 		boolean_t reusable_no_write = FALSE;
16681 
16682 		/*
16683 		 * Sanity check on the VM map entry.
16684 		 */
16685 		if (!vm_map_entry_is_reusable(entry)) {
16686 			vm_map_unlock_read(map);
16687 			vm_page_stats_reusable.reusable_pages_failure++;
16688 			return KERN_INVALID_ADDRESS;
16689 		}
16690 
16691 		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
16692 #if __arm64e__
16693 		    && !entry->used_for_tpro
16694 #endif
16695 		    ) {
16696 			/* not writable: can't discard contents */
16697 			vm_map_unlock_read(map);
16698 			vm_page_stats_reusable.reusable_nonwritable++;
16699 			vm_page_stats_reusable.reusable_pages_failure++;
16700 			return KERN_PROTECTION_FAILURE;
16701 		}
16702 
16703 		/*
16704 		 * The first time through, the start address could be anywhere
16705 		 * within the vm_map_entry we found.  So adjust the offset to
16706 		 * correspond.
16707 		 */
16708 		if (entry->vme_start < start) {
16709 			start_offset = start - entry->vme_start;
16710 			pmap_offset = start;
16711 		} else {
16712 			start_offset = 0;
16713 			pmap_offset = entry->vme_start;
16714 		}
		/* convert map offsets to offsets within the backing object */
16715 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16716 		start_offset += VME_OFFSET(entry);
16717 		end_offset += VME_OFFSET(entry);
16718 
16719 		object = VME_OBJECT(entry);
16720 		if (object == VM_OBJECT_NULL) {
16721 			continue;
16722 		}
16723 
16724 		if (entry->protection & VM_PROT_EXECUTE) {
16725 			/*
16726 			 * Executable mappings might be write-protected by
16727 			 * hardware, so do not attempt to write to these pages.
16728 			 */
16729 			reusable_no_write = TRUE;
16730 		}
16731 
16732 		vm_object_lock(object);
16733 		if (((object->ref_count == 1) ||
16734 		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
16735 		    object->vo_copy == VM_OBJECT_NULL)) &&
16736 		    object->shadow == VM_OBJECT_NULL &&
16737 		    /*
16738 		     * "iokit_acct" entries are billed for their virtual size
16739 		     * (rather than for their resident pages only), so they
16740 		     * wouldn't benefit from making pages reusable, and it
16741 		     * would be hard to keep track of pages that are both
16742 		     * "iokit_acct" and "reusable" in the pmap stats and
16743 		     * ledgers.
16744 		     */
16745 		    !(entry->iokit_acct ||
16746 		    (!entry->is_sub_map && !entry->use_pmap))) {
16747 			if (object->ref_count != 1) {
16748 				vm_page_stats_reusable.reusable_shared++;
16749 			}
16750 			kill_pages = 1;
16751 		} else {
			/* shared or shadowed: leave the contents alone */
16752 			kill_pages = -1;
16753 		}
16754 		if (kill_pages != -1) {
16755 			vm_object_deactivate_pages(object,
16756 			    start_offset,
16757 			    end_offset - start_offset,
16758 			    kill_pages,
16759 			    TRUE /*reusable_pages*/,
16760 			    reusable_no_write,
16761 			    map->pmap,
16762 			    pmap_offset);
16763 		} else {
16764 			vm_page_stats_reusable.reusable_pages_shared++;
16765 			DTRACE_VM4(vm_map_reusable_pages_shared,
16766 			    unsigned int, VME_ALIAS(entry),
16767 			    vm_map_t, map,
16768 			    vm_map_entry_t, entry,
16769 			    vm_object_t, object);
16770 		}
16771 		vm_object_unlock(object);
16772 
16773 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
16774 		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
16775 			/*
16776 			 * XXX
16777 			 * We do not hold the VM map exclusively here.
16778 			 * The "alias" field is not that critical, so it's
16779 			 * safe to update it here, as long as it is the only
16780 			 * one that can be modified while holding the VM map
16781 			 * "shared".
16782 			 */
16783 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
16784 		}
16785 	}
16786 
16787 	vm_map_unlock_read(map);
16788 	vm_page_stats_reusable.reusable_pages_success++;
16789 	return KERN_SUCCESS;
16790 }
16791 
16792 
/*
 *	Routine:	vm_map_can_reuse
 *	Purpose:
 *		Back end for vm_map_behavior_set(VM_BEHAVIOR_CAN_REUSE):
 *		verify that every entry in [start, end) would be eligible
 *		for the reusable/reuse operations, without changing any
 *		state other than the reusable statistics counters.
 */
16793 static kern_return_t
vm_map_can_reuse(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16794 vm_map_can_reuse(
16795 	vm_map_t        map,
16796 	vm_map_offset_t start,
16797 	vm_map_offset_t end)
16798 {
16799 	vm_map_entry_t                  entry;
16800 
16801 	/*
16802 	 * The MADV_REUSABLE operation doesn't require any changes to the
16803 	 * vm_map_entry_t's, so the read lock is sufficient.
16804 	 */
16805 
16806 	vm_map_lock_read(map);
16807 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16808 
16809 	/*
16810 	 * The madvise semantics require that the address range be fully
16811 	 * allocated with no holes.  Otherwise, we're required to return
16812 	 * an error.
16813 	 */
16814 
16815 	if (!vm_map_range_check(map, start, end, &entry)) {
16816 		vm_map_unlock_read(map);
16817 		vm_page_stats_reusable.can_reuse_failure++;
16818 		return KERN_INVALID_ADDRESS;
16819 	}
16820 
16821 	/*
16822 	 * Examine each vm_map_entry_t in the range.
16823 	 */
16824 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16825 	    entry = entry->vme_next) {
16826 		/*
16827 		 * Sanity check on the VM map entry.
16828 		 */
16829 		if (!vm_map_entry_is_reusable(entry)) {
16830 			vm_map_unlock_read(map);
16831 			vm_page_stats_reusable.can_reuse_failure++;
16832 			return KERN_INVALID_ADDRESS;
16833 		}
16834 	}
16835 
16836 	vm_map_unlock_read(map);
16837 	vm_page_stats_reusable.can_reuse_success++;
16838 	return KERN_SUCCESS;
16839 }
16840 
16841 
#if MACH_ASSERT
/*
 *	Routine:	vm_map_pageout
 *	Purpose:
 *		Back end for vm_map_behavior_set(VM_BEHAVIOR_PAGEOUT),
 *		compiled only under MACH_ASSERT: push the internal objects
 *		backing [start, end) out via vm_object_pageout().  Submaps
 *		are descended one level; entries without an internal
 *		backing object are skipped.
 */
16842 static kern_return_t
vm_map_pageout(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16843 vm_map_pageout(
16844 	vm_map_t        map,
16845 	vm_map_offset_t start,
16846 	vm_map_offset_t end)
16847 {
16848 	vm_map_entry_t                  entry;
16849 
16850 	/*
16851 	 * The MADV_PAGEOUT operation doesn't require any changes to the
16852 	 * vm_map_entry_t's, so the read lock is sufficient.
16853 	 */
16854 
16855 	vm_map_lock_read(map);
16856 
16857 	/*
16858 	 * The madvise semantics require that the address range be fully
16859 	 * allocated with no holes.  Otherwise, we're required to return
16860 	 * an error.
16861 	 */
16862 
16863 	if (!vm_map_range_check(map, start, end, &entry)) {
16864 		vm_map_unlock_read(map);
16865 		return KERN_INVALID_ADDRESS;
16866 	}
16867 
16868 	/*
16869 	 * Examine each vm_map_entry_t in the range.
16870 	 */
16871 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16872 	    entry = entry->vme_next) {
16873 		vm_object_t     object;
16874 
16875 		/*
16876 		 * Sanity check on the VM map entry.
16877 		 */
16878 		if (entry->is_sub_map) {
16879 			vm_map_t submap;
16880 			vm_map_offset_t submap_start;
16881 			vm_map_offset_t submap_end;
16882 			vm_map_entry_t submap_entry;
16883 
16884 			submap = VME_SUBMAP(entry);
16885 			submap_start = VME_OFFSET(entry);
16886 			submap_end = submap_start + (entry->vme_end -
16887 			    entry->vme_start);
16888 
16889 			vm_map_lock_read(submap);
16890 
16891 			if (!vm_map_range_check(submap,
16892 			    submap_start,
16893 			    submap_end,
16894 			    &submap_entry)) {
16895 				vm_map_unlock_read(submap);
16896 				vm_map_unlock_read(map);
16897 				return KERN_INVALID_ADDRESS;
16898 			}
16899 
			/* nested submaps are not descended further */
16900 			if (submap_entry->is_sub_map) {
16901 				vm_map_unlock_read(submap);
16902 				continue;
16903 			}
16904 
16905 			object = VME_OBJECT(submap_entry);
16906 			if (object == VM_OBJECT_NULL || !object->internal) {
16907 				vm_map_unlock_read(submap);
16908 				continue;
16909 			}
16910 
16911 			vm_object_pageout(object);
16912 
16913 			vm_map_unlock_read(submap);
16914 			submap = VM_MAP_NULL;
16915 			submap_entry = VM_MAP_ENTRY_NULL;
16916 			continue;
16917 		}
16918 
16919 		object = VME_OBJECT(entry);
16920 		if (object == VM_OBJECT_NULL || !object->internal) {
16921 			continue;
16922 		}
16923 
16924 		vm_object_pageout(object);
16925 	}
16926 
16927 	vm_map_unlock_read(map);
16928 	return KERN_SUCCESS;
16929 }
#endif /* MACH_ASSERT */
16932 
16933 
16934 /*
16935  *	Routine:	vm_map_entry_insert
16936  *
16937  *	Description:	This routine inserts a new vm_entry in a locked map.
16938  */
16939 static vm_map_entry_t
vm_map_entry_insert(vm_map_t map,vm_map_entry_t insp_entry,vm_map_offset_t start,vm_map_offset_t end,vm_object_t object,vm_object_offset_t offset,vm_map_kernel_flags_t vmk_flags,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,boolean_t clear_map_aligned)16940 vm_map_entry_insert(
16941 	vm_map_t                map,
16942 	vm_map_entry_t          insp_entry,
16943 	vm_map_offset_t         start,
16944 	vm_map_offset_t         end,
16945 	vm_object_t             object,
16946 	vm_object_offset_t      offset,
16947 	vm_map_kernel_flags_t   vmk_flags,
16948 	boolean_t               needs_copy,
16949 	vm_prot_t               cur_protection,
16950 	vm_prot_t               max_protection,
16951 	vm_inherit_t            inheritance,
16952 	boolean_t               clear_map_aligned)
16953 {
16954 	vm_map_entry_t  new_entry;
16955 	boolean_t map_aligned = FALSE;
16956 
16957 	assert(insp_entry != (vm_map_entry_t)0);
16958 	vm_map_lock_assert_exclusive(map);
16959 
16960 #if DEVELOPMENT || DEBUG
16961 	vm_object_offset_t      end_offset = 0;
16962 	assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
16963 #endif /* DEVELOPMENT || DEBUG */
16964 
16965 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
16966 		map_aligned = TRUE;
16967 	}
16968 	if (clear_map_aligned &&
16969 	    (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
16970 	    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
16971 		map_aligned = FALSE;
16972 	}
16973 	if (map_aligned) {
16974 		assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
16975 		assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
16976 	} else {
16977 		assert(page_aligned(start));
16978 		assert(page_aligned(end));
16979 	}
16980 	assert(start < end);
16981 
16982 	new_entry = vm_map_entry_create(map);
16983 
16984 	new_entry->vme_start = start;
16985 	new_entry->vme_end = end;
16986 
16987 	if (vmk_flags.vmkf_submap) {
16988 		new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
16989 		VME_SUBMAP_SET(new_entry, (vm_map_t)object);
16990 	} else {
16991 		VME_OBJECT_SET(new_entry, object, false, 0);
16992 	}
16993 	VME_OFFSET_SET(new_entry, offset);
16994 	VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
16995 
16996 	new_entry->map_aligned = map_aligned;
16997 	new_entry->needs_copy = needs_copy;
16998 	new_entry->inheritance = inheritance;
16999 	new_entry->protection = cur_protection;
17000 	new_entry->max_protection = max_protection;
17001 	/*
17002 	 * submap: "use_pmap" means "nested".
17003 	 * default: false.
17004 	 *
17005 	 * object: "use_pmap" means "use pmap accounting" for footprint.
17006 	 * default: true.
17007 	 */
17008 	new_entry->use_pmap = !vmk_flags.vmkf_submap;
17009 	new_entry->no_cache = vmk_flags.vmf_no_cache;
17010 	new_entry->vme_permanent = vmk_flags.vmf_permanent;
17011 	new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
17012 	new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
17013 	new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
17014 
17015 	if (vmk_flags.vmkf_map_jit) {
17016 		if (!(map->jit_entry_exists) ||
17017 		    VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
17018 			new_entry->used_for_jit = TRUE;
17019 			map->jit_entry_exists = TRUE;
17020 		}
17021 	}
17022 
17023 	/*
17024 	 *	Insert the new entry into the list.
17025 	 */
17026 
17027 	vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
17028 	map->size += end - start;
17029 
17030 	/*
17031 	 *	Update the free space hint and the lookup hint.
17032 	 */
17033 
17034 	SAVE_HINT_MAP_WRITE(map, new_entry);
17035 	return new_entry;
17036 }
17037 
17038 /*
17039  *	Routine:	vm_map_remap_extract
17040  *
17041  *	Description:	This routine returns a vm_entry list from a map.
17042  */
17043 static kern_return_t
vm_map_remap_extract(
17045 	vm_map_t                map,
17046 	vm_map_offset_t         addr,
17047 	vm_map_size_t           size,
17048 	boolean_t               copy,
17049 	vm_map_copy_t           map_copy,
17050 	vm_prot_t               *cur_protection,   /* IN/OUT */
17051 	vm_prot_t               *max_protection,   /* IN/OUT */
17052 	/* What, no behavior? */
17053 	vm_inherit_t            inheritance,
17054 	vm_map_kernel_flags_t   vmk_flags)
17055 {
17056 	struct vm_map_header   *map_header = &map_copy->cpy_hdr;
17057 	kern_return_t           result;
17058 	vm_map_size_t           mapped_size;
17059 	vm_map_size_t           tmp_size;
17060 	vm_map_entry_t          src_entry;     /* result of last map lookup */
17061 	vm_map_entry_t          new_entry;
17062 	vm_object_offset_t      offset;
17063 	vm_map_offset_t         map_address;
17064 	vm_map_offset_t         src_start;     /* start of entry to map */
17065 	vm_map_offset_t         src_end;       /* end of region to be mapped */
17066 	vm_object_t             object;
17067 	vm_map_version_t        version;
17068 	boolean_t               src_needs_copy;
17069 	boolean_t               new_entry_needs_copy;
17070 	vm_map_entry_t          saved_src_entry;
17071 	boolean_t               src_entry_was_wired;
17072 	vm_prot_t               max_prot_for_prot_copy;
17073 	vm_map_offset_t         effective_page_mask;
17074 	bool                    pageable, same_map;
17075 	boolean_t               vm_remap_legacy;
17076 	vm_prot_t               required_cur_prot, required_max_prot;
17077 	vm_object_t             new_copy_object;     /* vm_object_copy_* result */
17078 	boolean_t               saved_used_for_jit;  /* Saved used_for_jit. */
17079 
17080 	pageable = vmk_flags.vmkf_copy_pageable;
17081 	same_map = vmk_flags.vmkf_copy_same_map;
17082 
17083 	effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17084 
17085 	assert(map != VM_MAP_NULL);
17086 	assert(size != 0);
17087 	assert(size == vm_map_round_page(size, effective_page_mask));
17088 	assert(inheritance == VM_INHERIT_NONE ||
17089 	    inheritance == VM_INHERIT_COPY ||
17090 	    inheritance == VM_INHERIT_SHARE);
17091 	assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17092 	assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17093 	assert((*cur_protection & *max_protection) == *cur_protection);
17094 
17095 	/*
17096 	 *	Compute start and end of region.
17097 	 */
17098 	src_start = vm_map_trunc_page(addr, effective_page_mask);
17099 	src_end = vm_map_round_page(src_start + size, effective_page_mask);
17100 
17101 	/*
17102 	 *	Initialize map_header.
17103 	 */
17104 	map_header->nentries = 0;
17105 	map_header->entries_pageable = pageable;
17106 //	map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17107 	map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17108 	map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17109 	vm_map_store_init(map_header);
17110 
17111 	if (copy && vmk_flags.vmkf_remap_prot_copy) {
17112 		/*
17113 		 * Special case for vm_map_protect(VM_PROT_COPY):
17114 		 * we want to set the new mappings' max protection to the
17115 		 * specified *max_protection...
17116 		 */
17117 		max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17118 		/* ... but we want to use the vm_remap() legacy mode */
17119 		*max_protection = VM_PROT_NONE;
17120 		*cur_protection = VM_PROT_NONE;
17121 	} else {
17122 		max_prot_for_prot_copy = VM_PROT_NONE;
17123 	}
17124 
17125 	if (*cur_protection == VM_PROT_NONE &&
17126 	    *max_protection == VM_PROT_NONE) {
17127 		/*
17128 		 * vm_remap() legacy mode:
17129 		 * Extract all memory regions in the specified range and
17130 		 * collect the strictest set of protections allowed on the
17131 		 * entire range, so the caller knows what they can do with
17132 		 * the remapped range.
17133 		 * We start with VM_PROT_ALL and we'll remove the protections
17134 		 * missing from each memory region.
17135 		 */
17136 		vm_remap_legacy = TRUE;
17137 		*cur_protection = VM_PROT_ALL;
17138 		*max_protection = VM_PROT_ALL;
17139 		required_cur_prot = VM_PROT_NONE;
17140 		required_max_prot = VM_PROT_NONE;
17141 	} else {
17142 		/*
17143 		 * vm_remap_new() mode:
17144 		 * Extract all memory regions in the specified range and
17145 		 * ensure that they have at least the protections specified
17146 		 * by the caller via *cur_protection and *max_protection.
17147 		 * The resulting mapping should have these protections.
17148 		 */
17149 		vm_remap_legacy = FALSE;
17150 		if (copy) {
17151 			required_cur_prot = VM_PROT_NONE;
17152 			required_max_prot = VM_PROT_READ;
17153 		} else {
17154 			required_cur_prot = *cur_protection;
17155 			required_max_prot = *max_protection;
17156 		}
17157 	}
17158 
17159 	map_address = 0;
17160 	mapped_size = 0;
17161 	result = KERN_SUCCESS;
17162 
17163 	/*
17164 	 *	The specified source virtual space might correspond to
17165 	 *	multiple map entries, need to loop on them.
17166 	 */
17167 	vm_map_lock(map);
17168 
17169 	if (map->pmap == kernel_pmap) {
17170 		map_copy->is_kernel_range = true;
17171 		map_copy->orig_range = kmem_addr_get_range(addr, size);
17172 #if CONFIG_MAP_RANGES
17173 	} else if (map->uses_user_ranges) {
17174 		map_copy->is_user_range = true;
17175 		map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17176 #endif /* CONFIG_MAP_RANGES */
17177 	}
17178 
17179 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17180 		/*
17181 		 * This address space uses sub-pages so the range might
17182 		 * not be re-mappable in an address space with larger
17183 		 * pages. Re-assemble any broken-up VM map entries to
17184 		 * improve our chances of making it work.
17185 		 */
17186 		vm_map_simplify_range(map, src_start, src_end);
17187 	}
17188 	while (mapped_size != size) {
17189 		vm_map_size_t   entry_size;
17190 
17191 		/*
17192 		 *	Find the beginning of the region.
17193 		 */
17194 		if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17195 			result = KERN_INVALID_ADDRESS;
17196 			break;
17197 		}
17198 
17199 		if (src_start < src_entry->vme_start ||
17200 		    (mapped_size && src_start != src_entry->vme_start)) {
17201 			result = KERN_INVALID_ADDRESS;
17202 			break;
17203 		}
17204 
17205 		tmp_size = size - mapped_size;
17206 		if (src_end > src_entry->vme_end) {
17207 			tmp_size -= (src_end - src_entry->vme_end);
17208 		}
17209 
17210 		entry_size = (vm_map_size_t)(src_entry->vme_end -
17211 		    src_entry->vme_start);
17212 
17213 		if (src_entry->is_sub_map &&
17214 		    vmk_flags.vmkf_copy_single_object) {
17215 			vm_map_t submap;
17216 			vm_map_offset_t submap_start;
17217 			vm_map_size_t submap_size;
17218 			boolean_t submap_needs_copy;
17219 
17220 			/*
17221 			 * No check for "required protection" on "src_entry"
17222 			 * because the protections that matter are the ones
17223 			 * on the submap's VM map entry, which will be checked
17224 			 * during the call to vm_map_remap_extract() below.
17225 			 */
17226 			submap_size = src_entry->vme_end - src_start;
17227 			if (submap_size > size) {
17228 				submap_size = size;
17229 			}
17230 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17231 			submap = VME_SUBMAP(src_entry);
17232 			if (copy) {
17233 				/*
17234 				 * The caller wants a copy-on-write re-mapping,
17235 				 * so let's extract from the submap accordingly.
17236 				 */
17237 				submap_needs_copy = TRUE;
17238 			} else if (src_entry->needs_copy) {
17239 				/*
17240 				 * The caller wants a shared re-mapping but the
17241 				 * submap is mapped with "needs_copy", so its
17242 				 * contents can't be shared as is. Extract the
17243 				 * contents of the submap as "copy-on-write".
17244 				 * The re-mapping won't be shared with the
17245 				 * original mapping but this is equivalent to
17246 				 * what happened with the original "remap from
17247 				 * submap" code.
17248 				 * The shared region is mapped "needs_copy", for
17249 				 * example.
17250 				 */
17251 				submap_needs_copy = TRUE;
17252 			} else {
17253 				/*
17254 				 * The caller wants a shared re-mapping and
17255 				 * this mapping can be shared (no "needs_copy"),
17256 				 * so let's extract from the submap accordingly.
17257 				 * Kernel submaps are mapped without
17258 				 * "needs_copy", for example.
17259 				 */
17260 				submap_needs_copy = FALSE;
17261 			}
17262 			vm_map_reference(submap);
17263 			vm_map_unlock(map);
17264 			src_entry = NULL;
17265 			if (vm_remap_legacy) {
17266 				*cur_protection = VM_PROT_NONE;
17267 				*max_protection = VM_PROT_NONE;
17268 			}
17269 
17270 			DTRACE_VM7(remap_submap_recurse,
17271 			    vm_map_t, map,
17272 			    vm_map_offset_t, addr,
17273 			    vm_map_size_t, size,
17274 			    boolean_t, copy,
17275 			    vm_map_offset_t, submap_start,
17276 			    vm_map_size_t, submap_size,
17277 			    boolean_t, submap_needs_copy);
17278 
17279 			result = vm_map_remap_extract(submap,
17280 			    submap_start,
17281 			    submap_size,
17282 			    submap_needs_copy,
17283 			    map_copy,
17284 			    cur_protection,
17285 			    max_protection,
17286 			    inheritance,
17287 			    vmk_flags);
17288 			vm_map_deallocate(submap);
17289 
17290 			if (result == KERN_SUCCESS &&
17291 			    submap_needs_copy &&
17292 			    !copy) {
17293 				/*
17294 				 * We were asked for a "shared"
17295 				 * re-mapping but had to ask for a
17296 				 * "copy-on-write" remapping of the
17297 				 * submap's mapping to honor the
17298 				 * submap's "needs_copy".
17299 				 * We now need to resolve that
17300 				 * pending "copy-on-write" to
17301 				 * get something we can share.
17302 				 */
17303 				vm_map_entry_t copy_entry;
17304 				vm_object_offset_t copy_offset;
17305 				vm_map_size_t copy_size;
17306 				vm_object_t copy_object;
17307 				copy_entry = vm_map_copy_first_entry(map_copy);
17308 				copy_size = copy_entry->vme_end - copy_entry->vme_start;
17309 				copy_object = VME_OBJECT(copy_entry);
17310 				copy_offset = VME_OFFSET(copy_entry);
17311 				if (copy_object == VM_OBJECT_NULL) {
17312 					assert(copy_offset == 0);
17313 					assert(!copy_entry->needs_copy);
17314 					if (copy_entry->max_protection == VM_PROT_NONE) {
17315 						assert(copy_entry->protection == VM_PROT_NONE);
17316 						/* nothing to share */
17317 					} else {
17318 						assert(copy_offset == 0);
17319 						copy_object = vm_object_allocate(copy_size);
17320 						VME_OFFSET_SET(copy_entry, 0);
17321 						VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17322 						assert(copy_entry->use_pmap);
17323 					}
17324 				} else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17325 					/* already shareable */
17326 					assert(!copy_entry->needs_copy);
17327 				} else if (copy_entry->needs_copy ||
17328 				    copy_object->shadowed ||
17329 				    (object->internal &&
17330 				    !object->true_share &&
17331 				    !copy_entry->is_shared &&
17332 				    copy_object->vo_size > copy_size)) {
17333 					VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17334 					assert(copy_entry->use_pmap);
17335 					if (copy_entry->needs_copy) {
17336 						/* already write-protected */
17337 					} else {
17338 						vm_prot_t prot;
17339 						prot = copy_entry->protection & ~VM_PROT_WRITE;
17340 						vm_object_pmap_protect(copy_object,
17341 						    copy_offset,
17342 						    copy_size,
17343 						    PMAP_NULL,
17344 						    PAGE_SIZE,
17345 						    0,
17346 						    prot);
17347 					}
17348 					copy_entry->needs_copy = FALSE;
17349 				}
17350 				copy_object = VME_OBJECT(copy_entry);
17351 				copy_offset = VME_OFFSET(copy_entry);
17352 				if (copy_object &&
17353 				    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17354 					copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17355 					copy_object->true_share = TRUE;
17356 				}
17357 			}
17358 
17359 			return result;
17360 		}
17361 
17362 		if (src_entry->is_sub_map) {
17363 			/* protections for submap mapping are irrelevant here */
17364 		} else if (((src_entry->protection & required_cur_prot) !=
17365 		    required_cur_prot) ||
17366 		    ((src_entry->max_protection & required_max_prot) !=
17367 		    required_max_prot)) {
17368 			if (vmk_flags.vmkf_copy_single_object &&
17369 			    mapped_size != 0) {
17370 				/*
17371 				 * Single object extraction.
17372 				 * We can't extract more with the required
17373 				 * protection but we've extracted some, so
17374 				 * stop there and declare success.
17375 				 * The caller should check the size of
17376 				 * the copy entry we've extracted.
17377 				 */
17378 				result = KERN_SUCCESS;
17379 			} else {
17380 				/*
17381 				 * VM range extraction.
				 * Required protection is not available
17383 				 * for this part of the range: fail.
17384 				 */
17385 				result = KERN_PROTECTION_FAILURE;
17386 			}
17387 			break;
17388 		}
17389 
17390 		if (src_entry->is_sub_map) {
17391 			vm_map_t submap;
17392 			vm_map_offset_t submap_start;
17393 			vm_map_size_t submap_size;
17394 			vm_map_copy_t submap_copy;
17395 			vm_prot_t submap_curprot, submap_maxprot;
17396 			boolean_t submap_needs_copy;
17397 
17398 			/*
17399 			 * No check for "required protection" on "src_entry"
17400 			 * because the protections that matter are the ones
17401 			 * on the submap's VM map entry, which will be checked
17402 			 * during the call to vm_map_copy_extract() below.
17403 			 */
17404 			object = VM_OBJECT_NULL;
17405 			submap_copy = VM_MAP_COPY_NULL;
17406 
17407 			/* find equivalent range in the submap */
17408 			submap = VME_SUBMAP(src_entry);
17409 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17410 			submap_size = tmp_size;
17411 			if (copy) {
17412 				/*
17413 				 * The caller wants a copy-on-write re-mapping,
17414 				 * so let's extract from the submap accordingly.
17415 				 */
17416 				submap_needs_copy = TRUE;
17417 			} else if (src_entry->needs_copy) {
17418 				/*
17419 				 * The caller wants a shared re-mapping but the
17420 				 * submap is mapped with "needs_copy", so its
17421 				 * contents can't be shared as is. Extract the
17422 				 * contents of the submap as "copy-on-write".
17423 				 * The re-mapping won't be shared with the
17424 				 * original mapping but this is equivalent to
17425 				 * what happened with the original "remap from
17426 				 * submap" code.
17427 				 * The shared region is mapped "needs_copy", for
17428 				 * example.
17429 				 */
17430 				submap_needs_copy = TRUE;
17431 			} else {
17432 				/*
17433 				 * The caller wants a shared re-mapping and
17434 				 * this mapping can be shared (no "needs_copy"),
17435 				 * so let's extract from the submap accordingly.
17436 				 * Kernel submaps are mapped without
17437 				 * "needs_copy", for example.
17438 				 */
17439 				submap_needs_copy = FALSE;
17440 			}
17441 			/* extra ref to keep submap alive */
17442 			vm_map_reference(submap);
17443 
17444 			DTRACE_VM7(remap_submap_recurse,
17445 			    vm_map_t, map,
17446 			    vm_map_offset_t, addr,
17447 			    vm_map_size_t, size,
17448 			    boolean_t, copy,
17449 			    vm_map_offset_t, submap_start,
17450 			    vm_map_size_t, submap_size,
17451 			    boolean_t, submap_needs_copy);
17452 
17453 			/*
17454 			 * The map can be safely unlocked since we
17455 			 * already hold a reference on the submap.
17456 			 *
17457 			 * No timestamp since we don't care if the map
17458 			 * gets modified while we're down in the submap.
17459 			 * We'll resume the extraction at src_start + tmp_size
17460 			 * anyway.
17461 			 */
17462 			vm_map_unlock(map);
17463 			src_entry = NULL; /* not valid once map is unlocked */
17464 
17465 			if (vm_remap_legacy) {
17466 				submap_curprot = VM_PROT_NONE;
17467 				submap_maxprot = VM_PROT_NONE;
17468 				if (max_prot_for_prot_copy) {
17469 					submap_maxprot = max_prot_for_prot_copy;
17470 				}
17471 			} else {
17472 				assert(!max_prot_for_prot_copy);
17473 				submap_curprot = *cur_protection;
17474 				submap_maxprot = *max_protection;
17475 			}
17476 			result = vm_map_copy_extract(submap,
17477 			    submap_start,
17478 			    submap_size,
17479 			    submap_needs_copy,
17480 			    &submap_copy,
17481 			    &submap_curprot,
17482 			    &submap_maxprot,
17483 			    inheritance,
17484 			    vmk_flags);
17485 
17486 			/* release extra ref on submap */
17487 			vm_map_deallocate(submap);
17488 			submap = VM_MAP_NULL;
17489 
17490 			if (result != KERN_SUCCESS) {
17491 				vm_map_lock(map);
17492 				break;
17493 			}
17494 
17495 			/* transfer submap_copy entries to map_header */
17496 			while (vm_map_copy_first_entry(submap_copy) !=
17497 			    vm_map_copy_to_entry(submap_copy)) {
17498 				vm_map_entry_t copy_entry;
17499 				vm_map_size_t copy_entry_size;
17500 
17501 				copy_entry = vm_map_copy_first_entry(submap_copy);
17502 
17503 				/*
17504 				 * Prevent kernel_object from being exposed to
17505 				 * user space.
17506 				 */
17507 				if (__improbable(copy_entry->vme_kernel_object)) {
17508 					printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17509 					    proc_selfpid(),
17510 					    (get_bsdtask_info(current_task())
17511 					    ? proc_name_address(get_bsdtask_info(current_task()))
17512 					    : "?"));
17513 					DTRACE_VM(extract_kernel_only);
17514 					result = KERN_INVALID_RIGHT;
17515 					vm_map_copy_discard(submap_copy);
17516 					submap_copy = VM_MAP_COPY_NULL;
17517 					vm_map_lock(map);
17518 					break;
17519 				}
17520 
17521 #ifdef __arm64e__
17522 				if (vmk_flags.vmkf_tpro_enforcement_override) {
17523 					copy_entry->used_for_tpro = FALSE;
17524 				}
17525 #endif /* __arm64e__ */
17526 
17527 				vm_map_copy_entry_unlink(submap_copy, copy_entry);
17528 				copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17529 				copy_entry->vme_start = map_address;
17530 				copy_entry->vme_end = map_address + copy_entry_size;
17531 				map_address += copy_entry_size;
17532 				mapped_size += copy_entry_size;
17533 				src_start += copy_entry_size;
17534 				assert(src_start <= src_end);
17535 				_vm_map_store_entry_link(map_header,
17536 				    map_header->links.prev,
17537 				    copy_entry);
17538 			}
17539 			/* done with submap_copy */
17540 			vm_map_copy_discard(submap_copy);
17541 
17542 			if (vm_remap_legacy) {
17543 				*cur_protection &= submap_curprot;
17544 				*max_protection &= submap_maxprot;
17545 			}
17546 
17547 			/* re-acquire the map lock and continue to next entry */
17548 			vm_map_lock(map);
17549 			continue;
17550 		} else {
17551 			object = VME_OBJECT(src_entry);
17552 
17553 			/*
17554 			 * Prevent kernel_object from being exposed to
17555 			 * user space.
17556 			 */
17557 			if (__improbable(is_kernel_object(object))) {
17558 				printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17559 				    proc_selfpid(),
17560 				    (get_bsdtask_info(current_task())
17561 				    ? proc_name_address(get_bsdtask_info(current_task()))
17562 				    : "?"));
17563 				DTRACE_VM(extract_kernel_only);
17564 				result = KERN_INVALID_RIGHT;
17565 				break;
17566 			}
17567 
17568 			if (src_entry->iokit_acct) {
17569 				/*
17570 				 * This entry uses "IOKit accounting".
17571 				 */
17572 			} else if (object != VM_OBJECT_NULL &&
17573 			    (object->purgable != VM_PURGABLE_DENY ||
17574 			    object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17575 				/*
17576 				 * Purgeable objects have their own accounting:
17577 				 * no pmap accounting for them.
17578 				 */
17579 				assertf(!src_entry->use_pmap,
17580 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17581 				    map,
17582 				    src_entry,
17583 				    (uint64_t)src_entry->vme_start,
17584 				    (uint64_t)src_entry->vme_end,
17585 				    src_entry->protection,
17586 				    src_entry->max_protection,
17587 				    VME_ALIAS(src_entry));
17588 			} else {
17589 				/*
17590 				 * Not IOKit or purgeable:
17591 				 * must be accounted by pmap stats.
17592 				 */
17593 				assertf(src_entry->use_pmap,
17594 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17595 				    map,
17596 				    src_entry,
17597 				    (uint64_t)src_entry->vme_start,
17598 				    (uint64_t)src_entry->vme_end,
17599 				    src_entry->protection,
17600 				    src_entry->max_protection,
17601 				    VME_ALIAS(src_entry));
17602 			}
17603 
17604 			if (object == VM_OBJECT_NULL) {
17605 				assert(!src_entry->needs_copy);
17606 				if (src_entry->max_protection == VM_PROT_NONE) {
17607 					assert(src_entry->protection == VM_PROT_NONE);
17608 					/*
17609 					 * No VM object and no permissions:
17610 					 * this must be a reserved range with
17611 					 * nothing to share or copy.
17612 					 * There could also be all sorts of
17613 					 * pmap shenanigans within that reserved
17614 					 * range, so let's just copy the map
17615 					 * entry as is to remap a similar
17616 					 * reserved range.
17617 					 */
17618 					offset = 0; /* no object => no offset */
17619 					goto copy_src_entry;
17620 				}
17621 				object = vm_object_allocate(entry_size);
17622 				VME_OFFSET_SET(src_entry, 0);
17623 				VME_OBJECT_SET(src_entry, object, false, 0);
17624 				assert(src_entry->use_pmap);
17625 				assert(!map->mapped_in_other_pmaps);
17626 			} else if (src_entry->wired_count ||
17627 			    object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17628 				/*
17629 				 * A wired memory region should not have
17630 				 * any pending copy-on-write and needs to
17631 				 * keep pointing at the VM object that
17632 				 * contains the wired pages.
17633 				 * If we're sharing this memory (copy=false),
17634 				 * we'll share this VM object.
17635 				 * If we're copying this memory (copy=true),
17636 				 * we'll call vm_object_copy_slowly() below
17637 				 * and use the new VM object for the remapping.
17638 				 *
17639 				 * Or, we are already using an asymmetric
17640 				 * copy, and therefore we already have
17641 				 * the right object.
17642 				 */
17643 				assert(!src_entry->needs_copy);
17644 			} else if (src_entry->needs_copy || object->shadowed ||
17645 			    (object->internal && !object->true_share &&
17646 			    !src_entry->is_shared &&
17647 			    object->vo_size > entry_size)) {
17648 				bool is_writable;
17649 
17650 				VME_OBJECT_SHADOW(src_entry, entry_size,
17651 				    vm_map_always_shadow(map));
17652 				assert(src_entry->use_pmap);
17653 
17654 				is_writable = false;
17655 				if (src_entry->protection & VM_PROT_WRITE) {
17656 					is_writable = true;
17657 #if __arm64e__
17658 				} else if (src_entry->used_for_tpro) {
17659 					is_writable = true;
17660 #endif /* __arm64e__ */
17661 				}
17662 				if (!src_entry->needs_copy && is_writable) {
17663 					vm_prot_t prot;
17664 
17665 					assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
17666 
17667 					prot = src_entry->protection & ~VM_PROT_WRITE;
17668 
17669 					if (override_nx(map,
17670 					    VME_ALIAS(src_entry))
17671 					    && prot) {
17672 						prot |= VM_PROT_EXECUTE;
17673 					}
17674 
17675 					assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
17676 
17677 					if (map->mapped_in_other_pmaps) {
17678 						vm_object_pmap_protect(
17679 							VME_OBJECT(src_entry),
17680 							VME_OFFSET(src_entry),
17681 							entry_size,
17682 							PMAP_NULL,
17683 							PAGE_SIZE,
17684 							src_entry->vme_start,
17685 							prot);
17686 #if MACH_ASSERT
17687 					} else if (__improbable(map->pmap == PMAP_NULL)) {
17688 						extern boolean_t vm_tests_in_progress;
17689 						assert(vm_tests_in_progress);
17690 						/*
17691 						 * Some VM tests (in vm_tests.c)
17692 						 * sometimes want to use a VM
17693 						 * map without a pmap.
17694 						 * Otherwise, this should never
17695 						 * happen.
17696 						 */
17697 #endif /* MACH_ASSERT */
17698 					} else {
17699 						pmap_protect(vm_map_pmap(map),
17700 						    src_entry->vme_start,
17701 						    src_entry->vme_end,
17702 						    prot);
17703 					}
17704 				}
17705 
17706 				object = VME_OBJECT(src_entry);
17707 				src_entry->needs_copy = FALSE;
17708 			}
17709 
17710 
17711 			vm_object_lock(object);
17712 			vm_object_reference_locked(object); /* object ref. for new entry */
17713 			assert(!src_entry->needs_copy);
17714 			if (object->copy_strategy ==
17715 			    MEMORY_OBJECT_COPY_SYMMETRIC) {
17716 				/*
17717 				 * If we want to share this object (copy==0),
17718 				 * it needs to be COPY_DELAY.
17719 				 * If we want to copy this object (copy==1),
17720 				 * we can't just set "needs_copy" on our side
17721 				 * and expect the other side to do the same
17722 				 * (symmetrically), so we can't let the object
17723 				 * stay COPY_SYMMETRIC.
17724 				 * So we always switch from COPY_SYMMETRIC to
17725 				 * COPY_DELAY.
17726 				 */
17727 				object->copy_strategy =
17728 				    MEMORY_OBJECT_COPY_DELAY;
17729 				object->true_share = TRUE;
17730 			}
17731 			vm_object_unlock(object);
17732 		}
17733 
17734 		offset = (VME_OFFSET(src_entry) +
17735 		    (src_start - src_entry->vme_start));
17736 
17737 copy_src_entry:
17738 		new_entry = _vm_map_entry_create(map_header);
17739 		vm_map_entry_copy(map, new_entry, src_entry);
17740 		if (new_entry->is_sub_map) {
17741 			/* clr address space specifics */
17742 			new_entry->use_pmap = FALSE;
17743 		} else if (copy) {
17744 			/*
17745 			 * We're dealing with a copy-on-write operation,
17746 			 * so the resulting mapping should not inherit the
17747 			 * original mapping's accounting settings.
17748 			 * "use_pmap" should be reset to its default (TRUE)
17749 			 * so that the new mapping gets accounted for in
17750 			 * the task's memory footprint.
17751 			 */
17752 			new_entry->use_pmap = TRUE;
17753 		}
17754 		/* "iokit_acct" was cleared in vm_map_entry_copy() */
17755 		assert(!new_entry->iokit_acct);
17756 
17757 		new_entry->map_aligned = FALSE;
17758 
17759 		new_entry->vme_start = map_address;
17760 		new_entry->vme_end = map_address + tmp_size;
17761 		assert(new_entry->vme_start < new_entry->vme_end);
17762 		if (copy && vmk_flags.vmkf_remap_prot_copy) {
17763 			/* security: keep "permanent" and "csm_associated" */
17764 			new_entry->vme_permanent = src_entry->vme_permanent;
17765 			new_entry->csm_associated = src_entry->csm_associated;
17766 			/*
17767 			 * Remapping for vm_map_protect(VM_PROT_COPY)
17768 			 * to convert a read-only mapping into a
17769 			 * copy-on-write version of itself but
17770 			 * with write access:
17771 			 * keep the original inheritance but let's not
17772 			 * add VM_PROT_WRITE to the max protection yet
17773 			 * since we want to do more security checks against
17774 			 * the target map.
17775 			 */
17776 			new_entry->inheritance = src_entry->inheritance;
17777 			new_entry->protection &= max_prot_for_prot_copy;
17778 		} else {
17779 			new_entry->inheritance = inheritance;
17780 			if (!vm_remap_legacy) {
17781 				new_entry->protection = *cur_protection;
17782 				new_entry->max_protection = *max_protection;
17783 			}
17784 		}
17785 #ifdef __arm64e__
17786 		if (copy && vmk_flags.vmkf_tpro_enforcement_override) {
17787 			new_entry->used_for_tpro = FALSE;
17788 		}
17789 #endif /* __arm64e__ */
17790 		VME_OFFSET_SET(new_entry, offset);
17791 
17792 		/*
17793 		 * The new region has to be copied now if required.
17794 		 */
17795 RestartCopy:
17796 		if (!copy) {
17797 			if (src_entry->used_for_jit == TRUE) {
17798 				if (same_map) {
17799 				} else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
17800 					/*
17801 					 * Cannot allow an entry describing a JIT
17802 					 * region to be shared across address spaces.
17803 					 */
17804 					result = KERN_INVALID_ARGUMENT;
17805 					vm_object_deallocate(object);
17806 					vm_map_entry_dispose(new_entry);
17807 					new_entry = VM_MAP_ENTRY_NULL;
17808 					break;
17809 				}
17810 			}
17811 
17812 			src_entry->is_shared = TRUE;
17813 			new_entry->is_shared = TRUE;
17814 			if (!(new_entry->is_sub_map)) {
17815 				new_entry->needs_copy = FALSE;
17816 			}
17817 		} else if (src_entry->is_sub_map) {
17818 			/* make this a COW sub_map if not already */
17819 			assert(new_entry->wired_count == 0);
17820 			new_entry->needs_copy = TRUE;
17821 			object = VM_OBJECT_NULL;
17822 		} else if (src_entry->wired_count == 0 &&
17823 		    !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
17824 		    vm_object_copy_quickly(VME_OBJECT(new_entry),
17825 		    VME_OFFSET(new_entry),
17826 		    (new_entry->vme_end -
17827 		    new_entry->vme_start),
17828 		    &src_needs_copy,
17829 		    &new_entry_needs_copy)) {
17830 			new_entry->needs_copy = new_entry_needs_copy;
17831 			new_entry->is_shared = FALSE;
17832 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17833 
17834 			/*
17835 			 * Handle copy_on_write semantics.
17836 			 */
17837 			if (src_needs_copy && !src_entry->needs_copy) {
17838 				vm_prot_t prot;
17839 
17840 				assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
17841 
17842 				prot = src_entry->protection & ~VM_PROT_WRITE;
17843 
17844 				if (override_nx(map,
17845 				    VME_ALIAS(src_entry))
17846 				    && prot) {
17847 					prot |= VM_PROT_EXECUTE;
17848 				}
17849 
17850 				assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
17851 
17852 				vm_object_pmap_protect(object,
17853 				    offset,
17854 				    entry_size,
17855 				    ((src_entry->is_shared
17856 				    || map->mapped_in_other_pmaps) ?
17857 				    PMAP_NULL : map->pmap),
17858 				    VM_MAP_PAGE_SIZE(map),
17859 				    src_entry->vme_start,
17860 				    prot);
17861 
17862 				assert(src_entry->wired_count == 0);
17863 				src_entry->needs_copy = TRUE;
17864 			}
17865 			/*
17866 			 * Throw away the old object reference of the new entry.
17867 			 */
17868 			vm_object_deallocate(object);
17869 		} else {
17870 			new_entry->is_shared = FALSE;
17871 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17872 
17873 			src_entry_was_wired = (src_entry->wired_count > 0);
17874 			saved_src_entry = src_entry;
17875 			src_entry = VM_MAP_ENTRY_NULL;
17876 
17877 			/*
17878 			 * The map can be safely unlocked since we
17879 			 * already hold a reference on the object.
17880 			 *
17881 			 * Record the timestamp of the map for later
17882 			 * verification, and unlock the map.
17883 			 */
17884 			version.main_timestamp = map->timestamp;
17885 			vm_map_unlock(map);     /* Increments timestamp once! */
17886 
17887 			/*
17888 			 * Perform the copy.
17889 			 */
17890 			if (src_entry_was_wired > 0 ||
17891 			    (debug4k_no_cow_copyin &&
17892 			    VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
17893 				vm_object_lock(object);
17894 				result = vm_object_copy_slowly(
17895 					object,
17896 					offset,
17897 					(new_entry->vme_end -
17898 					new_entry->vme_start),
17899 					THREAD_UNINT,
17900 					&new_copy_object);
17901 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
17902 				saved_used_for_jit = new_entry->used_for_jit;
17903 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
17904 				new_entry->used_for_jit = saved_used_for_jit;
17905 				VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
17906 				new_entry->needs_copy = FALSE;
17907 			} else {
17908 				vm_object_offset_t new_offset;
17909 
17910 				new_offset = VME_OFFSET(new_entry);
17911 				result = vm_object_copy_strategically(
17912 					object,
17913 					offset,
17914 					(new_entry->vme_end -
17915 					new_entry->vme_start),
17916 					false, /* forking */
17917 					&new_copy_object,
17918 					&new_offset,
17919 					&new_entry_needs_copy);
17920 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
17921 				saved_used_for_jit = new_entry->used_for_jit;
17922 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
17923 				new_entry->used_for_jit = saved_used_for_jit;
17924 				if (new_offset != VME_OFFSET(new_entry)) {
17925 					VME_OFFSET_SET(new_entry, new_offset);
17926 				}
17927 
17928 				new_entry->needs_copy = new_entry_needs_copy;
17929 			}
17930 
17931 			/*
17932 			 * Throw away the old object reference of the new entry.
17933 			 */
17934 			vm_object_deallocate(object);
17935 
17936 			if (result != KERN_SUCCESS &&
17937 			    result != KERN_MEMORY_RESTART_COPY) {
17938 				vm_map_entry_dispose(new_entry);
17939 				vm_map_lock(map);
17940 				break;
17941 			}
17942 
17943 			/*
17944 			 * Verify that the map has not substantially
17945 			 * changed while the copy was being made.
17946 			 */
17947 
17948 			vm_map_lock(map);
17949 			if (version.main_timestamp + 1 != map->timestamp) {
17950 				/*
17951 				 * Simple version comparison failed.
17952 				 *
17953 				 * Retry the lookup and verify that the
17954 				 * same object/offset are still present.
17955 				 */
17956 				saved_src_entry = VM_MAP_ENTRY_NULL;
17957 				vm_object_deallocate(VME_OBJECT(new_entry));
17958 				vm_map_entry_dispose(new_entry);
17959 				if (result == KERN_MEMORY_RESTART_COPY) {
17960 					result = KERN_SUCCESS;
17961 				}
17962 				continue;
17963 			}
17964 			/* map hasn't changed: src_entry is still valid */
17965 			src_entry = saved_src_entry;
17966 			saved_src_entry = VM_MAP_ENTRY_NULL;
17967 
17968 			if (result == KERN_MEMORY_RESTART_COPY) {
17969 				vm_object_reference(object);
17970 				goto RestartCopy;
17971 			}
17972 		}
17973 
17974 		_vm_map_store_entry_link(map_header,
17975 		    map_header->links.prev, new_entry);
17976 
17977 		/* protections for submap mapping are irrelevant here */
17978 		if (vm_remap_legacy && !src_entry->is_sub_map) {
17979 			*cur_protection &= src_entry->protection;
17980 			*max_protection &= src_entry->max_protection;
17981 		}
17982 
17983 		map_address += tmp_size;
17984 		mapped_size += tmp_size;
17985 		src_start += tmp_size;
17986 
17987 		if (vmk_flags.vmkf_copy_single_object) {
17988 			if (mapped_size != size) {
17989 				DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
17990 				    map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
17991 				if (src_entry->vme_next != vm_map_to_entry(map) &&
17992 				    src_entry->vme_next->vme_object_value ==
17993 				    src_entry->vme_object_value) {
17994 					/* XXX TODO4K */
17995 					DEBUG4K_ERROR("could have extended copy to next entry...\n");
17996 				}
17997 			}
17998 			break;
17999 		}
18000 	} /* end while */
18001 
18002 	vm_map_unlock(map);
18003 	if (result != KERN_SUCCESS) {
18004 		/*
18005 		 * Free all allocated elements.
18006 		 */
18007 		for (src_entry = map_header->links.next;
18008 		    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18009 		    src_entry = new_entry) {
18010 			new_entry = src_entry->vme_next;
18011 			_vm_map_store_entry_unlink(map_header, src_entry, false);
18012 			if (src_entry->is_sub_map) {
18013 				vm_map_deallocate(VME_SUBMAP(src_entry));
18014 			} else {
18015 				vm_object_deallocate(VME_OBJECT(src_entry));
18016 			}
18017 			vm_map_entry_dispose(src_entry);
18018 		}
18019 	}
18020 	return result;
18021 }
18022 
18023 bool
vm_map_is_exotic(vm_map_t map)18024 vm_map_is_exotic(
18025 	vm_map_t map)
18026 {
18027 	return VM_MAP_IS_EXOTIC(map);
18028 }
18029 
18030 bool
vm_map_is_alien(vm_map_t map)18031 vm_map_is_alien(
18032 	vm_map_t map)
18033 {
18034 	return VM_MAP_IS_ALIEN(map);
18035 }
18036 
18037 #if XNU_TARGET_OS_OSX
/*
 * vm_map_mark_alien:
 *	Set the map's "is_alien" flag (the condition reported by
 *	vm_map_is_alien()).  Taken under the exclusive map lock so the
 *	update is serialized with other mutations of the map.
 */
void
vm_map_mark_alien(
	vm_map_t map)
{
	vm_map_lock(map);
	map->is_alien = true;
	vm_map_unlock(map);
}
18046 
/*
 * vm_map_single_jit:
 *	Set the map's "single_jit" flag under the exclusive map lock.
 *	NOTE(review): presumably this restricts the map to a single JIT
 *	region -- confirm against the flag's consumers in vm_map.h.
 */
void
vm_map_single_jit(
	vm_map_t map)
{
	vm_map_lock(map);
	map->single_jit = true;
	vm_map_unlock(map);
}
18055 #endif /* XNU_TARGET_OS_OSX */
18056 
18057 /*
18058  * Callers of this function must call vm_map_copy_require on
18059  * previously created vm_map_copy_t or pass a newly created
18060  * one to ensure that it hasn't been forged.
18061  */
18062 static kern_return_t
vm_map_copy_to_physcopy(vm_map_copy_t copy_map,vm_map_t target_map)18063 vm_map_copy_to_physcopy(
18064 	vm_map_copy_t   copy_map,
18065 	vm_map_t        target_map)
18066 {
18067 	vm_map_size_t           size;
18068 	vm_map_entry_t          entry;
18069 	vm_map_entry_t          new_entry;
18070 	vm_object_t             new_object;
18071 	unsigned int            pmap_flags;
18072 	pmap_t                  new_pmap;
18073 	vm_map_t                new_map;
18074 	vm_map_address_t        src_start, src_end, src_cur;
18075 	vm_map_address_t        dst_start, dst_end, dst_cur;
18076 	kern_return_t           kr;
18077 	void                    *kbuf;
18078 
18079 	/*
18080 	 * Perform the equivalent of vm_allocate() and memcpy().
18081 	 * Replace the mappings in "copy_map" with the newly allocated mapping.
18082 	 */
18083 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18084 
18085 	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
18086 
18087 	/* create a new pmap to map "copy_map" */
18088 	pmap_flags = 0;
18089 	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18090 #if PMAP_CREATE_FORCE_4K_PAGES
18091 	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18092 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
18093 	pmap_flags |= PMAP_CREATE_64BIT;
18094 	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18095 	if (new_pmap == NULL) {
18096 		return KERN_RESOURCE_SHORTAGE;
18097 	}
18098 
18099 	/* allocate new VM object */
18100 	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18101 	new_object = vm_object_allocate(size);
18102 	assert(new_object);
18103 
18104 	/* allocate new VM map entry */
18105 	new_entry = vm_map_copy_entry_create(copy_map);
18106 	assert(new_entry);
18107 
18108 	/* finish initializing new VM map entry */
18109 	new_entry->protection = VM_PROT_DEFAULT;
18110 	new_entry->max_protection = VM_PROT_DEFAULT;
18111 	new_entry->use_pmap = TRUE;
18112 
18113 	/* make new VM map entry point to new VM object */
18114 	new_entry->vme_start = 0;
18115 	new_entry->vme_end = size;
18116 	VME_OBJECT_SET(new_entry, new_object, false, 0);
18117 	VME_OFFSET_SET(new_entry, 0);
18118 
18119 	/* create a new pageable VM map to map "copy_map" */
18120 	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18121 	    VM_MAP_CREATE_PAGEABLE);
18122 	assert(new_map);
18123 	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18124 
18125 	/* map "copy_map" in the new VM map */
18126 	src_start = 0;
18127 	kr = vm_map_copyout_internal(
18128 		new_map,
18129 		&src_start,
18130 		copy_map,
18131 		copy_map->size,
18132 		FALSE, /* consume_on_success */
18133 		VM_PROT_DEFAULT,
18134 		VM_PROT_DEFAULT,
18135 		VM_INHERIT_DEFAULT);
18136 	assert(kr == KERN_SUCCESS);
18137 	src_end = src_start + copy_map->size;
18138 
18139 	/* map "new_object" in the new VM map */
18140 	vm_object_reference(new_object);
18141 	dst_start = 0;
18142 	kr = vm_map_enter(new_map,
18143 	    &dst_start,
18144 	    size,
18145 	    0,               /* mask */
18146 	    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18147 	    new_object,
18148 	    0,               /* offset */
18149 	    FALSE,               /* needs copy */
18150 	    VM_PROT_DEFAULT,
18151 	    VM_PROT_DEFAULT,
18152 	    VM_INHERIT_DEFAULT);
18153 	assert(kr == KERN_SUCCESS);
18154 	dst_end = dst_start + size;
18155 
18156 	/* get a kernel buffer */
18157 	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18158 
18159 	/* physically copy "copy_map" mappings to new VM object */
18160 	for (src_cur = src_start, dst_cur = dst_start;
18161 	    src_cur < src_end;
18162 	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18163 		vm_size_t bytes;
18164 
18165 		bytes = PAGE_SIZE;
18166 		if (src_cur + PAGE_SIZE > src_end) {
18167 			/* partial copy for last page */
18168 			bytes = src_end - src_cur;
18169 			assert(bytes > 0 && bytes < PAGE_SIZE);
18170 			/* rest of dst page should be zero-filled */
18171 		}
18172 		/* get bytes from src mapping */
18173 		kr = copyinmap(new_map, src_cur, kbuf, bytes);
18174 		if (kr != KERN_SUCCESS) {
18175 			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18176 		}
18177 		/* put bytes in dst mapping */
18178 		assert(dst_cur < dst_end);
18179 		assert(dst_cur + bytes <= dst_end);
18180 		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18181 		if (kr != KERN_SUCCESS) {
18182 			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18183 		}
18184 	}
18185 
18186 	/* free kernel buffer */
18187 	kfree_data(kbuf, PAGE_SIZE);
18188 
18189 	/* destroy new map */
18190 	vm_map_destroy(new_map);
18191 	new_map = VM_MAP_NULL;
18192 
18193 	/* dispose of the old map entries in "copy_map" */
18194 	while (vm_map_copy_first_entry(copy_map) !=
18195 	    vm_map_copy_to_entry(copy_map)) {
18196 		entry = vm_map_copy_first_entry(copy_map);
18197 		vm_map_copy_entry_unlink(copy_map, entry);
18198 		if (entry->is_sub_map) {
18199 			vm_map_deallocate(VME_SUBMAP(entry));
18200 		} else {
18201 			vm_object_deallocate(VME_OBJECT(entry));
18202 		}
18203 		vm_map_copy_entry_dispose(entry);
18204 	}
18205 
18206 	/* change "copy_map"'s page_size to match "target_map" */
18207 	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18208 	copy_map->offset = 0;
18209 	copy_map->size = size;
18210 
18211 	/* insert new map entry in "copy_map" */
18212 	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18213 	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18214 
18215 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18216 	return KERN_SUCCESS;
18217 }
18218 
18219 void
18220 vm_map_copy_adjust_get_target_copy_map(
18221 	vm_map_copy_t   copy_map,
18222 	vm_map_copy_t   *target_copy_map_p);
18223 void
vm_map_copy_adjust_get_target_copy_map(vm_map_copy_t copy_map,vm_map_copy_t * target_copy_map_p)18224 vm_map_copy_adjust_get_target_copy_map(
18225 	vm_map_copy_t   copy_map,
18226 	vm_map_copy_t   *target_copy_map_p)
18227 {
18228 	vm_map_copy_t   target_copy_map;
18229 	vm_map_entry_t  entry, target_entry;
18230 
18231 	if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18232 		/* the caller already has a "target_copy_map": use it */
18233 		return;
18234 	}
18235 
18236 	/* the caller wants us to create a new copy of "copy_map" */
18237 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18238 	target_copy_map = vm_map_copy_allocate(copy_map->type);
18239 	target_copy_map->offset = copy_map->offset;
18240 	target_copy_map->size = copy_map->size;
18241 	target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18242 	for (entry = vm_map_copy_first_entry(copy_map);
18243 	    entry != vm_map_copy_to_entry(copy_map);
18244 	    entry = entry->vme_next) {
18245 		target_entry = vm_map_copy_entry_create(target_copy_map);
18246 		vm_map_entry_copy_full(target_entry, entry);
18247 		if (target_entry->is_sub_map) {
18248 			vm_map_reference(VME_SUBMAP(target_entry));
18249 		} else {
18250 			vm_object_reference(VME_OBJECT(target_entry));
18251 		}
18252 		vm_map_copy_entry_link(
18253 			target_copy_map,
18254 			vm_map_copy_last_entry(target_copy_map),
18255 			target_entry);
18256 	}
18257 	entry = VM_MAP_ENTRY_NULL;
18258 	*target_copy_map_p = target_copy_map;
18259 }
18260 
/*
 * vm_map_copy_trim:
 *	Remove the range ["trim_start", "trim_end") -- expressed as offsets
 *	relative to the start of the first entry -- from "copy_map",
 *	clipping boundary entries at "new_page_shift" page boundaries and
 *	releasing each removed entry's submap/object reference.
 *
 * Callers of this function must call vm_map_copy_require on
 * previously created vm_map_copy_t or pass a newly created
 * one to ensure that it hasn't been forged.
 */
static void
vm_map_copy_trim(
	vm_map_copy_t   copy_map,
	uint16_t        new_page_shift,
	vm_map_offset_t trim_start,
	vm_map_offset_t trim_end)
{
	uint16_t        copy_page_shift;
	vm_map_entry_t  entry, next_entry;

	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
	assert(copy_map->cpy_hdr.nentries > 0);

	/* convert the relative offsets into absolute map addresses */
	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;

	/* use the new page_shift to do the clipping */
	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_map->cpy_hdr.page_shift = new_page_shift;

	for (entry = vm_map_copy_first_entry(copy_map);
	    entry != vm_map_copy_to_entry(copy_map);
	    entry = next_entry) {
		/* capture the successor before we unlink/dispose "entry" */
		next_entry = entry->vme_next;
		if (entry->vme_end <= trim_start) {
			/* entry fully before trim range: skip */
			continue;
		}
		if (entry->vme_start >= trim_end) {
			/* entry fully after trim range: done */
			break;
		}
		/* clip entry if needed */
		vm_map_copy_clip_start(copy_map, entry, trim_start);
		vm_map_copy_clip_end(copy_map, entry, trim_end);
		/* dispose of entry (and drop its backing reference) */
		copy_map->size -= entry->vme_end - entry->vme_start;
		vm_map_copy_entry_unlink(copy_map, entry);
		if (entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(entry));
		} else {
			vm_object_deallocate(VME_OBJECT(entry));
		}
		vm_map_copy_entry_dispose(entry);
		entry = VM_MAP_ENTRY_NULL;
	}

	/* restore copy_map's original page_shift */
	copy_map->cpy_hdr.page_shift = copy_page_shift;
}
18316 
/*
 * Make any necessary adjustments to "copy_map" to allow it to be
 * mapped into "target_map".
 * If no changes were necessary, "target_copy_map" points to the
 * untouched "copy_map".
 * If changes are necessary, changes will be made to "target_copy_map".
 * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
 * copy the original "copy_map" to it before applying the changes.
 * The caller should discard "target_copy_map" if it's not the same as
 * the original "copy_map".
 *
 * Out parameters:
 *	*trimmed_start_p: how much was trimmed off the start of the copy.
 *	*overmap_start_p / *overmap_end_p: how much extra was mapped at
 *		the start/end to re-align mis-aligned boundary entries.
 *
 * Returns KERN_INVALID_ARGUMENT if [offset, offset+size) exceeds the
 * copy, KERN_NOT_SUPPORTED if sharing mis-aligned mappings was
 * requested, otherwise the result of the adjustment (KERN_SUCCESS or
 * a physical-copy failure code).
 */
/* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
kern_return_t
vm_map_copy_adjust_to_target(
	vm_map_copy_t           src_copy_map,
	vm_map_offset_t         offset,
	vm_map_size_t           size,
	vm_map_t                target_map,
	boolean_t               copy,
	vm_map_copy_t           *target_copy_map_p,
	vm_map_offset_t         *overmap_start_p,
	vm_map_offset_t         *overmap_end_p,
	vm_map_offset_t         *trimmed_start_p)
{
	vm_map_copy_t           copy_map, target_copy_map;
	vm_map_size_t           target_size;
	vm_map_size_t           src_copy_map_size;
	vm_map_size_t           overmap_start, overmap_end;
	int                     misalignments;
	vm_map_entry_t          entry, target_entry;
	vm_map_offset_t         addr_adjustment;
	vm_map_offset_t         new_start, new_end;
	int                     copy_page_mask, target_page_mask;
	uint16_t                copy_page_shift, target_page_shift;
	vm_map_offset_t         trimmed_end;

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(src_copy_map);
	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);

	/*
	 * Start working with "src_copy_map" but we'll switch
	 * to "target_copy_map" as soon as we start making adjustments.
	 */
	copy_map = src_copy_map;
	src_copy_map_size = src_copy_map->size;

	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
	target_page_mask = VM_MAP_PAGE_MASK(target_map);

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p);

	target_copy_map = *target_copy_map_p;
	if (target_copy_map != VM_MAP_COPY_NULL) {
		vm_map_copy_require(target_copy_map);
	}

	/* requested range must fit within the copy */
	if (offset + size > copy_map->size) {
		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size);
		return KERN_INVALID_ARGUMENT;
	}

	/* trim the end */
	trimmed_end = 0;
	new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask);
	if (new_end < copy_map->size) {
		trimmed_end = src_copy_map_size - new_end;
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    new_end, copy_map->size);
	}

	/* trim the start */
	new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask);
	if (new_start != 0) {
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    0, new_start);
	}
	*trimmed_start_p = new_start;

	/* target_size starts with what's left after trimming */
	target_size = copy_map->size;
	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);

	/* check for misalignments but don't adjust yet */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	if (copy_page_shift < target_page_shift) {
		/*
		 * Remapping from 4K to 16K: check the VM object alignments
		 * throughout the range.
		 * If the start and end of the range are mis-aligned, we can
		 * over-map to re-align, and adjust the "overmap" start/end
		 * and "target_size" of the range accordingly.
		 * If there is any mis-alignment within the range:
		 *     if "copy":
		 *         we can do immediate-copy instead of copy-on-write,
		 *     else:
		 *         no way to remap and share; fail.
		 */
		for (entry = vm_map_copy_first_entry(copy_map);
		    entry != vm_map_copy_to_entry(copy_map);
		    entry = entry->vme_next) {
			vm_object_offset_t object_offset_start, object_offset_end;

			object_offset_start = VME_OFFSET(entry);
			object_offset_end = object_offset_start;
			object_offset_end += entry->vme_end - entry->vme_start;
			if (object_offset_start & target_page_mask) {
				/* only the first entry can be fixed by over-mapping */
				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
					overmap_start++;
				} else {
					misalignments++;
				}
			}
			if (object_offset_end & target_page_mask) {
				/* only the last entry can be fixed by over-mapping */
				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
					overmap_end++;
				} else {
					misalignments++;
				}
			}
		}
	}
	entry = VM_MAP_ENTRY_NULL;

	/* decide how to deal with misalignments */
	assert(overmap_start <= 1);
	assert(overmap_end <= 1);
	if (!overmap_start && !overmap_end && !misalignments) {
		/* copy_map is properly aligned for target_map ... */
		if (*trimmed_start_p) {
			/* ... but we trimmed it, so still need to adjust */
		} else {
			/* ... and we didn't trim anything: we're done */
			if (target_copy_map == VM_MAP_COPY_NULL) {
				target_copy_map = copy_map;
			}
			*target_copy_map_p = target_copy_map;
			*overmap_start_p = 0;
			*overmap_end_p = 0;
			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
			return KERN_SUCCESS;
		}
	} else if (misalignments && !copy) {
		/* can't "share" if misaligned */
		DEBUG4K_ADJUST("unsupported sharing\n");
#if MACH_ASSERT
		if (debug4k_panic_on_misaligned_sharing) {
			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
		}
#endif /* MACH_ASSERT */
		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
		return KERN_NOT_SUPPORTED;
	} else {
		/* can't virtual-copy if misaligned (but can physical-copy) */
		DEBUG4K_ADJUST("mis-aligned copying\n");
	}

	/* get a "target_copy_map" if needed and switch to it */
	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
	copy_map = target_copy_map;

	if (misalignments && copy) {
		vm_map_size_t target_copy_map_size;

		/*
		 * Can't do copy-on-write with misaligned mappings.
		 * Replace the mappings with a physical copy of the original
		 * mappings' contents.
		 */
		target_copy_map_size = target_copy_map->size;
		kern_return_t kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		*target_copy_map_p = target_copy_map;
		*overmap_start_p = 0;
		/* physcopy rounded the size up: report the growth as overmap */
		*overmap_end_p = target_copy_map->size - target_copy_map_size;
		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
		return KERN_SUCCESS;
	}

	/* apply the adjustments */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	/* remove copy_map->offset, so that everything starts at offset 0 */
	addr_adjustment = copy_map->offset;
	/* also remove whatever we trimmed from the start */
	addr_adjustment += *trimmed_start_p;
	for (target_entry = vm_map_copy_first_entry(target_copy_map);
	    target_entry != vm_map_copy_to_entry(target_copy_map);
	    target_entry = target_entry->vme_next) {
		vm_object_offset_t object_offset_start, object_offset_end;

		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
		object_offset_start = VME_OFFSET(target_entry);
		if (object_offset_start & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
				/*
				 * start of 1st entry is mis-aligned:
				 * re-adjust by over-mapping.
				 */
				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				assert(copy);
			}
		}

		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
			target_size += overmap_start;
		} else {
			/* shift subsequent entries by the start over-mapping */
			target_entry->vme_start += overmap_start;
		}
		target_entry->vme_end += overmap_start;

		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
		if (object_offset_end & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
				/*
				 * end of last entry is mis-aligned: re-adjust by over-mapping.
				 */
				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
				target_entry->vme_end += overmap_end;
				target_size += overmap_end;
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				assert(copy);
			}
		}
		/* rebase the entry so the copy starts at address 0 */
		target_entry->vme_start -= addr_adjustment;
		target_entry->vme_end -= addr_adjustment;
		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
	}

	target_copy_map->size = target_size;
	target_copy_map->offset += overmap_start;
	target_copy_map->offset -= addr_adjustment;
	target_copy_map->cpy_hdr.page_shift = target_page_shift;

//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));

	*target_copy_map_p = target_copy_map;
	*overmap_start_p = overmap_start;
	*overmap_end_p = overmap_end;

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
	return KERN_SUCCESS;
}
18596 
kern_return_t
vm_map_range_physical_size(
	vm_map_t         map,
	vm_map_address_t start,
	mach_vm_size_t   size,
	mach_vm_size_t * phys_size)
{
	kern_return_t   kr;
	vm_map_copy_t   copy_map, target_copy_map;
	vm_map_offset_t adjusted_start, adjusted_end;
	vm_map_size_t   adjusted_size;
	vm_prot_t       cur_prot, max_prot;
	vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
	vm_map_kernel_flags_t vmk_flags;

	/*
	 * Compute the size, in bytes of system pages, that the range
	 * [start, start+size) of "map" occupies once mapped at the kernel
	 * page size.  For maps whose page size already matches PAGE_SIZE
	 * this is just the page-rounded span; for smaller-page maps
	 * (e.g. 4K maps on a 16K system) the range is re-aligned via
	 * vm_map_copy_adjust_to_target() and can grow by the over-mapped
	 * head/tail.  "*phys_size" is always written, 0 on failure.
	 */
	if (size == 0) {
		/* empty range: trivially zero physical size */
		DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
		*phys_size = 0;
		return KERN_SUCCESS;
	}

	/* round the range out to the map's own page boundaries */
	adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
	adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
	if (__improbable(os_add_overflow(start, size, &end) ||
	    adjusted_end <= adjusted_start)) {
		/* wraparound */
		printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
		*phys_size = 0;
		return KERN_INVALID_ARGUMENT;
	}
	if (__improbable(vm_map_range_overflows(map, start, size))) {
		*phys_size = 0;
		return KERN_INVALID_ADDRESS;
	}
	assert(adjusted_end > adjusted_start);
	adjusted_size = adjusted_end - adjusted_start;
	*phys_size = adjusted_size;
	if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
		/* map already uses the system page size: no adjustment needed */
		return KERN_SUCCESS;
	}
	if (start == 0) {
		/*
		 * NOTE(review): address 0 is special-cased here — the range is
		 * simply re-rounded to the system page size instead of going
		 * through vm_map_copy_extract() below (presumably because
		 * extracting at address 0 would fail); confirm against callers.
		 */
		adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
		adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
		if (__improbable(adjusted_end <= adjusted_start)) {
			/* wraparound */
			printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
			*phys_size = 0;
			return KERN_INVALID_ARGUMENT;
		}
		assert(adjusted_end > adjusted_start);
		adjusted_size = adjusted_end - adjusted_start;
		*phys_size = adjusted_size;
		return KERN_SUCCESS;
	}

	/*
	 * Extract a (non-copying) vm_map_copy_t describing the range, then
	 * ask vm_map_copy_adjust_to_target() how large it would be when
	 * mapped into a map using the kernel page size (kernel_map).
	 */
	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	vmk_flags.vmkf_copy_pageable = TRUE;
	vmk_flags.vmkf_copy_same_map = TRUE;
	assert(adjusted_size != 0);
	cur_prot = VM_PROT_NONE; /* legacy mode */
	max_prot = VM_PROT_NONE; /* legacy mode */
	kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
	    FALSE /* copy */,
	    &copy_map,
	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
	    vmk_flags);
	if (kr != KERN_SUCCESS) {
		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
		//assert(0);
		*phys_size = 0;
		return kr;
	}
	assert(copy_map != VM_MAP_COPY_NULL);
	target_copy_map = copy_map;
	DEBUG4K_ADJUST("adjusting...\n");
	kr = vm_map_copy_adjust_to_target(
		copy_map,
		start - adjusted_start, /* offset */
		size, /* size */
		kernel_map,
		FALSE,                          /* copy */
		&target_copy_map,
		&overmap_start,
		&overmap_end,
		&trimmed_start);
	if (kr == KERN_SUCCESS) {
		if (target_copy_map->size != *phys_size) {
			DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
		}
		/* the adjusted copy's size is the physical footprint */
		*phys_size = target_copy_map->size;
	} else {
		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
		//assert(0);
		*phys_size = 0;
	}
	/*
	 * Discard the extracted copy.  When no adjustment was needed,
	 * "target_copy_map" aliases "copy_map", so a single discard suffices.
	 */
	vm_map_copy_discard(copy_map);
	copy_map = VM_MAP_COPY_NULL;

	return kr;
}
18697 
18698 
18699 kern_return_t
memory_entry_check_for_adjustment(vm_map_t src_map,ipc_port_t port,vm_map_offset_t * overmap_start,vm_map_offset_t * overmap_end)18700 memory_entry_check_for_adjustment(
18701 	vm_map_t                        src_map,
18702 	ipc_port_t                      port,
18703 	vm_map_offset_t         *overmap_start,
18704 	vm_map_offset_t         *overmap_end)
18705 {
18706 	kern_return_t kr = KERN_SUCCESS;
18707 	vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL;
18708 
18709 	assert(port);
18710 	assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port));
18711 
18712 	vm_named_entry_t        named_entry;
18713 
18714 	named_entry = mach_memory_entry_from_port(port);
18715 	named_entry_lock(named_entry);
18716 	copy_map = named_entry->backing.copy;
18717 	target_copy_map = copy_map;
18718 
18719 	if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) {
18720 		vm_map_offset_t trimmed_start;
18721 
18722 		trimmed_start = 0;
18723 		DEBUG4K_ADJUST("adjusting...\n");
18724 		kr = vm_map_copy_adjust_to_target(
18725 			copy_map,
18726 			0, /* offset */
18727 			copy_map->size, /* size */
18728 			src_map,
18729 			FALSE, /* copy */
18730 			&target_copy_map,
18731 			overmap_start,
18732 			overmap_end,
18733 			&trimmed_start);
18734 		assert(trimmed_start == 0);
18735 	}
18736 	named_entry_unlock(named_entry);
18737 
18738 	return kr;
18739 }
18740 
18741 
18742 /*
18743  *	Routine:	vm_remap
18744  *
18745  *			Map portion of a task's address space.
18746  *			Mapped region must not overlap more than
18747  *			one vm memory object. Protections and
18748  *			inheritance attributes remain the same
18749  *			as in the original task and are	out parameters.
18750  *			Source and Target task can be identical
18751  *			Other attributes are identical as for vm_map()
18752  */
kern_return_t
vm_map_remap(
	vm_map_t                target_map,
	vm_map_address_t        *address,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_t                src_map,
	vm_map_offset_t         memory_address,
	boolean_t               copy,
	vm_prot_t               *cur_protection, /* IN/OUT */
	vm_prot_t               *max_protection, /* IN/OUT */
	vm_inherit_t            inheritance)
{
	kern_return_t           result;
	vm_map_entry_t          entry;
	vm_map_entry_t          insp_entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t          new_entry;
	vm_map_copy_t           copy_map;
	vm_map_offset_t         offset_in_mapping;
	vm_map_size_t           target_size = 0;
	vm_map_size_t           src_page_mask, target_page_mask;
	vm_map_offset_t         overmap_start, overmap_end, trimmed_start;
	vm_map_offset_t         initial_memory_address;
	vm_map_size_t           initial_size;
	VM_MAP_ZAP_DECLARE(zap_list);

	if (target_map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (__improbable(vm_map_range_overflows(src_map, memory_address, size))) {
		return KERN_INVALID_ARGUMENT;
	}

	/* remember the caller's original (un-rounded) range */
	initial_memory_address = memory_address;
	initial_size = size;
	src_page_mask = VM_MAP_PAGE_MASK(src_map);
	target_page_mask = VM_MAP_PAGE_MASK(target_map);

	/* reject a zero size, a null source map, or an invalid inheritance */
	switch (inheritance) {
	case VM_INHERIT_NONE:
	case VM_INHERIT_COPY:
	case VM_INHERIT_SHARE:
		if (size != 0 && src_map != VM_MAP_NULL) {
			break;
		}
		OS_FALLTHROUGH;
	default:
		return KERN_INVALID_ARGUMENT;
	}

	if (src_page_mask != target_page_mask) {
		/* remapping across maps with different page sizes: log it */
		if (copy) {
			DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
		} else {
			DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
		}
	}

	/*
	 * If the user is requesting that we return the address of the
	 * first byte of the data (rather than the base of the page),
	 * then we use different rounding semantics: specifically,
	 * we assume that (memory_address, size) describes a region
	 * all of whose pages we must cover, rather than a base to be truncated
	 * down and a size to be added to that base.  So we figure out
	 * the highest page that the requested region includes and make
	 * sure that the size will cover it.
	 *
	 * The key example we're worried about it is of the form:
	 *
	 *              memory_address = 0x1ff0, size = 0x20
	 *
	 * With the old semantics, we round down the memory_address to 0x1000
	 * and round up the size to 0x1000, resulting in our covering *only*
	 * page 0x1000.  With the new semantics, we'd realize that the region covers
	 * 0x1ff0-0x2010, and compute a size of 0x2000.  Thus, we cover both page
	 * 0x1000 and page 0x2000 in the region we remap.
	 */
	if (vmk_flags.vmf_return_data_addr) {
		vm_map_offset_t range_start, range_end;

		range_start = vm_map_trunc_page(memory_address, src_page_mask);
		range_end = vm_map_round_page(memory_address + size, src_page_mask);
		memory_address = range_start;
		size = range_end - range_start;
		/* how far into the first page the requested data starts */
		offset_in_mapping = initial_memory_address - memory_address;
	} else {
		/*
		 * IMPORTANT:
		 * This legacy code path is broken: for the range mentioned
		 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
		 * two 4k pages, it yields [ memory_address = 0x1000,
		 * size = 0x1000 ], which covers only the first 4k page.
		 * BUT some code unfortunately depends on this bug, so we
		 * can't fix it without breaking something.
		 * New code should get automatically opted in the new
		 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
		 */
		offset_in_mapping = 0;
		memory_address = vm_map_trunc_page(memory_address, src_page_mask);
		size = vm_map_round_page(size, src_page_mask);
		initial_memory_address = memory_address;
		initial_size = size;
	}


	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	if (vmk_flags.vmf_resilient_media) {
		/* must be copy-on-write to be "media resilient" */
		if (!copy) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
	vmk_flags.vmkf_copy_same_map = (src_map == target_map);

	/* build a copy map describing the source range */
	assert(size != 0);
	result = vm_map_copy_extract(src_map,
	    memory_address,
	    size,
	    copy, &copy_map,
	    cur_protection, /* IN/OUT */
	    max_protection, /* IN/OUT */
	    inheritance,
	    vmk_flags);
	if (result != KERN_SUCCESS) {
		return result;
	}
	assert(copy_map != VM_MAP_COPY_NULL);

	/*
	 * Handle the policy for vm map ranges
	 *
	 * If the maps differ, the target_map policy applies like for vm_map()
	 * For same mapping remaps, we preserve the range.
	 */
	if (vmk_flags.vmkf_copy_same_map) {
		vmk_flags.vmkf_range_id = copy_map->orig_range;
	} else {
		vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
	}

	overmap_start = 0;
	overmap_end = 0;
	trimmed_start = 0;
	target_size = size;
	if (src_page_mask != target_page_mask) {
		/*
		 * Page sizes differ: re-align the copy map to the target map's
		 * page size.  This may trim the start and/or over-map at both
		 * ends, which in turn shifts "offset_in_mapping".
		 */
		vm_map_copy_t target_copy_map;

		target_copy_map = copy_map; /* can modify "copy_map" itself */
		DEBUG4K_ADJUST("adjusting...\n");
		result = vm_map_copy_adjust_to_target(
			copy_map,
			offset_in_mapping, /* offset */
			initial_size,
			target_map,
			copy,
			&target_copy_map,
			&overmap_start,
			&overmap_end,
			&trimmed_start);
		if (result != KERN_SUCCESS) {
			DEBUG4K_COPY("failed to adjust 0x%x\n", result);
			vm_map_copy_discard(copy_map);
			return result;
		}
		if (trimmed_start == 0) {
			/* nothing trimmed: no adjustment needed */
		} else if (trimmed_start >= offset_in_mapping) {
			/* trimmed more than offset_in_mapping: nothing left */
			assert(overmap_start == 0);
			assert(overmap_end == 0);
			offset_in_mapping = 0;
		} else {
			/* trimmed some of offset_in_mapping: adjust */
			assert(overmap_start == 0);
			assert(overmap_end == 0);
			offset_in_mapping -= trimmed_start;
		}
		offset_in_mapping += overmap_start;
		target_size = target_copy_map->size;
	}

	/*
	 * Allocate/check a range of free virtual address
	 * space for the target
	 */
	*address = vm_map_trunc_page(*address, target_page_mask);
	vm_map_lock(target_map);
	target_size = vm_map_round_page(target_size, target_page_mask);
	result = vm_map_remap_range_allocate(target_map, address,
	    target_size, mask, vmk_flags,
	    &insp_entry, &zap_list);

	/*
	 * Move each entry out of the copy map and into the target map
	 * (on success), or dispose of it (on failure).  Note the loop runs
	 * to completion in both cases so the copy map is fully drained.
	 */
	for (entry = vm_map_copy_first_entry(copy_map);
	    entry != vm_map_copy_to_entry(copy_map);
	    entry = new_entry) {
		new_entry = entry->vme_next;
		vm_map_copy_entry_unlink(copy_map, entry);
		if (result == KERN_SUCCESS) {
			if (vmk_flags.vmkf_remap_prot_copy) {
				/*
				 * This vm_map_remap() is for a
				 * vm_protect(VM_PROT_COPY), so the caller
				 * expects to be allowed to add write access
				 * to this new mapping.  This is done by
				 * adding VM_PROT_WRITE to each entry's
				 * max_protection... unless some security
				 * settings disallow it.
				 */
				bool allow_write = false;
				if (entry->vme_permanent) {
					/* immutable mapping... */
					if ((entry->max_protection & VM_PROT_EXECUTE) &&
					    developer_mode_state()) {
						/*
						 * ... but executable and
						 * possibly being debugged,
						 * so let's allow it to become
						 * writable, for breakpoints
						 * and dtrace probes, for
						 * example.
						 */
						allow_write = true;
					} else {
						printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
						    proc_selfpid(),
						    (get_bsdtask_info(current_task())
						    ? proc_name_address(get_bsdtask_info(current_task()))
						    : "?"),
						    (uint64_t)memory_address,
						    (uint64_t)size,
						    entry->protection,
						    entry->max_protection,
						    developer_mode_state());
						DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
						    vm_map_entry_t, entry,
						    vm_map_offset_t, entry->vme_start,
						    vm_map_offset_t, entry->vme_end,
						    vm_prot_t, entry->protection,
						    vm_prot_t, entry->max_protection,
						    int, VME_ALIAS(entry));
					}
				} else {
					allow_write = true;
				}

				/*
				 * VM_PROT_COPY: allow this mapping to become
				 * writable, unless it was "permanent".
				 */
				if (allow_write) {
					entry->max_protection |= VM_PROT_WRITE;
				}
			}
			if (vmk_flags.vmf_resilient_codesign) {
				/* no codesigning -> read-only access */
				entry->max_protection = VM_PROT_READ;
				entry->protection = VM_PROT_READ;
				entry->vme_resilient_codesign = TRUE;
			}
			/* copy-map entries are relative to 0: rebase to *address */
			entry->vme_start += *address;
			entry->vme_end += *address;
			assert(!entry->map_aligned);
			if (vmk_flags.vmf_resilient_media &&
			    !entry->is_sub_map &&
			    (VME_OBJECT(entry) == VM_OBJECT_NULL ||
			    VME_OBJECT(entry)->internal)) {
				entry->vme_resilient_media = TRUE;
			}
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
			assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
			vm_map_store_entry_link(target_map, insp_entry, entry,
			    vmk_flags);
			insp_entry = entry;
		} else {
			/* allocation failed: drop this entry's reference and free it */
			if (!entry->is_sub_map) {
				vm_object_deallocate(VME_OBJECT(entry));
			} else {
				vm_map_deallocate(VME_SUBMAP(entry));
			}
			vm_map_copy_entry_dispose(entry);
		}
	}

	if (vmk_flags.vmf_resilient_codesign) {
		/* reflect the forced read-only protections back to the caller */
		*cur_protection = VM_PROT_READ;
		*max_protection = VM_PROT_READ;
	}

	if (result == KERN_SUCCESS) {
		target_map->size += target_size;
		SAVE_HINT_MAP_WRITE(target_map, insp_entry);
	}
	vm_map_unlock(target_map);

	/* free anything removed by an overwrite-style allocation */
	vm_map_zap_dispose(&zap_list);

	if (result == KERN_SUCCESS && target_map->wiring_required) {
		result = vm_map_wire_kernel(target_map, *address,
		    *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
		    TRUE);
	}

	/*
	 * If requested, return the address of the data pointed to by the
	 * request, rather than the base of the resulting page.
	 */
	if (vmk_flags.vmf_return_data_addr) {
		*address += offset_in_mapping;
	}

	if (src_page_mask != target_page_mask) {
		DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx  result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result);
	}
	/* the copy map itself (now drained of entries) is no longer needed */
	vm_map_copy_discard(copy_map);
	copy_map = VM_MAP_COPY_NULL;

	return result;
}
19080 
19081 /*
19082  *	Routine:	vm_map_remap_range_allocate
19083  *
19084  *	Description:
19085  *		Allocate a range in the specified virtual address map.
19086  *		returns the address and the map entry just before the allocated
19087  *		range
19088  *
19089  *	Map must be locked.
19090  */
19091 
19092 static kern_return_t
vm_map_remap_range_allocate(vm_map_t map,vm_map_address_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_entry_t * map_entry,vm_map_zap_t zap_list)19093 vm_map_remap_range_allocate(
19094 	vm_map_t                map,
19095 	vm_map_address_t        *address,       /* IN/OUT */
19096 	vm_map_size_t           size,
19097 	vm_map_offset_t         mask,
19098 	vm_map_kernel_flags_t   vmk_flags,
19099 	vm_map_entry_t          *map_entry,     /* OUT */
19100 	vm_map_zap_t            zap_list)
19101 {
19102 	vm_map_entry_t  entry;
19103 	vm_map_offset_t start;
19104 	kern_return_t   kr;
19105 
19106 	start = *address;
19107 
19108 	if (!vmk_flags.vmf_fixed) {
19109 		kr = vm_map_locate_space(map, size, mask, vmk_flags,
19110 		    &start, &entry);
19111 		if (kr != KERN_SUCCESS) {
19112 			return kr;
19113 		}
19114 		*address = start;
19115 	} else {
19116 		vm_map_offset_t effective_min_offset, effective_max_offset;
19117 		vm_map_entry_t  temp_entry;
19118 		vm_map_offset_t end;
19119 
19120 		effective_min_offset = map->min_offset;
19121 		effective_max_offset = map->max_offset;
19122 
19123 		/*
19124 		 *	Verify that:
19125 		 *		the address doesn't itself violate
19126 		 *		the mask requirement.
19127 		 */
19128 
19129 		if ((start & mask) != 0) {
19130 			return KERN_NO_SPACE;
19131 		}
19132 
19133 #if CONFIG_MAP_RANGES
19134 		if (map->uses_user_ranges) {
19135 			struct mach_vm_range r;
19136 
19137 			vm_map_user_range_resolve(map, start, 1, &r);
19138 			if (r.max_address == 0) {
19139 				return KERN_INVALID_ADDRESS;
19140 			}
19141 
19142 			effective_min_offset = r.min_address;
19143 			effective_max_offset = r.max_address;
19144 		}
19145 #endif /* CONFIG_MAP_RANGES */
19146 		if (map == kernel_map) {
19147 			mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
19148 			effective_min_offset = r->min_address;
19149 			effective_min_offset = r->max_address;
19150 		}
19151 
19152 		/*
19153 		 *	...	the address is within bounds
19154 		 */
19155 
19156 		end = start + size;
19157 
19158 		if ((start < effective_min_offset) ||
19159 		    (end > effective_max_offset) ||
19160 		    (start >= end)) {
19161 			return KERN_INVALID_ADDRESS;
19162 		}
19163 
19164 		/*
19165 		 * If we're asked to overwrite whatever was mapped in that
19166 		 * range, first deallocate that range.
19167 		 */
19168 		if (vmk_flags.vmf_overwrite) {
19169 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
19170 
19171 			/*
19172 			 * We use a "zap_list" to avoid having to unlock
19173 			 * the "map" in vm_map_delete(), which would compromise
19174 			 * the atomicity of the "deallocate" and then "remap"
19175 			 * combination.
19176 			 */
19177 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
19178 
19179 			if (vmk_flags.vmkf_overwrite_immutable) {
19180 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
19181 			}
19182 			if (vmk_flags.vmkf_remap_prot_copy) {
19183 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
19184 			}
19185 			kr = vm_map_delete(map, start, end, remove_flags,
19186 			    KMEM_GUARD_NONE, zap_list).kmr_return;
19187 			if (kr != KERN_SUCCESS) {
19188 				/* XXX FBDP restore zap_list? */
19189 				return kr;
19190 			}
19191 		}
19192 
19193 		/*
19194 		 *	...	the starting address isn't allocated
19195 		 */
19196 
19197 		if (vm_map_lookup_entry(map, start, &temp_entry)) {
19198 			return KERN_NO_SPACE;
19199 		}
19200 
19201 		entry = temp_entry;
19202 
19203 		/*
19204 		 *	...	the next region doesn't overlap the
19205 		 *		end point.
19206 		 */
19207 
19208 		if ((entry->vme_next != vm_map_to_entry(map)) &&
19209 		    (entry->vme_next->vme_start < end)) {
19210 			return KERN_NO_SPACE;
19211 		}
19212 	}
19213 	*map_entry = entry;
19214 	return KERN_SUCCESS;
19215 }
19216 
19217 /*
19218  *	vm_map_switch:
19219  *
19220  *	Set the address map for the current thread to the specified map
19221  */
19222 
19223 vm_map_t
vm_map_switch(vm_map_t map)19224 vm_map_switch(
19225 	vm_map_t        map)
19226 {
19227 	thread_t        thread = current_thread();
19228 	vm_map_t        oldmap = thread->map;
19229 
19230 
19231 	/*
19232 	 *	Deactivate the current map and activate the requested map
19233 	 */
19234 	mp_disable_preemption();
19235 	PMAP_SWITCH_USER(thread, map, cpu_number());
19236 	mp_enable_preemption();
19237 	return oldmap;
19238 }
19239 
19240 
19241 /*
19242  *	Routine:	vm_map_write_user
19243  *
19244  *	Description:
19245  *		Copy out data from a kernel space into space in the
19246  *		destination map. The space must already exist in the
19247  *		destination map.
19248  *		NOTE:  This routine should only be called by threads
19249  *		which can block on a page fault. i.e. kernel mode user
19250  *		threads.
19251  *
19252  */
19253 kern_return_t
vm_map_write_user(vm_map_t map,void * src_p,vm_map_address_t dst_addr,vm_size_t size)19254 vm_map_write_user(
19255 	vm_map_t                map,
19256 	void                    *src_p,
19257 	vm_map_address_t        dst_addr,
19258 	vm_size_t               size)
19259 {
19260 	kern_return_t   kr = KERN_SUCCESS;
19261 
19262 	if (__improbable(vm_map_range_overflows(map, dst_addr, size))) {
19263 		return KERN_INVALID_ADDRESS;
19264 	}
19265 
19266 	if (current_map() == map) {
19267 		if (copyout(src_p, dst_addr, size)) {
19268 			kr = KERN_INVALID_ADDRESS;
19269 		}
19270 	} else {
19271 		vm_map_t        oldmap;
19272 
19273 		/* take on the identity of the target map while doing */
19274 		/* the transfer */
19275 
19276 		vm_map_reference(map);
19277 		oldmap = vm_map_switch(map);
19278 		if (copyout(src_p, dst_addr, size)) {
19279 			kr = KERN_INVALID_ADDRESS;
19280 		}
19281 		vm_map_switch(oldmap);
19282 		vm_map_deallocate(map);
19283 	}
19284 	return kr;
19285 }
19286 
19287 /*
19288  *	Routine:	vm_map_read_user
19289  *
19290  *	Description:
19291  *		Copy in data from a user space source map into the
19292  *		kernel map. The space must already exist in the
19293  *		kernel map.
19294  *		NOTE:  This routine should only be called by threads
19295  *		which can block on a page fault. i.e. kernel mode user
19296  *		threads.
19297  *
19298  */
19299 kern_return_t
vm_map_read_user(vm_map_t map,vm_map_address_t src_addr,void * dst_p,vm_size_t size)19300 vm_map_read_user(
19301 	vm_map_t                map,
19302 	vm_map_address_t        src_addr,
19303 	void                    *dst_p,
19304 	vm_size_t               size)
19305 {
19306 	kern_return_t   kr = KERN_SUCCESS;
19307 
19308 	if (__improbable(vm_map_range_overflows(map, src_addr, size))) {
19309 		return KERN_INVALID_ADDRESS;
19310 	}
19311 
19312 	if (current_map() == map) {
19313 		if (copyin(src_addr, dst_p, size)) {
19314 			kr = KERN_INVALID_ADDRESS;
19315 		}
19316 	} else {
19317 		vm_map_t        oldmap;
19318 
19319 		/* take on the identity of the target map while doing */
19320 		/* the transfer */
19321 
19322 		vm_map_reference(map);
19323 		oldmap = vm_map_switch(map);
19324 		if (copyin(src_addr, dst_p, size)) {
19325 			kr = KERN_INVALID_ADDRESS;
19326 		}
19327 		vm_map_switch(oldmap);
19328 		vm_map_deallocate(map);
19329 	}
19330 	return kr;
19331 }
19332 
19333 
19334 /*
19335  *	vm_map_check_protection:
19336  *
19337  *	Assert that the target map allows the specified
19338  *	privilege on the entire address region given.
19339  *	The entire region must be allocated.
19340  */
19341 boolean_t
vm_map_check_protection(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t protection)19342 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
19343     vm_map_offset_t end, vm_prot_t protection)
19344 {
19345 	vm_map_entry_t entry;
19346 	vm_map_entry_t tmp_entry;
19347 
19348 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
19349 		return FALSE;
19350 	}
19351 
19352 	vm_map_lock(map);
19353 
19354 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
19355 		vm_map_unlock(map);
19356 		return FALSE;
19357 	}
19358 
19359 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19360 		vm_map_unlock(map);
19361 		return FALSE;
19362 	}
19363 
19364 	entry = tmp_entry;
19365 
19366 	while (start < end) {
19367 		if (entry == vm_map_to_entry(map)) {
19368 			vm_map_unlock(map);
19369 			return FALSE;
19370 		}
19371 
19372 		/*
19373 		 *	No holes allowed!
19374 		 */
19375 
19376 		if (start < entry->vme_start) {
19377 			vm_map_unlock(map);
19378 			return FALSE;
19379 		}
19380 
19381 		/*
19382 		 * Check protection associated with entry.
19383 		 */
19384 
19385 		if ((entry->protection & protection) != protection) {
19386 			vm_map_unlock(map);
19387 			return FALSE;
19388 		}
19389 
19390 		/* go to next entry */
19391 
19392 		start = entry->vme_end;
19393 		entry = entry->vme_next;
19394 	}
19395 	vm_map_unlock(map);
19396 	return TRUE;
19397 }
19398 
kern_return_t
vm_map_purgable_control(
	vm_map_t                map,
	vm_map_offset_t         address,
	vm_purgable_t           control,
	int                     *state)
{
	vm_map_entry_t          entry;
	vm_object_t             object;
	kern_return_t           kr;
	boolean_t               was_nonvolatile;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlaying object.  Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control != VM_PURGABLE_SET_STATE &&
	    control != VM_PURGABLE_GET_STATE &&
	    control != VM_PURGABLE_PURGE_ALL &&
	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control == VM_PURGABLE_PURGE_ALL) {
		/* global purge: no per-address work, "address"/"state" unused */
		vm_purgeable_object_purge_all();
		return KERN_SUCCESS;
	}

	/* for SET_STATE, reject state values with unknown bits or ranges */
	if ((control == VM_PURGABLE_SET_STATE ||
	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->protection & VM_PROT_WRITE) == 0 &&
	    control != VM_PURGABLE_GET_STATE) {
		/*
		 * Can't apply purgable controls to something you can't write.
		 */
		vm_map_unlock_read(map);
		return KERN_PROTECTION_FAILURE;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL ||
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Object must already be present and be purgeable.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/* take the object lock while still holding the map read lock */
	vm_object_lock(object);

#if 00
	if (VME_OFFSET(entry) != 0 ||
	    entry->vme_end - entry->vme_start != object->vo_size) {
		/*
		 * Can only apply purgable controls to the whole (existing)
		 * object at once.
		 */
		vm_map_unlock_read(map);
		vm_object_unlock(object);
		return KERN_INVALID_ARGUMENT;
	}
#endif

	assert(!entry->is_sub_map);
	assert(!entry->use_pmap); /* purgeable has its own accounting */

	/* the object lock keeps "object" stable; the map lock can go */
	vm_map_unlock_read(map);

	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);

	kr = vm_object_purgable_control(object, control, state);

	if (was_nonvolatile &&
	    object->purgable != VM_PURGABLE_NONVOLATILE &&
	    map->pmap == kernel_pmap) {
		/* object went volatile on behalf of the kernel itself */
#if DEBUG
		object->vo_purgeable_volatilizer = kernel_task;
#endif /* DEBUG */
	}

	vm_object_unlock(object);

	return kr;
}
19503 
/*
 * vm_map_footprint_query_page_info:
 *
 * Compute the "footprint" disposition (a combination of VM_PAGE_QUERY_PAGE_*
 * bits) for the page at "curr_s_offset" inside "map_entry" of a live
 * (non-corpse) map, and return it via "disposition_p".
 *
 * The disposition combines two sources of truth:
 *   1. the accounting attributes of the backing VM object (no-footprint,
 *      purgeable state, ledger tag, IOKit accounting), handled first, and
 *   2. the pmap's per-page info (present / compressed / alternate
 *      accounting), used when no object-level rule applies.
 *
 * Caller must hold the map lock (asserted below); "curr_s_offset" must lie
 * within the entry's range (also asserted).
 */
void
vm_map_footprint_query_page_info(
	vm_map_t        map,
	vm_map_entry_t  map_entry,
	vm_map_offset_t curr_s_offset,
	int             *disposition_p)
{
	int             pmap_disp;
	vm_object_t     object = VM_OBJECT_NULL;
	int             disposition;
	int             effective_page_size;

	vm_map_lock_assert_held(map);
	assert(!map->has_corpse_footprint);
	assert(curr_s_offset >= map_entry->vme_start);
	assert(curr_s_offset < map_entry->vme_end);

	if (map_entry->is_sub_map) {
		if (!map_entry->use_pmap) {
			/* nested pmap: no footprint */
			*disposition_p = 0;
			return;
		}
	} else {
		object = VME_OBJECT(map_entry);
		if (object == VM_OBJECT_NULL) {
			/* nothing mapped here: no need to ask */
			*disposition_p = 0;
			return;
		}
	}

	/* account in units of the smaller of the kernel and map page sizes */
	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

	pmap_disp = 0;

	/*
	 * Query the pmap.
	 */
	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);

	/*
	 * Compute this page's disposition.
	 */
	disposition = 0;

	/* deal with "alternate accounting" first */
	if (!map_entry->is_sub_map &&
	    object->vo_no_footprint) {
		/* does not count in footprint */
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
	} else if (!map_entry->is_sub_map &&
	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
	    (object->purgable == VM_PURGABLE_DENY &&
	    object->vo_ledger_tag)) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		/*
		 * Is this page's index within the object below the total of
		 * resident + compressed pages?  If so it is one of the pages
		 * we report as "present" (see comment below).
		 */
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    (object->resident_page_count +
		    vm_compressor_pager_get_count(object->pager)))) {
			/*
			 * Non-volatile purgeable object owned
			 * by this task: report the first
			 * "#resident + #compressed" pages as
			 * "resident" (to show that they
			 * contribute to the footprint) but not
			 * "dirty" (to avoid double-counting
			 * with the fake "non-volatile" region
			 * we'll report at the end of the
			 * address space to account for all
			 * (mapped or not) non-volatile memory
			 * owned by this task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    object->wired_page_count)) {
			/*
			 * Volatile|empty purgeable object owned
			 * by this task: report the first
			 * "#wired" pages as "resident" (to
			 * show that they contribute to the
			 * footprint) but not "dirty" (to avoid
			 * double-counting with the fake
			 * "non-volatile" region we'll report
			 * at the end of the address space to
			 * account for all (mapped or not)
			 * non-volatile memory owned by this
			 * task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    map_entry->iokit_acct &&
	    object->internal &&
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Non-purgeable IOKit memory: phys_footprint
		 * includes the entire virtual mapping.
		 */
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
		/* alternate accounting */
#if __arm64__ && (DEVELOPMENT || DEBUG)
		if (map->pmap->footprint_was_suspended) {
			/*
			 * The assertion below can fail if dyld
			 * suspended footprint accounting
			 * while doing some adjustments to
			 * this page;  the mapping would say
			 * "use pmap accounting" but the page
			 * would be marked "alternate
			 * accounting".
			 */
		} else
#endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
		{
			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		}
		disposition = 0;
	} else {
		/* no object-level rule applied: trust the pmap's view */
		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
			disposition |= VM_PAGE_QUERY_PAGE_REF;
			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
			} else {
				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
			}
			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
			}
		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
		}
	}

	*disposition_p = disposition;
}
19661 
19662 kern_return_t
vm_map_page_query_internal(vm_map_t target_map,vm_map_offset_t offset,int * disposition,int * ref_count)19663 vm_map_page_query_internal(
19664 	vm_map_t        target_map,
19665 	vm_map_offset_t offset,
19666 	int             *disposition,
19667 	int             *ref_count)
19668 {
19669 	kern_return_t                   kr;
19670 	vm_page_info_basic_data_t       info;
19671 	mach_msg_type_number_t          count;
19672 
19673 	count = VM_PAGE_INFO_BASIC_COUNT;
19674 	kr = vm_map_page_info(target_map,
19675 	    offset,
19676 	    VM_PAGE_INFO_BASIC,
19677 	    (vm_page_info_t) &info,
19678 	    &count);
19679 	if (kr == KERN_SUCCESS) {
19680 		*disposition = info.disposition;
19681 		*ref_count = info.ref_count;
19682 	} else {
19683 		*disposition = 0;
19684 		*ref_count = 0;
19685 	}
19686 
19687 	return kr;
19688 }
19689 
19690 kern_return_t
vm_map_page_info(vm_map_t map,vm_map_offset_t offset,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)19691 vm_map_page_info(
19692 	vm_map_t                map,
19693 	vm_map_offset_t         offset,
19694 	vm_page_info_flavor_t   flavor,
19695 	vm_page_info_t          info,
19696 	mach_msg_type_number_t  *count)
19697 {
19698 	return vm_map_page_range_info_internal(map,
19699 	           offset, /* start of range */
19700 	           (offset + 1), /* this will get rounded in the call to the page boundary */
19701 	           (int)-1, /* effective_page_shift: unspecified */
19702 	           flavor,
19703 	           info,
19704 	           count);
19705 }
19706 
/*
 * vm_map_page_range_info_internal:
 *
 * Fill the caller-supplied "info" array with per-page information
 * (disposition, ref count, object id, offset, depth) for every
 * "effective page" in [start_offset, end_offset) of "map".
 *
 * The walk proceeds entry by entry under the map's read lock:
 *   - holes and NULL-object entries produce zeroed info slots;
 *   - submap entries are handled by recursing into the submap;
 *   - for "region footprint" queries (task_self_region_footprint()),
 *     dispositions come from the footprint/corpse accounting instead
 *     of the page tables, and addresses beyond the last map entry are
 *     reported as the fake non-volatile-purgeable region;
 *   - otherwise each page is looked up in the entry's VM object,
 *     following the shadow chain ("depth" counts how far down the
 *     page was found).
 *
 * Only VM_PAGE_INFO_BASIC is supported.  "effective_page_shift" may be
 * -1 to let the routine pick a safe page shift for this map.
 */
kern_return_t
vm_map_page_range_info_internal(
	vm_map_t                map,
	vm_map_offset_t         start_offset,
	vm_map_offset_t         end_offset,
	int                     effective_page_shift,
	vm_page_info_flavor_t   flavor,
	vm_page_info_t          info,
	mach_msg_type_number_t  *count)
{
	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
	vm_page_t               m = VM_PAGE_NULL;
	kern_return_t           retval = KERN_SUCCESS;
	int                     disposition = 0;
	int                     ref_count = 0;
	int                     depth = 0, info_idx = 0;
	vm_page_info_basic_t    basic_info = 0;
	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
	boolean_t               do_region_footprint;
	ledger_amount_t         ledger_resident, ledger_compressed;
	int                     effective_page_size;
	vm_map_offset_t         effective_page_mask;

	switch (flavor) {
	case VM_PAGE_INFO_BASIC:
		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
			/*
			 * The "vm_page_info_basic_data" structure was not
			 * properly padded, so allow the size to be off by
			 * one to maintain backwards binary compatibility...
			 */
			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
				return KERN_INVALID_ARGUMENT;
			}
		}
		break;
	default:
		return KERN_INVALID_ARGUMENT;
	}

	if (effective_page_shift == -1) {
		/* caller did not specify a page shift: derive one from the map */
		effective_page_shift = vm_self_region_page_shift_safely(map);
		if (effective_page_shift == -1) {
			return KERN_INVALID_ARGUMENT;
		}
	}
	effective_page_size = (1 << effective_page_shift);
	effective_page_mask = effective_page_size - 1;

	do_region_footprint = task_self_region_footprint();
	disposition = 0;
	ref_count = 0;
	depth = 0;
	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
	retval = KERN_SUCCESS;

	if (__improbable(vm_map_range_overflows(map, start_offset, end_offset - start_offset))) {
		return KERN_INVALID_ADDRESS;
	}

	offset_in_page = start_offset & effective_page_mask;
	start = vm_map_trunc_page(start_offset, effective_page_mask);
	end = vm_map_round_page(end_offset, effective_page_mask);

	if (end < start) {
		return KERN_INVALID_ARGUMENT;
	}

	assert((end - start) <= MAX_PAGE_RANGE_QUERY);

	vm_map_lock_read(map);

	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);

	for (curr_s_offset = start; curr_s_offset < end;) {
		/*
		 * New lookup needs reset of these variables.
		 */
		curr_object = object = VM_OBJECT_NULL;
		offset_in_object = 0;
		ref_count = 0;
		depth = 0;

		if (do_region_footprint &&
		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
			/*
			 * Request for "footprint" info about a page beyond
			 * the end of address space: this must be for
			 * the fake region vm_map_region_recurse_64()
			 * reported to account for non-volatile purgeable
			 * memory owned by this task.
			 */
			disposition = 0;

			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
			    (unsigned) ledger_compressed) {
				/*
				 * We haven't reported all the "non-volatile
				 * compressed" pages yet, so report this fake
				 * page as "compressed".
				 */
				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
			} else {
				/*
				 * We've reported all the non-volatile
				 * compressed pages but not all the
				 * non-volatile pages, so report this fake
				 * page as "resident dirty".
				 */
				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
				disposition |= VM_PAGE_QUERY_PAGE_REF;
			}
			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = 1;
				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				basic_info->offset = 0;
				basic_info->depth = 0;

				info_idx++;
				break;
			}
			curr_s_offset += effective_page_size;
			continue;
		}

		/*
		 * First, find the map entry covering "curr_s_offset", going down
		 * submaps if necessary.
		 */
		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
			/* no entry -> no object -> no page */

			if (curr_s_offset < vm_map_min(map)) {
				/*
				 * Illegal address that falls below map min.
				 */
				curr_e_offset = MIN(end, vm_map_min(map));
			} else if (curr_s_offset >= vm_map_max(map)) {
				/*
				 * Illegal address that falls on/after map max.
				 */
				curr_e_offset = end;
			} else if (map_entry == vm_map_to_entry(map)) {
				/*
				 * Hit a hole.
				 */
				if (map_entry->vme_next == vm_map_to_entry(map)) {
					/*
					 * Empty map.
					 */
					curr_e_offset = MIN(map->max_offset, end);
				} else {
					/*
					 * Hole at start of the map.
					 */
					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
				}
			} else {
				if (map_entry->vme_next == vm_map_to_entry(map)) {
					/*
					 * Hole at the end of the map.
					 */
					curr_e_offset = MIN(map->max_offset, end);
				} else {
					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
				}
			}

			assert(curr_e_offset >= curr_s_offset);

			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;

			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			/* zero-fill the info slots for the whole hole at once */
			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));

			curr_s_offset = curr_e_offset;

			info_idx += num_pages;

			continue;
		}

		/* compute offset from this map entry's start */
		offset_in_object = curr_s_offset - map_entry->vme_start;

		/* compute offset into this map entry's object (or submap) */
		offset_in_object += VME_OFFSET(map_entry);

		if (map_entry->is_sub_map) {
			vm_map_t sub_map = VM_MAP_NULL;
			vm_page_info_t submap_info = 0;
			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;

			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;

			submap_s_offset = offset_in_object;
			submap_e_offset = submap_s_offset + range_len;

			sub_map = VME_SUBMAP(map_entry);

			/*
			 * Keep the submap alive and drop the parent map lock
			 * before recursing, to respect the lock ordering.
			 */
			vm_map_reference(sub_map);
			vm_map_unlock_read(map);

			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));

			retval = vm_map_page_range_info_internal(sub_map,
			    submap_s_offset,
			    submap_e_offset,
			    effective_page_shift,
			    VM_PAGE_INFO_BASIC,
			    (vm_page_info_t) submap_info,
			    count);

			assert(retval == KERN_SUCCESS);

			vm_map_lock_read(map);
			vm_map_deallocate(sub_map);

			/* Move the "info" index by the number of pages we inspected.*/
			info_idx += range_len >> effective_page_shift;

			/* Move our current offset by the size of the range we inspected.*/
			curr_s_offset += range_len;

			continue;
		}

		object = VME_OBJECT(map_entry);

		if (object == VM_OBJECT_NULL) {
			/*
			 * We don't have an object here and, hence,
			 * no pages to inspect. We'll fill up the
			 * info structure appropriately.
			 */

			curr_e_offset = MIN(map_entry->vme_end, end);

			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;

			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));

			curr_s_offset = curr_e_offset;

			info_idx += num_pages;

			continue;
		}

		if (do_region_footprint) {
			disposition = 0;
			if (map->has_corpse_footprint) {
				/*
				 * Query the page info data we saved
				 * while forking the corpse.
				 */
				vm_map_corpse_footprint_query_page_info(
					map,
					curr_s_offset,
					&disposition);
			} else {
				/*
				 * Query the live pmap for footprint info
				 * about this page.
				 */
				vm_map_footprint_query_page_info(
					map,
					map_entry,
					curr_s_offset,
					&disposition);
			}
			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = 1;
				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				basic_info->offset = 0;
				basic_info->depth = 0;

				info_idx++;
				break;
			}
			curr_s_offset += effective_page_size;
			continue;
		}

		vm_object_reference(object);
		/*
		 * Shared mode -- so we can allow other readers
		 * to grab the lock too.
		 */
		vm_object_lock_shared(object);

		curr_e_offset = MIN(map_entry->vme_end, end);

		vm_map_unlock_read(map);

		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */

		curr_object = object;

		/* inspect every page of this entry that falls in [start, end) */
		for (; curr_s_offset < curr_e_offset;) {
			if (object == curr_object) {
				ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
			} else {
				ref_count = curr_object->ref_count;
			}

			curr_offset_in_object = offset_in_object;

			for (;;) {
				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));

				if (m != VM_PAGE_NULL) {
					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
					break;
				} else {
					if (curr_object->internal &&
					    curr_object->alive &&
					    !curr_object->terminating &&
					    curr_object->pager_ready) {
						if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object))
						    == VM_EXTERNAL_STATE_EXISTS) {
							/* the pager has that page */
							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
							break;
						}
					}

					/*
					 * Go down the VM object shadow chain until we find the page
					 * we're looking for.
					 */

					if (curr_object->shadow != VM_OBJECT_NULL) {
						vm_object_t shadow = VM_OBJECT_NULL;

						curr_offset_in_object += curr_object->vo_shadow_offset;
						shadow = curr_object->shadow;

						/* lock the shadow before unlocking its copy */
						vm_object_lock_shared(shadow);
						vm_object_unlock(curr_object);

						curr_object = shadow;
						depth++;
						continue;
					} else {
						break;
					}
				}
			}

			/* The ref_count is not strictly accurate, it measures the number   */
			/* of entities holding a ref on the object, they may not be mapping */
			/* the object or may not be mapping the section holding the         */
			/* target page but its still a ball park number and though an over- */
			/* count, it picks up the copy-on-write cases                       */

			/* We could also get a picture of page sharing from pmap_attributes */
			/* but this would under count as only faulted-in mappings would     */
			/* show up.							    */

			if ((curr_object == object) && curr_object->shadow) {
				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
			}

			if (!curr_object->internal) {
				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
			}

			if (m != VM_PAGE_NULL) {
				if (m->vmp_fictitious) {
					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
				} else {
					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
					}

					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
						disposition |= VM_PAGE_QUERY_PAGE_REF;
					}

					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
					}

					/*
					 * XXX TODO4K:
					 * when this routine deals with 4k
					 * pages, check the appropriate CS bit
					 * here.
					 */
					if (m->vmp_cs_validated) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
					}
					if (m->vmp_cs_tainted) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
					}
					if (m->vmp_cs_nx) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
					}
					if (m->vmp_reusable || curr_object->all_reusable) {
						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
					}
				}
			}

			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = ref_count;
				basic_info->object_id = (vm_object_id_t) (uintptr_t)
				    VM_KERNEL_ADDRPERM(curr_object);
				basic_info->offset =
				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
				basic_info->depth = depth;

				info_idx++;
				break;
			}

			disposition = 0;
			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.

			/*
			 * Move to next offset in the range and in our object.
			 */
			curr_s_offset += effective_page_size;
			offset_in_object += effective_page_size;
			curr_offset_in_object = offset_in_object;

			/* pop back up to the top of the shadow chain for the next page */
			if (curr_object != object) {
				vm_object_unlock(curr_object);

				curr_object = object;

				vm_object_lock_shared(curr_object);
			} else {
				vm_object_lock_yield_shared(curr_object);
			}
		}

		vm_object_unlock(curr_object);
		vm_object_deallocate(curr_object);

		vm_map_lock_read(map);
	}

	vm_map_unlock_read(map);
	return retval;
}
20172 
20173 /*
20174  *	vm_map_msync
20175  *
20176  *	Synchronises the memory range specified with its backing store
20177  *	image by either flushing or cleaning the contents to the appropriate
20178  *	memory manager engaging in a memory object synchronize dialog with
20179  *	the manager.  The client doesn't return until the manager issues
20180  *	m_o_s_completed message.  MIG Magically converts user task parameter
20181  *	to the task's address map.
20182  *
20183  *	interpretation of sync_flags
20184  *	VM_SYNC_INVALIDATE	- discard pages, only return precious
20185  *				  pages to manager.
20186  *
20187  *	VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20188  *				- discard pages, write dirty or precious
20189  *				  pages back to memory manager.
20190  *
20191  *	VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20192  *				- write dirty or precious pages back to
20193  *				  the memory manager.
20194  *
20195  *	VM_SYNC_CONTIGUOUS	- does everything normally, but if there
20196  *				  is a hole in the region, and we would
20197  *				  have returned KERN_SUCCESS, return
20198  *				  KERN_INVALID_ADDRESS instead.
20199  *
20200  *	NOTE
20201  *	The memory object attributes have not yet been implemented, this
20202  *	function will have to deal with the invalidate attribute
20203  *
20204  *	RETURNS
20205  *	KERN_INVALID_TASK		Bad task parameter
20206  *	KERN_INVALID_ARGUMENT		both sync and async were specified.
20207  *	KERN_SUCCESS			The usual.
20208  *	KERN_INVALID_ADDRESS		There was a hole in the region.
20209  */
20210 
20211 kern_return_t
vm_map_msync(vm_map_t map,vm_map_address_t address,vm_map_size_t size,vm_sync_t sync_flags)20212 vm_map_msync(
20213 	vm_map_t                map,
20214 	vm_map_address_t        address,
20215 	vm_map_size_t           size,
20216 	vm_sync_t               sync_flags)
20217 {
20218 	vm_map_entry_t          entry;
20219 	vm_map_size_t           amount_left;
20220 	vm_object_offset_t      offset;
20221 	vm_object_offset_t      start_offset, end_offset;
20222 	boolean_t               do_sync_req;
20223 	boolean_t               had_hole = FALSE;
20224 	vm_map_offset_t         pmap_offset;
20225 
20226 	if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20227 	    (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20228 		return KERN_INVALID_ARGUMENT;
20229 	}
20230 
20231 	if (__improbable(vm_map_range_overflows(map, address, size))) {
20232 		return KERN_INVALID_ADDRESS;
20233 	}
20234 
20235 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20236 		DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20237 	}
20238 
20239 	/*
20240 	 * align address and size on page boundaries
20241 	 */
20242 	size = (vm_map_round_page(address + size,
20243 	    VM_MAP_PAGE_MASK(map)) -
20244 	    vm_map_trunc_page(address,
20245 	    VM_MAP_PAGE_MASK(map)));
20246 	address = vm_map_trunc_page(address,
20247 	    VM_MAP_PAGE_MASK(map));
20248 
20249 	if (map == VM_MAP_NULL) {
20250 		return KERN_INVALID_TASK;
20251 	}
20252 
20253 	if (size == 0) {
20254 		return KERN_SUCCESS;
20255 	}
20256 
20257 	amount_left = size;
20258 
20259 	while (amount_left > 0) {
20260 		vm_object_size_t        flush_size;
20261 		vm_object_t             object;
20262 
20263 		vm_map_lock(map);
20264 		if (!vm_map_lookup_entry(map,
20265 		    address,
20266 		    &entry)) {
20267 			vm_map_size_t   skip;
20268 
20269 			/*
20270 			 * hole in the address map.
20271 			 */
20272 			had_hole = TRUE;
20273 
20274 			if (sync_flags & VM_SYNC_KILLPAGES) {
20275 				/*
20276 				 * For VM_SYNC_KILLPAGES, there should be
20277 				 * no holes in the range, since we couldn't
20278 				 * prevent someone else from allocating in
20279 				 * that hole and we wouldn't want to "kill"
20280 				 * their pages.
20281 				 */
20282 				vm_map_unlock(map);
20283 				break;
20284 			}
20285 
20286 			/*
20287 			 * Check for empty map.
20288 			 */
20289 			if (entry == vm_map_to_entry(map) &&
20290 			    entry->vme_next == entry) {
20291 				vm_map_unlock(map);
20292 				break;
20293 			}
20294 			/*
20295 			 * Check that we don't wrap and that
20296 			 * we have at least one real map entry.
20297 			 */
20298 			if ((map->hdr.nentries == 0) ||
20299 			    (entry->vme_next->vme_start < address)) {
20300 				vm_map_unlock(map);
20301 				break;
20302 			}
20303 			/*
20304 			 * Move up to the next entry if needed
20305 			 */
20306 			skip = (entry->vme_next->vme_start - address);
20307 			if (skip >= amount_left) {
20308 				amount_left = 0;
20309 			} else {
20310 				amount_left -= skip;
20311 			}
20312 			address = entry->vme_next->vme_start;
20313 			vm_map_unlock(map);
20314 			continue;
20315 		}
20316 
20317 		offset = address - entry->vme_start;
20318 		pmap_offset = address;
20319 
20320 		/*
20321 		 * do we have more to flush than is contained in this
20322 		 * entry ?
20323 		 */
20324 		if (amount_left + entry->vme_start + offset > entry->vme_end) {
20325 			flush_size = entry->vme_end -
20326 			    (entry->vme_start + offset);
20327 		} else {
20328 			flush_size = amount_left;
20329 		}
20330 		amount_left -= flush_size;
20331 		address += flush_size;
20332 
20333 		if (entry->is_sub_map == TRUE) {
20334 			vm_map_t        local_map;
20335 			vm_map_offset_t local_offset;
20336 
20337 			local_map = VME_SUBMAP(entry);
20338 			local_offset = VME_OFFSET(entry);
20339 			vm_map_reference(local_map);
20340 			vm_map_unlock(map);
20341 			if (vm_map_msync(
20342 				    local_map,
20343 				    local_offset,
20344 				    flush_size,
20345 				    sync_flags) == KERN_INVALID_ADDRESS) {
20346 				had_hole = TRUE;
20347 			}
20348 			vm_map_deallocate(local_map);
20349 			continue;
20350 		}
20351 		object = VME_OBJECT(entry);
20352 
20353 		/*
20354 		 * We can't sync this object if the object has not been
20355 		 * created yet
20356 		 */
20357 		if (object == VM_OBJECT_NULL) {
20358 			vm_map_unlock(map);
20359 			continue;
20360 		}
20361 		offset += VME_OFFSET(entry);
20362 
20363 		vm_object_lock(object);
20364 
20365 		if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
20366 			int kill_pages = 0;
20367 
20368 			if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20369 				/*
20370 				 * This is a destructive operation and so we
20371 				 * err on the side of limiting the range of
20372 				 * the operation.
20373 				 */
20374 				start_offset = vm_object_round_page(offset);
20375 				end_offset = vm_object_trunc_page(offset + flush_size);
20376 
20377 				if (end_offset <= start_offset) {
20378 					vm_object_unlock(object);
20379 					vm_map_unlock(map);
20380 					continue;
20381 				}
20382 
20383 				pmap_offset += start_offset - offset;
20384 			} else {
20385 				start_offset = offset;
20386 				end_offset = offset + flush_size;
20387 			}
20388 
20389 			if (sync_flags & VM_SYNC_KILLPAGES) {
20390 				if (((object->ref_count == 1) ||
20391 				    ((object->copy_strategy !=
20392 				    MEMORY_OBJECT_COPY_SYMMETRIC) &&
20393 				    (object->vo_copy == VM_OBJECT_NULL))) &&
20394 				    (object->shadow == VM_OBJECT_NULL)) {
20395 					if (object->ref_count != 1) {
20396 						vm_page_stats_reusable.free_shared++;
20397 					}
20398 					kill_pages = 1;
20399 				} else {
20400 					kill_pages = -1;
20401 				}
20402 			}
20403 			if (kill_pages != -1) {
20404 				vm_object_deactivate_pages(
20405 					object,
20406 					start_offset,
20407 					(vm_object_size_t) (end_offset - start_offset),
20408 					kill_pages,
20409 					FALSE, /* reusable_pages */
20410 					FALSE, /* reusable_no_write */
20411 					map->pmap,
20412 					pmap_offset);
20413 			}
20414 			vm_object_unlock(object);
20415 			vm_map_unlock(map);
20416 			continue;
20417 		}
20418 		/*
20419 		 * We can't sync this object if there isn't a pager.
20420 		 * Don't bother to sync internal objects, since there can't
20421 		 * be any "permanent" storage for these objects anyway.
20422 		 */
20423 		if ((object->pager == MEMORY_OBJECT_NULL) ||
20424 		    (object->internal) || (object->private)) {
20425 			vm_object_unlock(object);
20426 			vm_map_unlock(map);
20427 			continue;
20428 		}
20429 		/*
20430 		 * keep reference on the object until syncing is done
20431 		 */
20432 		vm_object_reference_locked(object);
20433 		vm_object_unlock(object);
20434 
20435 		vm_map_unlock(map);
20436 
20437 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20438 			start_offset = vm_object_trunc_page(offset);
20439 			end_offset = vm_object_round_page(offset + flush_size);
20440 		} else {
20441 			start_offset = offset;
20442 			end_offset = offset + flush_size;
20443 		}
20444 
20445 		do_sync_req = vm_object_sync(object,
20446 		    start_offset,
20447 		    (end_offset - start_offset),
20448 		    sync_flags & VM_SYNC_INVALIDATE,
20449 		    ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
20450 		    (sync_flags & VM_SYNC_ASYNCHRONOUS)),
20451 		    sync_flags & VM_SYNC_SYNCHRONOUS);
20452 
20453 		if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
20454 			/*
20455 			 * clear out the clustering and read-ahead hints
20456 			 */
20457 			vm_object_lock(object);
20458 
20459 			object->pages_created = 0;
20460 			object->pages_used = 0;
20461 			object->sequential = 0;
20462 			object->last_alloc = 0;
20463 
20464 			vm_object_unlock(object);
20465 		}
20466 		vm_object_deallocate(object);
20467 	} /* while */
20468 
20469 	/* for proper msync() behaviour */
20470 	if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
20471 		return KERN_INVALID_ADDRESS;
20472 	}
20473 
20474 	return KERN_SUCCESS;
20475 }/* vm_msync */
20476 
/*
 * Back a freshly-created named entry with a VM object.
 *
 * Builds a single-entry vm_map_copy describing [offset, offset + size)
 * in "object", mapped with protection "prot", and installs it as the
 * named entry's backing store.  The named entry must not already be
 * backed by anything (asserted below); on return it is an "is_object"
 * entry, marked internal if the object itself is internal.
 */
void
vm_named_entry_associate_vm_object(
	vm_named_entry_t        named_entry,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_object_size_t        size,
	vm_prot_t               prot)
{
	vm_map_copy_t copy;
	vm_map_entry_t copy_entry;

	/* entry must be "fresh": no submap, copy or object attached yet */
	assert(!named_entry->is_sub_map);
	assert(!named_entry->is_copy);
	assert(!named_entry->is_object);
	assert(!named_entry->internal);
	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);

	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
	copy->offset = offset;
	copy->size = size;
	/* the copy uses the native kernel page size */
	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;

	copy_entry = vm_map_copy_entry_create(copy);
	copy_entry->protection = prot;
	copy_entry->max_protection = prot;
	copy_entry->use_pmap = TRUE;
	/* round the described range out to whole pages */
	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
	VME_OBJECT_SET(copy_entry, object, false, 0);
	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);

	named_entry->backing.copy = copy;
	named_entry->is_object = TRUE;
	/* an internal object makes the named entry internal too */
	if (object->internal) {
		named_entry->internal = TRUE;
	}

	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
	    named_entry, copy, object, offset, size, prot);
}
20518 
/*
 * Return the VM object backing an "is_object" named entry.
 *
 * The named entry must be object-backed (asserted); its backing
 * vm_map_copy is expected to contain exactly one entry, installed by
 * vm_named_entry_associate_vm_object() or equivalent.  No reference
 * is taken on the returned object.
 */
vm_object_t
vm_named_entry_to_vm_object(
	vm_named_entry_t named_entry)
{
	vm_map_copy_t   copy;
	vm_map_entry_t  copy_entry;
	vm_object_t     object;

	assert(!named_entry->is_sub_map);
	assert(!named_entry->is_copy);
	assert(named_entry->is_object);
	copy = named_entry->backing.copy;
	assert(copy != VM_MAP_COPY_NULL);
	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);
	/* an object-backed named entry always has a single-entry copy */
	assert(copy->cpy_hdr.nentries == 1);
	copy_entry = vm_map_copy_first_entry(copy);
	object = VME_OBJECT(copy_entry);

	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);

	return object;
}
20545 
20546 /*
20547  *	Routine:	convert_port_entry_to_map
20548  *	Purpose:
20549  *		Convert from a port specifying an entry or a task
20550  *		to a map. Doesn't consume the port ref; produces a map ref,
20551  *		which may be null.  Unlike convert_port_to_map, the
20552  *		port may be task or a named entry backed.
20553  *	Conditions:
20554  *		Nothing locked.
20555  */
20556 
20557 vm_map_t
convert_port_entry_to_map(ipc_port_t port)20558 convert_port_entry_to_map(
20559 	ipc_port_t      port)
20560 {
20561 	vm_map_t map = VM_MAP_NULL;
20562 	vm_named_entry_t named_entry;
20563 
20564 	if (!IP_VALID(port)) {
20565 		return VM_MAP_NULL;
20566 	}
20567 
20568 	if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
20569 		return convert_port_to_map(port);
20570 	}
20571 
20572 	named_entry = mach_memory_entry_from_port(port);
20573 
20574 	if ((named_entry->is_sub_map) &&
20575 	    (named_entry->protection & VM_PROT_WRITE)) {
20576 		map = named_entry->backing.map;
20577 		if (map->pmap != PMAP_NULL) {
20578 			if (map->pmap == kernel_pmap) {
20579 				panic("userspace has access "
20580 				    "to a kernel map %p", map);
20581 			}
20582 			pmap_require(map->pmap);
20583 		}
20584 		vm_map_reference(map);
20585 	}
20586 
20587 	return map;
20588 }
20589 
20590 /*
20591  * Export routines to other components for the things we access locally through
20592  * macros.
20593  */
/*
 * Out-of-line version of the current_map() accessor: the macro form is
 * #undef'ed here so other components can call a real, linkable function.
 */
#undef current_map
vm_map_t
current_map(void)
{
	return current_map_fast();
}
20600 
20601 /*
20602  *	vm_map_reference:
20603  *
20604  *	Takes a reference on the specified map.
20605  */
void
vm_map_reference(
	vm_map_t        map)
{
	if (__probable(map != VM_MAP_NULL)) {
		/* validate the map hasn't been forged/corrupted before retaining */
		vm_map_require(map);
		os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
	}
}
20615 
20616 /*
20617  *	vm_map_deallocate:
20618  *
20619  *	Removes a reference from the specified map,
20620  *	destroying it if no references remain.
20621  *	The map should not be locked.
20622  */
void
vm_map_deallocate(
	vm_map_t        map)
{
	if (__probable(map != VM_MAP_NULL)) {
		/* validate the map hasn't been forged/corrupted before releasing */
		vm_map_require(map);
		/* os_ref_release_raw returns the new count; 0 means last reference */
		if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
			vm_map_destroy(map);
		}
	}
}
20634 
20635 void
vm_map_inspect_deallocate(vm_map_inspect_t map)20636 vm_map_inspect_deallocate(
20637 	vm_map_inspect_t      map)
20638 {
20639 	vm_map_deallocate((vm_map_t)map);
20640 }
20641 
20642 void
vm_map_read_deallocate(vm_map_read_t map)20643 vm_map_read_deallocate(
20644 	vm_map_read_t      map)
20645 {
20646 	vm_map_deallocate((vm_map_t)map);
20647 }
20648 
20649 
20650 void
vm_map_disable_NX(vm_map_t map)20651 vm_map_disable_NX(vm_map_t map)
20652 {
20653 	if (map == NULL) {
20654 		return;
20655 	}
20656 	if (map->pmap == NULL) {
20657 		return;
20658 	}
20659 
20660 	pmap_disable_NX(map->pmap);
20661 }
20662 
20663 void
vm_map_disallow_data_exec(vm_map_t map)20664 vm_map_disallow_data_exec(vm_map_t map)
20665 {
20666 	if (map == NULL) {
20667 		return;
20668 	}
20669 
20670 	map->map_disallow_data_exec = TRUE;
20671 }
20672 
20673 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
20674  * more descriptive.
20675  */
/* Shrink the map's addressable range to that of a 32-bit process. */
void
vm_map_set_32bit(vm_map_t map)
{
#if defined(__arm64__)
	/* on arm64 the pmap layer decides the 32-bit device ceiling */
	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
20685 
20686 
/* Set the map's addressable range to that of a 64-bit process. */
void
vm_map_set_64bit(vm_map_t map)
{
#if defined(__arm64__)
	/* on arm64 the pmap layer decides the 64-bit device ceiling */
	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
#endif
}
20696 
20697 /*
20698  * Expand the maximum size of an existing map to the maximum supported.
20699  */
void
vm_map_set_jumbo(vm_map_t map)
{
#if defined (__arm64__) && !XNU_TARGET_OS_OSX
	/* ~0 requests the largest offset the pmap supports; it is clamped below */
	vm_map_set_max_addr(map, ~0);
#else /* arm64 */
	/* no-op on platforms without the embedded jumbo configuration */
	(void) map;
#endif
}
20709 
20710 /*
20711  * This map has a JIT entitlement
20712  */
void
vm_map_set_jit_entitled(vm_map_t map)
{
#if defined (__arm64__)
	/* the JIT entitlement is tracked at the pmap layer on arm64 */
	pmap_set_jit_entitled(map->pmap);
#else /* arm64 */
	/* no-op elsewhere */
	(void) map;
#endif
}
20722 
20723 /*
20724  * Get status of this maps TPRO flag
20725  */
boolean_t
vm_map_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	/* TPRO state is owned by the pmap on arm64e */
	return pmap_get_tpro(map->pmap);
#else /* arm64e */
	/* TPRO does not exist on other platforms */
	(void) map;
	return FALSE;
#endif
}
20736 
20737 /*
20738  * This map has TPRO enabled
20739  */
void
vm_map_set_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	/* TPRO state is owned by the pmap on arm64e */
	pmap_set_tpro(map->pmap);
#else /* arm64e */
	/* no-op on platforms without TPRO */
	(void) map;
#endif
}
20749 
20750 /*
20751  * Does this map have TPRO enforcement enabled
20752  */
boolean_t
vm_map_tpro_enforcement(vm_map_t map)
{
	/* unlocked read of a boolean flag; see vm_map_set_tpro_enforcement */
	return map->tpro_enforcement;
}
20758 
20759 /*
20760  * Set TPRO enforcement for this map
20761  */
20762 void
vm_map_set_tpro_enforcement(vm_map_t map)20763 vm_map_set_tpro_enforcement(vm_map_t map)
20764 {
20765 	if (vm_map_tpro(map)) {
20766 		vm_map_lock(map);
20767 		map->tpro_enforcement = TRUE;
20768 		vm_map_unlock(map);
20769 	}
20770 }
20771 
20772 /*
20773  * Enable TPRO on the requested region
20774  *
20775  * Note:
20776  *     This routine is primarily intended to be called during/soon after map
20777  *     creation before the associated task has been released to run. It is only
20778  *     currently safe when we have no resident pages.
20779  */
20780 boolean_t
vm_map_set_tpro_range(__unused vm_map_t map,__unused vm_map_address_t start,__unused vm_map_address_t end)20781 vm_map_set_tpro_range(
20782 	__unused vm_map_t map,
20783 	__unused vm_map_address_t start,
20784 	__unused vm_map_address_t end)
20785 {
20786 	return TRUE;
20787 }
20788 
20789 /*
20790  * Expand the maximum size of an existing map.
20791  */
20792 void
vm_map_set_max_addr(vm_map_t map,vm_map_offset_t new_max_offset)20793 vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
20794 {
20795 #if defined(__arm64__)
20796 	vm_map_offset_t max_supported_offset;
20797 	vm_map_offset_t old_max_offset;
20798 
20799 	vm_map_lock(map);
20800 
20801 	old_max_offset = map->max_offset;
20802 	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);
20803 
20804 	new_max_offset = trunc_page(new_max_offset);
20805 
20806 	/* The address space cannot be shrunk using this routine. */
20807 	if (old_max_offset >= new_max_offset) {
20808 		vm_map_unlock(map);
20809 		return;
20810 	}
20811 
20812 	if (max_supported_offset < new_max_offset) {
20813 		new_max_offset = max_supported_offset;
20814 	}
20815 
20816 	map->max_offset = new_max_offset;
20817 
20818 	if (map->holelistenabled) {
20819 		if (map->holes_list->prev->vme_end == old_max_offset) {
20820 			/*
20821 			 * There is already a hole at the end of the map; simply make it bigger.
20822 			 */
20823 			map->holes_list->prev->vme_end = map->max_offset;
20824 		} else {
20825 			/*
20826 			 * There is no hole at the end, so we need to create a new hole
20827 			 * for the new empty space we're creating.
20828 			 */
20829 			struct vm_map_links *new_hole;
20830 
20831 			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
20832 			new_hole->start = old_max_offset;
20833 			new_hole->end = map->max_offset;
20834 			new_hole->prev = map->holes_list->prev;
20835 			new_hole->next = (struct vm_map_entry *)map->holes_list;
20836 			map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
20837 			map->holes_list->prev = (struct vm_map_entry *)new_hole;
20838 		}
20839 	}
20840 
20841 	vm_map_unlock(map);
20842 #else
20843 	(void)map;
20844 	(void)new_max_offset;
20845 #endif
20846 }
20847 
/* Compute the default maximum user address for a 32- or 64-bit map. */
vm_map_offset_t
vm_compute_max_offset(boolean_t is64)
{
#if defined(__arm64__)
	/* the pmap layer owns this decision on arm64 */
	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
20857 
/*
 * Report how many independently-slid sections ASLR may use and the
 * size of each section.  A section_size of 0 means "unsectioned".
 */
void
vm_map_get_max_aslr_slide_section(
	vm_map_t                map __unused,
	int64_t                 *max_sections,
	int64_t                 *section_size)
{
#if defined(__arm64__)
	/* arm64 slides within translation-table twig-sized sections */
	*max_sections = 3;
	*section_size = ARM_TT_TWIG_SIZE;
#else
	*max_sections = 1;
	*section_size = 0;
#endif
}
20872 
/* Maximum ASLR slide, expressed in map pages. */
uint64_t
vm_map_get_max_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
	 * limited embedded address space; this is also meant to minimize pmap
	 * memory usage on 16KB page systems.
	 */
	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
#else
	/* 64-bit maps get 2^16 pages of slide, 32-bit maps 2^8 */
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
20886 
/* Maximum dynamic-loader ASLR slide, expressed in map pages. */
uint64_t
vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
	 * of independent entropy on 16KB page systems.
	 */
	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
#else
	/* 64-bit maps get 2^16 pages of slide, 32-bit maps 2^8 */
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
20899 
boolean_t
vm_map_is_64bit(
	vm_map_t map)
{
	/* a map reaching past the 32-bit ceiling is, by definition, 64-bit */
	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
}
20906 
/*
 * Does this map reserve at least "pagezero_size" bytes of inaccessible
 * address space at the bottom (a "hard" page zero)?
 */
boolean_t
vm_map_has_hard_pagezero(
	vm_map_t        map,
	vm_map_offset_t pagezero_size)
{
	/*
	 * XXX FBDP
	 * We should lock the VM map (for read) here but we can get away
	 * with it for now because there can't really be any race condition:
	 * the VM map's min_offset is changed only when the VM map is created
	 * and when the zero page is established (when the binary gets loaded),
	 * and this routine gets called only when the task terminates and the
	 * VM map is being torn down, and when a new map is created via
	 * load_machfile()/execve().
	 */
	return map->min_offset >= pagezero_size;
}
20924 
20925 /*
20926  * Raise a VM map's maximun offset.
20927  */
20928 kern_return_t
vm_map_raise_max_offset(vm_map_t map,vm_map_offset_t new_max_offset)20929 vm_map_raise_max_offset(
20930 	vm_map_t        map,
20931 	vm_map_offset_t new_max_offset)
20932 {
20933 	kern_return_t   ret;
20934 
20935 	vm_map_lock(map);
20936 	ret = KERN_INVALID_ADDRESS;
20937 
20938 	if (new_max_offset >= map->max_offset) {
20939 		if (!vm_map_is_64bit(map)) {
20940 			if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
20941 				map->max_offset = new_max_offset;
20942 				ret = KERN_SUCCESS;
20943 			}
20944 		} else {
20945 			if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
20946 				map->max_offset = new_max_offset;
20947 				ret = KERN_SUCCESS;
20948 			}
20949 		}
20950 	}
20951 
20952 	vm_map_unlock(map);
20953 	return ret;
20954 }
20955 
20956 
20957 /*
20958  * Raise a VM map's minimum offset.
20959  * To strictly enforce "page zero" reservation.
20960  */
20961 kern_return_t
vm_map_raise_min_offset(vm_map_t map,vm_map_offset_t new_min_offset)20962 vm_map_raise_min_offset(
20963 	vm_map_t        map,
20964 	vm_map_offset_t new_min_offset)
20965 {
20966 	vm_map_entry_t  first_entry;
20967 
20968 	new_min_offset = vm_map_round_page(new_min_offset,
20969 	    VM_MAP_PAGE_MASK(map));
20970 
20971 	vm_map_lock(map);
20972 
20973 	if (new_min_offset < map->min_offset) {
20974 		/*
20975 		 * Can't move min_offset backwards, as that would expose
20976 		 * a part of the address space that was previously, and for
20977 		 * possibly good reasons, inaccessible.
20978 		 */
20979 		vm_map_unlock(map);
20980 		return KERN_INVALID_ADDRESS;
20981 	}
20982 	if (new_min_offset >= map->max_offset) {
20983 		/* can't go beyond the end of the address space */
20984 		vm_map_unlock(map);
20985 		return KERN_INVALID_ADDRESS;
20986 	}
20987 
20988 	first_entry = vm_map_first_entry(map);
20989 	if (first_entry != vm_map_to_entry(map) &&
20990 	    first_entry->vme_start < new_min_offset) {
20991 		/*
20992 		 * Some memory was already allocated below the new
20993 		 * minimun offset.  It's too late to change it now...
20994 		 */
20995 		vm_map_unlock(map);
20996 		return KERN_NO_SPACE;
20997 	}
20998 
20999 	map->min_offset = new_min_offset;
21000 
21001 	if (map->holelistenabled) {
21002 		assert(map->holes_list);
21003 		map->holes_list->start = new_min_offset;
21004 		assert(new_min_offset < map->holes_list->end);
21005 	}
21006 
21007 	vm_map_unlock(map);
21008 
21009 	return KERN_SUCCESS;
21010 }
21011 
21012 /*
21013  * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21014  * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
21015  * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
21016  * have to reach over to the BSD data structures.
21017  */
21018 
/* count of maps that have ever been given a finite size limit (debug/telemetry) */
uint64_t vm_map_set_size_limit_count = 0;
kern_return_t
vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
{
	kern_return_t kr;

	vm_map_lock(map);
	if (new_size_limit < map->size) {
		/* new limit should not be lower than its current size */
		DTRACE_VM2(vm_map_set_size_limit_fail,
		    vm_map_size_t, map->size,
		    uint64_t, new_size_limit);
		kr = KERN_FAILURE;
	} else if (new_size_limit == map->size_limit) {
		/* no change */
		kr = KERN_SUCCESS;
	} else {
		/* set new limit */
		DTRACE_VM2(vm_map_set_size_limit,
		    vm_map_size_t, map->size,
		    uint64_t, new_size_limit);
		if (new_size_limit != RLIM_INFINITY) {
			vm_map_set_size_limit_count++;
		}
		map->size_limit = new_size_limit;
		kr = KERN_SUCCESS;
	}
	vm_map_unlock(map);
	return kr;
}
21049 
/* count of maps that have ever been given a finite data limit (debug/telemetry) */
uint64_t vm_map_set_data_limit_count = 0;
kern_return_t
vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
{
	kern_return_t kr;

	vm_map_lock(map);
	if (new_data_limit < map->size) {
		/* new limit should not be lower than its current size */
		DTRACE_VM2(vm_map_set_data_limit_fail,
		    vm_map_size_t, map->size,
		    uint64_t, new_data_limit);
		kr = KERN_FAILURE;
	} else if (new_data_limit == map->data_limit) {
		/* no change */
		kr = KERN_SUCCESS;
	} else {
		/* set new limit */
		DTRACE_VM2(vm_map_set_data_limit,
		    vm_map_size_t, map->size,
		    uint64_t, new_data_limit);
		if (new_data_limit != RLIM_INFINITY) {
			vm_map_set_data_limit_count++;
		}
		map->data_limit = new_data_limit;
		kr = KERN_SUCCESS;
	}
	vm_map_unlock(map);
	return kr;
}
21080 
21081 void
vm_map_set_user_wire_limit(vm_map_t map,vm_size_t limit)21082 vm_map_set_user_wire_limit(vm_map_t     map,
21083     vm_size_t    limit)
21084 {
21085 	vm_map_lock(map);
21086 	map->user_wire_limit = limit;
21087 	vm_map_unlock(map);
21088 }
21089 
21090 
21091 void
vm_map_switch_protect(vm_map_t map,boolean_t val)21092 vm_map_switch_protect(vm_map_t     map,
21093     boolean_t    val)
21094 {
21095 	vm_map_lock(map);
21096 	map->switch_protect = val;
21097 	vm_map_unlock(map);
21098 }
21099 
21100 extern int cs_process_enforcement_enable;
21101 boolean_t
vm_map_cs_enforcement(vm_map_t map)21102 vm_map_cs_enforcement(
21103 	vm_map_t map)
21104 {
21105 	if (cs_process_enforcement_enable) {
21106 		return TRUE;
21107 	}
21108 	return map->cs_enforcement;
21109 }
21110 
/*
 * Allow writable-and-executable (invalid-code) memory in this map.
 */
kern_return_t
vm_map_cs_wx_enable(
	__unused vm_map_t map)
{
#if CODE_SIGNING_MONITOR
	kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
	/* KERN_NOT_SUPPORTED means no monitor policy applies; treat as success */
	if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
		return KERN_SUCCESS;
	}
	return ret;
#else
	/* The VM manages WX memory entirely on its own */
	return KERN_SUCCESS;
#endif
}
21126 
/*
 * Ask the code-signing monitor (if any) to allow a JIT region in this map.
 */
kern_return_t
vm_map_csm_allow_jit(
	__unused vm_map_t map)
{
#if CODE_SIGNING_MONITOR
	return csm_allow_jit_region(vm_map_pmap(map));
#else
	/* No code signing monitor to enforce JIT policy */
	return KERN_SUCCESS;
#endif
}
21138 
21139 void
vm_map_cs_debugged_set(vm_map_t map,boolean_t val)21140 vm_map_cs_debugged_set(
21141 	vm_map_t map,
21142 	boolean_t val)
21143 {
21144 	vm_map_lock(map);
21145 	map->cs_debugged = val;
21146 	vm_map_unlock(map);
21147 }
21148 
/*
 * Set or clear code-signing enforcement for this map, mirroring the
 * state into the pmap so both layers agree.
 */
void
vm_map_cs_enforcement_set(
	vm_map_t map,
	boolean_t val)
{
	vm_map_lock(map);
	map->cs_enforcement = val;
	/* keep the pmap's view of CS enforcement in sync */
	pmap_set_vm_map_cs_enforced(map->pmap, val);
	vm_map_unlock(map);
}
21159 
21160 /*
21161  * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21162  * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21163  * bump both counters.
21164  */
21165 void
vm_map_iokit_mapped_region(vm_map_t map,vm_size_t bytes)21166 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
21167 {
21168 	pmap_t pmap = vm_map_pmap(map);
21169 
21170 	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21171 	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21172 }
21173 
21174 void
vm_map_iokit_unmapped_region(vm_map_t map,vm_size_t bytes)21175 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
21176 {
21177 	pmap_t pmap = vm_map_pmap(map);
21178 
21179 	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21180 	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21181 }
21182 
21183 /* Add (generate) code signature for memory range */
21184 #if CONFIG_DYNAMIC_CODE_SIGNING
/*
 * Mark every resident page in [start, end) of the single map entry
 * covering the range as code-sign "validated", disconnecting each page
 * from all pmaps so future modifications are noticed.  The range must
 * lie within one non-submap entry with an existing object; all pages
 * must be resident and in a normal state or KERN_FAILURE is returned.
 */
kern_return_t
vm_map_sign(vm_map_t map,
    vm_map_offset_t start,
    vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_page_t m;
	vm_object_t object;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlying object.  Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->vme_start > start) || (entry->vme_end < end)) {
		/*
		 * Map entry doesn't cover the requested range. Not handling
		 * this situation currently.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		/*
		 * Object must already be present or we can't sign.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/* hold only the object lock while walking pages; the map can go */
	vm_object_lock(object);
	vm_map_unlock_read(map);

	while (start < end) {
		uint32_t refmod;

		m = vm_page_lookup(object,
		    start - entry->vme_start + VME_OFFSET(entry));
		if (m == VM_PAGE_NULL) {
			/* should we try to fault a page here? we can probably
			 * demand it exists and is locked for this request */
			vm_object_unlock(object);
			return KERN_FAILURE;
		}
		/* deal with special page status */
		if (m->vmp_busy ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
			vm_object_unlock(object);
			return KERN_FAILURE;
		}

		/* Page is OK... now "validate" it */
		/* This is the place where we'll call out to create a code
		 * directory, later */
		/* XXX TODO4K: deal with 4k subpages individually? */
		m->vmp_cs_validated = VMP_CS_ALL_TRUE;

		/* The page is now "clean" for codesigning purposes. That means
		 * we don't consider it as modified (wpmapped) anymore. But
		 * we'll disconnect the page so we note any future modification
		 * attempts. */
		m->vmp_wpmapped = FALSE;
		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

		/* Pull the dirty status from the pmap, since we cleared the
		 * wpmapped bit */
		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
			SET_PAGE_DIRTY(m, FALSE);
		}

		/* On to the next page */
		start += PAGE_SIZE;
	}
	vm_object_unlock(object);

	return KERN_SUCCESS;
}
21281 #endif
21282 
/*
 * Reap entries backed by internal VM objects with no other users
 * (ref_count == 1), deleting them from the map and tallying how many
 * resident and compressed pages were reclaimed.  Always returns
 * KERN_SUCCESS.  Note: the reclaimed_* counters are accumulated into,
 * not reset here; callers are expected to initialize them.
 */
kern_return_t
vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
{
	vm_map_entry_t  entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  next_entry;
	kern_return_t   kr = KERN_SUCCESS;
	VM_MAP_ZAP_DECLARE(zap_list);

	vm_map_lock(map);

	/* grab vme_next before vm_map_delete() can unlink the entry */
	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = next_entry) {
		next_entry = entry->vme_next;

		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    (VME_OBJECT(entry)->internal == TRUE) &&
		    (VME_OBJECT(entry)->ref_count == 1)) {
			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);

			(void)vm_map_delete(map, entry->vme_start,
			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
			    KMEM_GUARD_NONE, &zap_list);
		}
	}

	vm_map_unlock(map);

	/* actually free the zapped entries outside the map lock */
	vm_map_zap_dispose(&zap_list);

	return kr;
}
21317 
21318 
21319 #if DEVELOPMENT || DEBUG
21320 
/*
 * Remove all pmap mappings for every pageable entry in the map,
 * optionally unnesting shared submap regions first so only this task's
 * pmap is affected.  Returns the map's resident footprint (in map
 * pages, taken from the phys_mem ledger before disconnecting).
 */
int
vm_map_disconnect_page_mappings(
	vm_map_t map,
	boolean_t do_unnest)
{
	vm_map_entry_t entry;
	ledger_amount_t byte_count = 0;

	if (do_unnest == TRUE) {
#ifndef NO_NESTED_PMAP
		vm_map_lock(map);

		for (entry = vm_map_first_entry(map);
		    entry != vm_map_to_entry(map);
		    entry = entry->vme_next) {
			if (entry->is_sub_map && entry->use_pmap) {
				/*
				 * Make sure the range between the start of this entry and
				 * the end of this entry is no longer nested, so that
				 * we will only remove mappings from the pmap in use by this
				 * this task
				 */
				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
			}
		}
		vm_map_unlock(map);
#endif
	}
	vm_map_lock_read(map);

	/* snapshot the resident byte count before tearing down mappings */
	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		/* skip entries with no object or physically-contiguous ones */
		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
		    (VME_OBJECT(entry)->phys_contiguous))) {
			continue;
		}
		if (entry->is_sub_map) {
			/* any nested submaps should have been unnested above */
			assert(!entry->use_pmap);
		}

		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
	}
	vm_map_unlock_read(map);

	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
}
21370 
21371 kern_return_t
vm_map_inject_error(vm_map_t map,vm_map_offset_t vaddr)21372 vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
21373 {
21374 	vm_object_t object = NULL;
21375 	vm_object_offset_t offset;
21376 	vm_prot_t prot;
21377 	boolean_t wired;
21378 	vm_map_version_t version;
21379 	vm_map_t real_map;
21380 	int result = KERN_FAILURE;
21381 
21382 	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
21383 	vm_map_lock(map);
21384 
21385 	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
21386 	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
21387 	    NULL, &real_map, NULL);
21388 	if (object == NULL) {
21389 		result = KERN_MEMORY_ERROR;
21390 	} else if (object->pager) {
21391 		result = vm_compressor_pager_inject_error(object->pager,
21392 		    offset);
21393 	} else {
21394 		result = KERN_MEMORY_PRESENT;
21395 	}
21396 
21397 	if (object != NULL) {
21398 		vm_object_unlock(object);
21399 	}
21400 
21401 	if (real_map != map) {
21402 		vm_map_unlock(real_map);
21403 	}
21404 	vm_map_unlock(map);
21405 
21406 	return result;
21407 }
21408 
21409 #endif
21410 
21411 
21412 #if CONFIG_FREEZE
21413 
21414 
21415 extern struct freezer_context freezer_context_global;
21416 AbsoluteTime c_freezer_last_yield_ts = 0;
21417 
21418 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
21419 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
21420 
/*
 * vm_map_freeze:
 *
 * Walk the task's VM map and push eligible dirty anonymous pages into the
 * compressor ("freeze" them).
 *
 * When the freezer compressor is backed by swap
 * (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE), a first "evaluation" pass totals up
 * private vs. shared dirty pages and aborts if the process has too much
 * shared memory (FREEZER_ERROR_EXCESS_SHARED_MEMORY) or too low a
 * private:shared ratio (FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO).  If the
 * evaluation succeeds and "eval_only" is FALSE, the task's purgeable
 * memory is purged and a second pass ("again:") performs the actual
 * pageout, bounded by "dirty_budget" pages.  In the non-swap config the
 * evaluation pass is skipped entirely and "eval_only" is rejected.
 *
 * Out parameters: all counters are zeroed on entry; in this function only
 * "wired_count" and "shared_count" are subsequently updated
 * ("purgeable_count", "clean_count" and "dirty_count" are left at 0 here).
 * "shared_count" is reported in MB, not pages.
 * "freezer_error_code" is set on failure paths.
 *
 * The map is locked exclusively for the whole operation to block page
 * faults and lookups while freezing.
 */
kern_return_t
vm_map_freeze(
	task_t       task,
	unsigned int *purgeable_count,
	unsigned int *wired_count,
	unsigned int *clean_count,
	unsigned int *dirty_count,
	unsigned int dirty_budget,
	unsigned int *shared_count,
	int          *freezer_error_code,
	boolean_t    eval_only)
{
	vm_map_entry_t  entry2 = VM_MAP_ENTRY_NULL;
	kern_return_t   kr = KERN_SUCCESS;
	boolean_t       evaluation_phase = TRUE;
	vm_object_t     cur_shared_object = NULL;
	int             cur_shared_obj_ref_cnt = 0;
	unsigned int    dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;

	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;

	/*
	 * We need the exclusive lock here so that we can
	 * block any page faults or lookups while we are
	 * in the middle of freezing this vm map.
	 */
	vm_map_t map = task->map;

	vm_map_lock(map);

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	/* bail out early if the compressor/swap can't absorb more pages */
	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
		if (vm_compressor_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
		}

		if (vm_swap_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
		}

		kr = KERN_NO_SPACE;
		goto done;
	}

	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
		/*
		 * In-memory compressor backing the freezer. No disk.
		 * So no need to do the evaluation phase.
		 */
		evaluation_phase = FALSE;

		if (eval_only == TRUE) {
			/*
			 * We don't support 'eval_only' mode
			 * in this non-swap config.
			 */
			*freezer_error_code = FREEZER_ERROR_GENERIC;
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);
	}
again:
	/*
	 * This loop runs once for the evaluation phase (counting only) and,
	 * if that succeeds, a second time with evaluation_phase == FALSE to
	 * actually compress pages (the map lock is held across both passes).
	 */
	for (entry2 = vm_map_first_entry(map);
	    entry2 != vm_map_to_entry(map);
	    entry2 = entry2->vme_next) {
		vm_object_t src_object;

		if (entry2->is_sub_map) {
			continue;
		}

		/* only anonymous (internal), non-physically-contiguous objects */
		src_object = VME_OBJECT(entry2);
		if (!src_object ||
		    src_object->phys_contiguous ||
		    !src_object->internal) {
			continue;
		}

		/* If eligible, scan the entry, moving eligible pages over to our parent object */

		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
			/*
			 * We skip purgeable objects during evaluation phase only.
			 * If we decide to freeze this process, we'll explicitly
			 * purge these objects before we go around again with
			 * 'evaluation_phase' set to FALSE.
			 */

			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
				/*
				 * We want to purge objects that may not belong to this task but are mapped
				 * in this task alone. Since we already purged this task's purgeable memory
				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
				 * on this task's purgeable objects. Hence the check for only volatile objects.
				 */
				if (evaluation_phase == FALSE &&
				    (src_object->purgable == VM_PURGABLE_VOLATILE) &&
				    (src_object->ref_count == 1)) {
					vm_object_lock(src_object);
					vm_object_purge(src_object, 0);
					vm_object_unlock(src_object);
				}
				continue;
			}

			/*
			 * Pages belonging to this object could be swapped to disk.
			 * Make sure it's not a shared object because we could end
			 * up just bringing it back in again.
			 *
			 * We try to optimize somewhat by checking for objects that are mapped
			 * more than once within our own map. But we don't do full searches,
			 * we just look at the entries following our current entry.
			 */

			if (src_object->ref_count > 1) {
				if (src_object != cur_shared_object) {
					/* first sighting of this object: count it as shared */
					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
					dirty_shared_count += obj_pages_snapshot;

					cur_shared_object = src_object;
					cur_shared_obj_ref_cnt = 1;
					continue;
				} else {
					cur_shared_obj_ref_cnt++;
					if (src_object->ref_count == cur_shared_obj_ref_cnt) {
						/*
						 * Fall through to below and treat this object as private.
						 * So deduct its pages from our shared total and add it to the
						 * private total.
						 */

						dirty_shared_count -= obj_pages_snapshot;
						dirty_private_count += obj_pages_snapshot;
					} else {
						continue;
					}
				}
			}


			if (src_object->ref_count == 1) {
				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
			}

			if (evaluation_phase == TRUE) {
				/* counting only; no pageout in this pass */
				continue;
			}
		}

		/* freeze pass: actually push this object's pages to the compressor */
		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
		*wired_count += src_object->wired_page_count;

		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
			if (vm_compressor_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
			}

			if (vm_swap_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
			}

			kr = KERN_NO_SPACE;
			break;
		}
		if (paged_out_count >= dirty_budget) {
			break;
		}
		dirty_budget -= paged_out_count;
	}

	/* report shared footprint in MB */
	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
	if (evaluation_phase) {
		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;

		if (dirty_shared_count > shared_pages_threshold) {
			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
			kr = KERN_FAILURE;
			goto done;
		}

		if (dirty_shared_count &&
		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
			kr = KERN_FAILURE;
			goto done;
		}

		/* evaluation succeeded: reset counters and do the real pass */
		evaluation_phase = FALSE;
		dirty_shared_count = dirty_private_count = 0;

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);

		if (eval_only) {
			kr = KERN_SUCCESS;
			goto done;
		}

		/* purge this task's own purgeable memory before freezing */
		vm_purgeable_purge_task_owned(task);

		goto again;
	} else {
		kr = KERN_SUCCESS;
	}

done:
	vm_map_unlock(map);

	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
		vm_object_compressed_freezer_done();
	}
	return kr;
}
21640 
21641 #endif
21642 
21643 /*
21644  * vm_map_entry_should_cow_for_true_share:
21645  *
21646  * Determines if the map entry should be clipped and setup for copy-on-write
21647  * to avoid applying "true_share" to a large VM object when only a subset is
21648  * targeted.
21649  *
21650  * For now, we target only the map entries created for the Objective C
21651  * Garbage Collector, which initially have the following properties:
21652  *	- alias == VM_MEMORY_MALLOC
21653  *      - wired_count == 0
21654  *      - !needs_copy
21655  * and a VM object with:
21656  *      - internal
21657  *      - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
21658  *      - !true_share
21659  *      - vo_size == ANON_CHUNK_SIZE
21660  *
21661  * Only non-kernel map entries.
21662  */
21663 boolean_t
vm_map_entry_should_cow_for_true_share(vm_map_entry_t entry)21664 vm_map_entry_should_cow_for_true_share(
21665 	vm_map_entry_t  entry)
21666 {
21667 	vm_object_t     object;
21668 
21669 	if (entry->is_sub_map) {
21670 		/* entry does not point at a VM object */
21671 		return FALSE;
21672 	}
21673 
21674 	if (entry->needs_copy) {
21675 		/* already set for copy_on_write: done! */
21676 		return FALSE;
21677 	}
21678 
21679 	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
21680 	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
21681 		/* not a malloc heap or Obj-C Garbage Collector heap */
21682 		return FALSE;
21683 	}
21684 
21685 	if (entry->wired_count) {
21686 		/* wired: can't change the map entry... */
21687 		vm_counters.should_cow_but_wired++;
21688 		return FALSE;
21689 	}
21690 
21691 	object = VME_OBJECT(entry);
21692 
21693 	if (object == VM_OBJECT_NULL) {
21694 		/* no object yet... */
21695 		return FALSE;
21696 	}
21697 
21698 	if (!object->internal) {
21699 		/* not an internal object */
21700 		return FALSE;
21701 	}
21702 
21703 	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
21704 		/* not the default copy strategy */
21705 		return FALSE;
21706 	}
21707 
21708 	if (object->true_share) {
21709 		/* already true_share: too late to avoid it */
21710 		return FALSE;
21711 	}
21712 
21713 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
21714 	    object->vo_size != ANON_CHUNK_SIZE) {
21715 		/* ... not an object created for the ObjC Garbage Collector */
21716 		return FALSE;
21717 	}
21718 
21719 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
21720 	    object->vo_size != 2048 * 4096) {
21721 		/* ... not a "MALLOC_SMALL" heap */
21722 		return FALSE;
21723 	}
21724 
21725 	/*
21726 	 * All the criteria match: we have a large object being targeted for "true_share".
21727 	 * To limit the adverse side-effects linked with "true_share", tell the caller to
21728 	 * try and avoid setting up the entire object for "true_share" by clipping the
21729 	 * targeted range and setting it up for copy-on-write.
21730 	 */
21731 	return TRUE;
21732 }
21733 
21734 uint64_t vm_map_range_overflows_count = 0;
21735 TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
21736 bool
vm_map_range_overflows(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size)21737 vm_map_range_overflows(
21738 	vm_map_t map,
21739 	vm_map_offset_t addr,
21740 	vm_map_size_t size)
21741 {
21742 	vm_map_offset_t start, end, sum;
21743 	vm_map_offset_t pgmask;
21744 
21745 	if (size == 0) {
21746 		/* empty range -> no overflow */
21747 		return false;
21748 	}
21749 	pgmask = vm_map_page_mask(map);
21750 	start = vm_map_trunc_page_mask(addr, pgmask);
21751 	end = vm_map_round_page_mask(addr + size, pgmask);
21752 	if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
21753 		vm_map_range_overflows_count++;
21754 		if (vm_map_range_overflows_log) {
21755 			printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
21756 			    proc_selfpid(),
21757 			    proc_best_name(current_proc()),
21758 			    (uint64_t)addr,
21759 			    (uint64_t)size,
21760 			    (uint64_t)pgmask);
21761 		}
21762 		DTRACE_VM4(vm_map_range_overflows,
21763 		    vm_map_t, map,
21764 		    uint32_t, pgmask,
21765 		    uint64_t, (uint64_t)addr,
21766 		    uint64_t, (uint64_t)size);
21767 		return true;
21768 	}
21769 	return false;
21770 }
21771 
/*
 * vm_map_round_page_mask:
 *
 * Round "offset" up to the next page boundary described by "mask"
 * (thin wrapper around the VM_MAP_ROUND_PAGE() macro).
 */
vm_map_offset_t
vm_map_round_page_mask(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	return VM_MAP_ROUND_PAGE(offset, mask);
}
21779 
/*
 * vm_map_trunc_page_mask:
 *
 * Truncate "offset" down to the page boundary described by "mask"
 * (thin wrapper around the VM_MAP_TRUNC_PAGE() macro).
 */
vm_map_offset_t
vm_map_trunc_page_mask(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	return VM_MAP_TRUNC_PAGE(offset, mask);
}
21787 
/*
 * vm_map_page_aligned:
 *
 * TRUE if "offset" has none of the bits in "mask" set, i.e. it is
 * aligned to the page size that "mask" describes.
 */
boolean_t
vm_map_page_aligned(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	return ((offset) & mask) == 0;
}
21795 
/*
 * vm_map_page_shift:
 *
 * Accessor: the map's page shift (log2 of its page size).
 */
int
vm_map_page_shift(
	vm_map_t map)
{
	return VM_MAP_PAGE_SHIFT(map);
}
21802 
/*
 * vm_map_page_size:
 *
 * Accessor: the map's page size in bytes.
 */
int
vm_map_page_size(
	vm_map_t map)
{
	return VM_MAP_PAGE_SIZE(map);
}
21809 
/*
 * vm_map_page_mask:
 *
 * Accessor: the map's page mask (page size - 1).
 */
vm_map_offset_t
vm_map_page_mask(
	vm_map_t map)
{
	return VM_MAP_PAGE_MASK(map);
}
21816 
/*
 * vm_map_set_page_shift:
 *
 * Set the map's page shift (page size = 1 << pageshift).  Only allowed
 * while the map is still empty; returns KERN_FAILURE once entries exist.
 *
 * NOTE(review): "pageshift" is narrowed to uint16_t without a range
 * check — callers are trusted to pass a sane shift; confirm.
 */
kern_return_t
vm_map_set_page_shift(
	vm_map_t        map,
	int             pageshift)
{
	if (map->hdr.nentries != 0) {
		/* too late to change page size */
		return KERN_FAILURE;
	}

	map->hdr.page_shift = (uint16_t)pageshift;

	return KERN_SUCCESS;
}
21831 
/*
 * vm_map_query_volatile:
 *
 * Tally the virtual, resident, compressed and pmap-resident footprint of
 * the map's writable volatile/empty purgeable objects.  Sub-maps,
 * non-writable entries, entries with no object, and entries that do not
 * start at object offset 0 (to avoid double-counting a split object) are
 * skipped.  Sizes are returned in bytes.
 *
 * The map must be locked by the caller and remains locked on return.
 */
kern_return_t
vm_map_query_volatile(
	vm_map_t        map,
	mach_vm_size_t  *volatile_virtual_size_p,
	mach_vm_size_t  *volatile_resident_size_p,
	mach_vm_size_t  *volatile_compressed_size_p,
	mach_vm_size_t  *volatile_pmap_size_p,
	mach_vm_size_t  *volatile_compressed_pmap_size_p)
{
	mach_vm_size_t  volatile_virtual_size;
	mach_vm_size_t  volatile_resident_count;
	mach_vm_size_t  volatile_compressed_count;
	mach_vm_size_t  volatile_pmap_count;
	mach_vm_size_t  volatile_compressed_pmap_count;
	mach_vm_size_t  resident_count;
	vm_map_entry_t  entry;
	vm_object_t     object;

	/* map should be locked by caller */

	volatile_virtual_size = 0;
	volatile_resident_count = 0;
	volatile_compressed_count = 0;
	volatile_pmap_count = 0;
	volatile_compressed_pmap_count = 0;

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;

		if (entry->is_sub_map) {
			continue;
		}
		if (!(entry->protection & VM_PROT_WRITE)) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}
		if (object->purgable != VM_PURGABLE_VOLATILE &&
		    object->purgable != VM_PURGABLE_EMPTY) {
			continue;
		}
		if (VME_OFFSET(entry)) {
			/*
			 * If the map entry has been split and the object now
			 * appears several times in the VM map, we don't want
			 * to count the object's resident_page_count more than
			 * once.  We count it only for the first one, starting
			 * at offset 0 and ignore the other VM map entries.
			 */
			continue;
		}
		/*
		 * NOTE(review): VME_OFFSET(entry) is always 0 at this point
		 * (non-zero offsets were skipped just above), so the
		 * adjustment below never changes resident_count — it is
		 * effectively dead code kept for safety.
		 */
		resident_count = object->resident_page_count;
		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
			resident_count = 0;
		} else {
			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
		}

		volatile_virtual_size += entry->vme_end - entry->vme_start;
		volatile_resident_count += resident_count;
		if (object->pager) {
			volatile_compressed_count +=
			    vm_compressor_pager_get_count(object->pager);
		}
		/* ask the pmap how much of this range is actually mapped */
		pmap_compressed_bytes = 0;
		pmap_resident_bytes =
		    pmap_query_resident(map->pmap,
		    entry->vme_start,
		    entry->vme_end,
		    &pmap_compressed_bytes);
		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
		volatile_compressed_pmap_count += (pmap_compressed_bytes
		    / PAGE_SIZE);
	}

	/* map is still locked on return */

	*volatile_virtual_size_p = volatile_virtual_size;
	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;

	return KERN_SUCCESS;
}
21921 
21922 void
vm_map_sizes(vm_map_t map,vm_map_size_t * psize,vm_map_size_t * pfree,vm_map_size_t * plargest_free)21923 vm_map_sizes(vm_map_t map,
21924     vm_map_size_t * psize,
21925     vm_map_size_t * pfree,
21926     vm_map_size_t * plargest_free)
21927 {
21928 	vm_map_entry_t  entry;
21929 	vm_map_offset_t prev;
21930 	vm_map_size_t   free, total_free, largest_free;
21931 	boolean_t       end;
21932 
21933 	if (!map) {
21934 		*psize = *pfree = *plargest_free = 0;
21935 		return;
21936 	}
21937 	total_free = largest_free = 0;
21938 
21939 	vm_map_lock_read(map);
21940 	if (psize) {
21941 		*psize = map->max_offset - map->min_offset;
21942 	}
21943 
21944 	prev = map->min_offset;
21945 	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
21946 		end = (entry == vm_map_to_entry(map));
21947 
21948 		if (end) {
21949 			free = entry->vme_end   - prev;
21950 		} else {
21951 			free = entry->vme_start - prev;
21952 		}
21953 
21954 		total_free += free;
21955 		if (free > largest_free) {
21956 			largest_free = free;
21957 		}
21958 
21959 		if (end) {
21960 			break;
21961 		}
21962 		prev = entry->vme_end;
21963 	}
21964 	vm_map_unlock_read(map);
21965 	if (pfree) {
21966 		*pfree = total_free;
21967 	}
21968 	if (plargest_free) {
21969 		*plargest_free = largest_free;
21970 	}
21971 }
21972 
21973 #if VM_SCAN_FOR_SHADOW_CHAIN
21974 int vm_map_shadow_max(vm_map_t map);
/*
 * vm_map_shadow_max:
 *
 * Debug/statistics helper: walk every object mapped by "map" and return
 * the length of the longest shadow chain found.  Returns 0 for a NULL
 * map.  Objects are traversed hand-over-hand: the next shadow is locked
 * (shared) before the current object is unlocked.
 */
int
vm_map_shadow_max(
	vm_map_t map)
{
	int             shadows, shadows_max;
	vm_map_entry_t  entry;
	vm_object_t     object, next_object;

	if (map == NULL) {
		return 0;
	}

	shadows_max = 0;

	vm_map_lock_read(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == NULL) {
			continue;
		}
		vm_object_lock_shared(object);
		/* count the links in this object's shadow chain */
		for (shadows = 0;
		    object->shadow != NULL;
		    shadows++, object = next_object) {
			next_object = object->shadow;
			vm_object_lock_shared(next_object);
			vm_object_unlock(object);
		}
		vm_object_unlock(object);
		if (shadows > shadows_max) {
			shadows_max = shadows;
		}
	}

	vm_map_unlock_read(map);

	return shadows_max;
}
22019 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22020 
/*
 * vm_commit_pagezero_status:
 *
 * Tell the pmap layer about the map's minimum offset so it can adjust
 * its handling of the page-zero range for this address space.
 */
void
vm_commit_pagezero_status(vm_map_t lmap)
{
	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
}
22026 
22027 #if __x86_64__
/*
 * vm_map_set_high_start (x86_64 only):
 *
 * Record "high_start" in the map.  No locking is done here —
 * NOTE(review): presumably the caller serializes access; confirm.
 */
void
vm_map_set_high_start(
	vm_map_t        map,
	vm_map_offset_t high_start)
{
	map->vmmap_high_start = high_start;
}
22035 #endif /* __x86_64__ */
22036 
22037 #if CODE_SIGNING_MONITOR
22038 
/*
 * vm_map_entry_cs_associate:
 *
 * Tell the code-signing monitor (CSM) about an executable — or
 * about-to-be-debugged — mapping so the backing code signatures can be
 * associated with the pmap mappings.
 *
 * Non-executable entries are ignored, unless they are being remapped for
 * debugging (vmkf_remap_prot_copy with an executable max_protection).
 * JIT regions and debug regions are registered directly with the CSM;
 * otherwise we walk down the entry's object shadow chain and the pager
 * backing chain to find the vnode-backed object and associate its
 * code-signing blobs with the mapping.
 *
 * On success, the entry is marked "csm_associated" (so vm_fault() can
 * skip its own code-signing validation for this mapping) and, if
 * vm_map_executable_immutable is set, made permanent.  On
 * KERN_NOT_SUPPORTED, the VM keeps doing its own validation.  On any
 * other failure, execute permissions are stripped from the entry unless
 * vmkf_overwrite_immutable is set.
 *
 * The map must be locked exclusively.
 */
kern_return_t
vm_map_entry_cs_associate(
	vm_map_t                map,
	vm_map_entry_t          entry,
	vm_map_kernel_flags_t   vmk_flags)
{
	vm_object_t cs_object, cs_shadow, backing_object;
	vm_object_offset_t cs_offset, backing_offset;
	void *cs_blobs;
	struct vnode *cs_vnode;
	kern_return_t cs_ret;

	/* nothing to do without a pmap, an object, or in an exempt space */
	if (map->pmap == NULL ||
	    entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
	    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
	    VME_OBJECT(entry) == VM_OBJECT_NULL) {
		return KERN_SUCCESS;
	}

	if (!(entry->protection & VM_PROT_EXECUTE)) {
		/*
		 * This memory region is not executable, so the code-signing
		 * monitor would usually not care about it...
		 */
		if (vmk_flags.vmkf_remap_prot_copy &&
		    (entry->max_protection & VM_PROT_EXECUTE)) {
			/*
			 * ... except if the memory region is being remapped
			 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
			 * which is what a debugger or dtrace would be doing
			 * to prepare to modify an executable page to insert
			 * a breakpoint or activate a probe.
			 * In that case, fall through so that we can mark
			 * this region as being "debugged" and no longer
			 * strictly code-signed.
			 */
		} else {
			/*
			 * Really not executable, so no need to tell the
			 * code-signing monitor.
			 */
			return KERN_SUCCESS;
		}
	}

	vm_map_lock_assert_exclusive(map);

	if (entry->used_for_jit) {
		cs_ret = csm_associate_jit_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);
		goto done;
	}

	if (vmk_flags.vmkf_remap_prot_copy) {
		cs_ret = csm_associate_debug_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);
		if (cs_ret == KERN_SUCCESS) {
			entry->vme_xnu_user_debug = TRUE;
		}
#if DEVELOPMENT || DEBUG
		if (vm_log_xnu_user_debug) {
			printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ]  vme_xnu_user_debug=%d cs_ret %d\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
			    __FUNCTION__, __LINE__,
			    map, entry,
			    (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
			    entry->vme_xnu_user_debug,
			    cs_ret);
		}
#endif /* DEVELOPMENT || DEBUG */
		goto done;
	}

	cs_object = VME_OBJECT(entry);
	vm_object_lock_shared(cs_object);
	cs_offset = VME_OFFSET(entry);

	/* find the VM object backed by the code-signed vnode */
	for (;;) {
		/* go to the bottom of cs_object's shadow chain */
		for (;
		    cs_object->shadow != VM_OBJECT_NULL;
		    cs_object = cs_shadow) {
			cs_shadow = cs_object->shadow;
			cs_offset += cs_object->vo_shadow_offset;
			vm_object_lock_shared(cs_shadow);
			vm_object_unlock(cs_object);
		}
		/* anonymous or pagerless objects carry no code signatures */
		if (cs_object->internal ||
		    cs_object->pager == MEMORY_OBJECT_NULL) {
			vm_object_unlock(cs_object);
			return KERN_SUCCESS;
		}

		cs_offset += cs_object->paging_offset;

		/*
		 * cs_object could be backed by a:
		 *      vnode_pager
		 *	apple_protect_pager
		 *      shared_region_pager
		 *	fourk_pager (multiple backing objects -> fail?)
		 * ask the pager if it has a backing VM object
		 */
		if (!memory_object_backing_object(cs_object->pager,
		    cs_offset,
		    &backing_object,
		    &backing_offset)) {
			/* no backing object: cs_object is it */
			break;
		}

		/* look down the backing object's shadow chain */
		vm_object_lock_shared(backing_object);
		vm_object_unlock(cs_object);
		cs_object = backing_object;
		cs_offset = backing_offset;
	}

	cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
	if (cs_vnode == NULL) {
		/* no vnode, no code signatures to associate */
		cs_ret = KERN_SUCCESS;
	} else {
		cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
		    &cs_blobs);
		assert(cs_ret == KERN_SUCCESS);
		cs_ret = cs_associate_blob_with_mapping(map->pmap,
		    entry->vme_start,
		    (entry->vme_end - entry->vme_start),
		    cs_offset,
		    cs_blobs);
	}
	vm_object_unlock(cs_object);
	cs_object = VM_OBJECT_NULL;

done:
	if (cs_ret == KERN_SUCCESS) {
		DTRACE_VM2(vm_map_entry_cs_associate_success,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end);
		if (vm_map_executable_immutable) {
			/*
			 * Prevent this executable
			 * mapping from being unmapped
			 * or modified.
			 */
			entry->vme_permanent = TRUE;
		}
		/*
		 * pmap says it will validate the
		 * code-signing validity of pages
		 * faulted in via this mapping, so
		 * this map entry should be marked so
		 * that vm_fault() bypasses code-signing
		 * validation for faults coming through
		 * this mapping.
		 */
		entry->csm_associated = TRUE;
	} else if (cs_ret == KERN_NOT_SUPPORTED) {
		/*
		 * pmap won't check the code-signing
		 * validity of pages faulted in via
		 * this mapping, so VM should keep
		 * doing it.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_off,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
	} else {
		/*
		 * A real error: do not allow
		 * execution in this mapping.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_failure,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
		if (vmk_flags.vmkf_overwrite_immutable) {
			/*
			 * We can get here when we remap an apple_protect pager
			 * on top of an already cs_associated executable mapping
			 * with the same code signatures, so we don't want to
			 * lose VM_PROT_EXECUTE in that case...
			 */
		} else {
			entry->protection &= ~VM_PROT_ALLEXEC;
			entry->max_protection &= ~VM_PROT_ALLEXEC;
		}
	}

	return cs_ret;
}
22238 
22239 #endif /* CODE_SIGNING_MONITOR */
22240 
22241 inline bool
vm_map_is_corpse_source(vm_map_t map)22242 vm_map_is_corpse_source(vm_map_t map)
22243 {
22244 	bool status = false;
22245 	if (map) {
22246 		vm_map_lock_read(map);
22247 		status = map->corpse_source;
22248 		vm_map_unlock_read(map);
22249 	}
22250 	return status;
22251 }
22252 
22253 inline void
vm_map_set_corpse_source(vm_map_t map)22254 vm_map_set_corpse_source(vm_map_t map)
22255 {
22256 	if (map) {
22257 		vm_map_lock(map);
22258 		map->corpse_source = true;
22259 		vm_map_unlock(map);
22260 	}
22261 }
22262 
22263 inline void
vm_map_unset_corpse_source(vm_map_t map)22264 vm_map_unset_corpse_source(vm_map_t map)
22265 {
22266 	if (map) {
22267 		vm_map_lock(map);
22268 		map->corpse_source = false;
22269 		vm_map_unlock(map);
22270 	}
22271 }
22272 /*
22273  * FORKED CORPSE FOOTPRINT
22274  *
22275  * A forked corpse gets a copy of the original VM map but its pmap is mostly
22276  * empty since it never ran and never got to fault in any pages.
22277  * Collecting footprint info (via "sysctl vm.self_region_footprint") for
22278  * a forked corpse would therefore return very little information.
22279  *
22280  * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
22281  * to vm_map_fork() to collect footprint information from the original VM map
22282  * and its pmap, and store it in the forked corpse's VM map.  That information
22283  * is stored in place of the VM map's "hole list" since we'll never need to
22284  * lookup for holes in the corpse's map.
22285  *
22286  * The corpse's footprint info looks like this:
22287  *
22288  * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
22289  * as follows:
22290  *                     +---------------------------------------+
22291  *            header-> | cf_size                               |
22292  *                     +-------------------+-------------------+
22293  *                     | cf_last_region    | cf_last_zeroes    |
22294  *                     +-------------------+-------------------+
22295  *           region1-> | cfr_vaddr                             |
22296  *                     +-------------------+-------------------+
22297  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
22298  *                     +---------------------------------------+
22299  *                     | d4 | d5 | ...                         |
22300  *                     +---------------------------------------+
22301  *                     | ...                                   |
22302  *                     +-------------------+-------------------+
22303  *                     | dy | dz | na | na | cfr_vaddr...      | <-region2
22304  *                     +-------------------+-------------------+
22305  *                     | cfr_vaddr (ctd)   | cfr_num_pages     |
22306  *                     +---------------------------------------+
22307  *                     | d0 | d1 ...                           |
22308  *                     +---------------------------------------+
22309  *                       ...
22310  *                     +---------------------------------------+
22311  *       last region-> | cfr_vaddr                             |
22312  *                     +---------------------------------------+
22313  *                     + cfr_num_pages     | d0 | d1 | d2 | d3 |
22314  *                     +---------------------------------------+
22315  *                       ...
22316  *                     +---------------------------------------+
22317  *                     | dx | dy | dz | na | na | na | na | na |
22318  *                     +---------------------------------------+
22319  *
22320  * where:
22321  *      cf_size:	total size of the buffer (rounded to page size)
22322  *      cf_last_region:	offset in the buffer of the last "region" sub-header
22323  *	cf_last_zeroes: number of trailing "zero" dispositions at the end
22324  *			of last region
22325  *	cfr_vaddr:	virtual address of the start of the covered "region"
22326  *	cfr_num_pages:	number of pages in the covered "region"
22327  *	d*:		disposition of the page at that virtual address
22328  * Regions in the buffer are word-aligned.
22329  *
22330  * We estimate the size of the buffer based on the number of memory regions
22331  * and the virtual size of the address space.  While copying each memory region
22332  * during vm_map_fork(), we also collect the footprint info for that region
22333  * and store it in the buffer, packing it as much as possible (coalescing
22334  * contiguous memory regions to avoid having too many region headers and
22335  * avoiding long streaks of "zero" page dispositions by splitting footprint
22336  * "regions", so the number of regions in the footprint buffer might not match
22337  * the number of memory regions in the address space.
22338  *
22339  * We also have to copy the original task's "nonvolatile" ledgers since that's
22340  * part of the footprint and will need to be reported to any tool asking for
22341  * the footprint information of the forked corpse.
22342  */
22343 
/* statistics about corpse footprint buffers, for debugging and tuning */
uint64_t vm_map_corpse_footprint_count = 0;    /* footprints collected so far */
uint64_t vm_map_corpse_footprint_size_avg = 0; /* average actual footprint size */
uint64_t vm_map_corpse_footprint_size_max = 0; /* largest actual footprint size */
uint64_t vm_map_corpse_footprint_full = 0;     /* collections that ran out of buffer space */
uint64_t vm_map_corpse_footprint_no_buf = 0;   /* failed footprint buffer allocations */
22349 
/*
 * Header at the start of the corpse footprint buffer.
 * The "region" sub-headers and their page dispositions follow this
 * header in the same buffer (see layout diagram above).
 * The union member is re-purposed once collection is done: it tracks
 * trailing zeroes during creation and becomes a lookup hint afterwards.
 */
struct vm_map_corpse_footprint_header {
	vm_size_t       cf_size;        /* allocated buffer size */
	uint32_t        cf_last_region; /* offset of last region in buffer */
	union {
		uint32_t cfu_last_zeroes; /* during creation:
		                           * number of "zero" dispositions at
		                           * end of last region */
		uint32_t cfu_hint_region; /* during lookup:
		                           * offset of last looked up region */
#define cf_last_zeroes cfu.cfu_last_zeroes
#define cf_hint_region cfu.cfu_hint_region
	} cfu;
};
/* compact 1-byte encoding of a page disposition (see vm_page_disposition_to_cf_disp()) */
typedef uint8_t cf_disp_t;
/*
 * Per-region sub-header in the corpse footprint buffer, immediately
 * followed by one cf_disp_t per page; packed to keep the buffer small.
 */
struct vm_map_corpse_footprint_region {
	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
	cf_disp_t   cfr_disposition[0]; /* disposition of each page */
} __attribute__((packed));
22369 
22370 static cf_disp_t
vm_page_disposition_to_cf_disp(int disposition)22371 vm_page_disposition_to_cf_disp(
22372 	int disposition)
22373 {
22374 	assert(sizeof(cf_disp_t) == 1);
22375 	/* relocate bits that don't fit in a "uint8_t" */
22376 	if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
22377 		disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
22378 	}
22379 	/* cast gets rid of extra bits */
22380 	return (cf_disp_t) disposition;
22381 }
22382 
22383 static int
vm_page_cf_disp_to_disposition(cf_disp_t cf_disp)22384 vm_page_cf_disp_to_disposition(
22385 	cf_disp_t cf_disp)
22386 {
22387 	int disposition;
22388 
22389 	assert(sizeof(cf_disp_t) == 1);
22390 	disposition = (int) cf_disp;
22391 	/* move relocated bits back in place */
22392 	if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
22393 		disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
22394 		disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
22395 	}
22396 	return disposition;
22397 }
22398 
/*
 * vm_map_corpse_footprint_new_region:
 *      closes the current footprint "region" and creates a new one
 *
 * Returns NULL if there's not enough space in the buffer for a new region.
 * The returned region is zero-initialized; the caller is responsible for
 * setting its cfr_vaddr.
 */
static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(
	struct vm_map_corpse_footprint_header *footprint_header)
{
	uintptr_t       footprint_edge;
	uint32_t        new_region_offset;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;

	/* first address past the end of the footprint buffer */
	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);
	/* locate the current (last) region in the buffer */
	footprint_region = ((struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region));
	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
	    footprint_edge);

	/* get rid of trailing zeroes in the last region */
	assert(footprint_region->cfr_num_pages >=
	    footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -=
	    footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* reuse this region if it's now empty */
	if (footprint_region->cfr_num_pages == 0) {
		return footprint_region;
	}

	/* compute offset of new region: past the last region's dispositions */
	new_region_offset = footprint_header->cf_last_region;
	new_region_offset += sizeof(*footprint_region);
	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
	/* regions are kept word-aligned in the buffer */
	new_region_offset = roundup(new_region_offset, sizeof(int));

	/* check if we're going over the edge */
	if (((uintptr_t)footprint_header +
	    new_region_offset +
	    sizeof(*footprint_region)) >=
	    footprint_edge) {
		/* over the edge: no new region */
		return NULL;
	}

	/* adjust offset of last region in header */
	footprint_header->cf_last_region = new_region_offset;

	new_footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);
	new_footprint_region->cfr_vaddr = 0;
	new_footprint_region->cfr_num_pages = 0;
	/* caller needs to initialize new region */

	return new_footprint_region;
}
22461 
/*
 * vm_map_corpse_footprint_collect:
 *	collect footprint information for "old_entry" in "old_map" and
 *	stores it in "new_map"'s vmmap_footprint_info.
 *
 * On the first call for "new_map", allocates the pageable footprint
 * buffer (sized from the number of map entries and the map's virtual
 * size, capped at VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE).
 * Both maps must be locked exclusively by the caller.
 *
 * Returns KERN_NOT_SUPPORTED if the maps' corpse-footprint roles are
 * wrong, the kmem_alloc() error if the buffer can't be allocated, and
 * KERN_RESOURCE_SHORTAGE if the buffer fills up.
 */
kern_return_t
vm_map_corpse_footprint_collect(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_map_offset_t va;
	kern_return_t   kr;
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;
	cf_disp_t       *next_disp_p;
	uintptr_t       footprint_edge;
	uint32_t        num_pages_tmp;
	int             effective_page_size;

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));

	va = old_entry->vme_start;

	vm_map_lock_assert_exclusive(old_map);
	vm_map_lock_assert_exclusive(new_map);

	assert(new_map->has_corpse_footprint);
	assert(!old_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    old_map->has_corpse_footprint) {
		/*
		 * This can only transfer footprint info from a
		 * map with a live pmap to a map with a corpse footprint.
		 */
		return KERN_NOT_SUPPORTED;
	}

	if (new_map->vmmap_corpse_footprint == NULL) {
		/* first entry for this corpse: allocate the footprint buffer */
		vm_offset_t     buf;
		vm_size_t       buf_size;

		buf = 0;
		/* estimate: header + per-entry region headers (+ alignment)
		 * + one disposition byte per page of virtual size */
		buf_size = (sizeof(*footprint_header) +
		    (old_map->hdr.nentries
		    *
		    (sizeof(*footprint_region) +
		    +3))            /* potential alignment for each region */
		    +
		    ((old_map->size / effective_page_size)
		    *
		    sizeof(cf_disp_t)));      /* disposition for each page */
//		printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
		buf_size = round_page(buf_size);

		/* limit buffer to 1 page to validate overflow detection */
//		buf_size = PAGE_SIZE;

		/* limit size to a somewhat sane amount */
#if XNU_TARGET_OS_OSX
#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
#else /* XNU_TARGET_OS_OSX */
#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
#endif /* XNU_TARGET_OS_OSX */
		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
		}

		/*
		 * Allocate the pageable buffer (with a trailing guard page).
		 * It will be zero-filled on demand.
		 */
		kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
		    KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
		    VM_KERN_MEMORY_DIAG);
		if (kr != KERN_SUCCESS) {
			vm_map_corpse_footprint_no_buf++;
			return kr;
		}

		/* initialize header and 1st region */
		footprint_header = (struct vm_map_corpse_footprint_header *)buf;
		new_map->vmmap_corpse_footprint = footprint_header;

		footprint_header->cf_size = buf_size;
		footprint_header->cf_last_region =
		    sizeof(*footprint_header);
		footprint_header->cf_last_zeroes = 0;

		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header +
		    footprint_header->cf_last_region);
		footprint_region->cfr_vaddr = 0;
		footprint_region->cfr_num_pages = 0;
	} else {
		/* retrieve header and last region */
		footprint_header = (struct vm_map_corpse_footprint_header *)
		    new_map->vmmap_corpse_footprint;
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header +
		    footprint_header->cf_last_region);
	}
	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);

	if ((footprint_region->cfr_vaddr +
	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
	    effective_page_size))
	    != old_entry->vme_start) {
		uint64_t num_pages_delta, num_pages_delta_size;
		uint32_t region_offset_delta_size;

		/*
		 * Not the next contiguous virtual address:
		 * start a new region or store "zero" dispositions for
		 * the missing pages?
		 */
		/* size of gap in actual page dispositions */
		num_pages_delta = ((old_entry->vme_start -
		    footprint_region->cfr_vaddr) / effective_page_size)
		    - footprint_region->cfr_num_pages;
		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
		/* size of gap as a new footprint region header */
		region_offset_delta_size =
		    (sizeof(*footprint_region) +
		    roundup(((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
		    sizeof(int)) -
		    ((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
//		printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
		if (region_offset_delta_size < num_pages_delta_size ||
		    os_add3_overflow(footprint_region->cfr_num_pages,
		    (uint32_t) num_pages_delta,
		    1,
		    &num_pages_tmp)) {
			/*
			 * Storing data for this gap would take more space
			 * than inserting a new footprint region header:
			 * let's start a new region and save space. If it's a
			 * tie, let's avoid using a new region, since that
			 * would require more region hops to find the right
			 * range during lookups.
			 *
			 * If the current region's cfr_num_pages would overflow
			 * if we added "zero" page dispositions for the gap,
			 * no choice but to start a new region.
			 */
//			printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
			new_footprint_region =
			    vm_map_corpse_footprint_new_region(footprint_header);
			/* check that we're not going over the edge */
			if (new_footprint_region == NULL) {
				goto over_the_edge;
			}
			footprint_region = new_footprint_region;
			/* initialize new region as empty */
			footprint_region->cfr_vaddr = old_entry->vme_start;
			footprint_region->cfr_num_pages = 0;
		} else {
			/*
			 * Store "zero" page dispositions for the missing
			 * pages.
			 */
//			printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
			for (; num_pages_delta > 0; num_pages_delta--) {
				next_disp_p = (cf_disp_t *)
				    ((uintptr_t) footprint_region +
				    sizeof(*footprint_region));
				next_disp_p += footprint_region->cfr_num_pages;
				/* check that we're not going over the edge */
				if ((uintptr_t)next_disp_p >= footprint_edge) {
					goto over_the_edge;
				}
				/* store "zero" disposition for this gap page */
				footprint_region->cfr_num_pages++;
				*next_disp_p = (cf_disp_t) 0;
				footprint_header->cf_last_zeroes++;
			}
		}
	}

	/* collect the disposition of each page covered by "old_entry" */
	for (va = old_entry->vme_start;
	    va < old_entry->vme_end;
	    va += effective_page_size) {
		int             disposition;
		cf_disp_t       cf_disp;

		vm_map_footprint_query_page_info(old_map,
		    old_entry,
		    va,
		    &disposition);
		cf_disp = vm_page_disposition_to_cf_disp(disposition);

//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);

		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
			/*
			 * Ignore "zero" dispositions at start of
			 * region: just move start of region.
			 */
			footprint_region->cfr_vaddr += effective_page_size;
			continue;
		}

		/* would region's cfr_num_pages overflow? */
		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
		    &num_pages_tmp)) {
			/* overflow: create a new region */
			new_footprint_region =
			    vm_map_corpse_footprint_new_region(
				footprint_header);
			if (new_footprint_region == NULL) {
				goto over_the_edge;
			}
			footprint_region = new_footprint_region;
			footprint_region->cfr_vaddr = va;
			footprint_region->cfr_num_pages = 0;
		}

		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
		    sizeof(*footprint_region));
		next_disp_p += footprint_region->cfr_num_pages;
		/* check that we're not going over the edge */
		if ((uintptr_t)next_disp_p >= footprint_edge) {
			goto over_the_edge;
		}
		/* store this disposition */
		*next_disp_p = cf_disp;
		footprint_region->cfr_num_pages++;

		if (cf_disp != 0) {
			/* non-zero disp: break the current zero streak */
			footprint_header->cf_last_zeroes = 0;
			/* done */
			continue;
		}

		/* zero disp: add to the current streak of zeroes */
		footprint_header->cf_last_zeroes++;
		if ((footprint_header->cf_last_zeroes +
		    roundup(((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
		    (sizeof(int) - 1),
		    sizeof(int))) <
		    (sizeof(*footprint_header))) {
			/*
			 * There are not enough trailing "zero" dispositions
			 * (+ the extra padding we would need for the previous
			 * region); creating a new region would not save space
			 * at this point, so let's keep this "zero" disposition
			 * in this region and reconsider later.
			 */
			continue;
		}
		/*
		 * Create a new region to avoid having too many consecutive
		 * "zero" dispositions.
		 */
		new_footprint_region =
		    vm_map_corpse_footprint_new_region(footprint_header);
		if (new_footprint_region == NULL) {
			goto over_the_edge;
		}
		footprint_region = new_footprint_region;
		/* initialize the new region as empty ... */
		footprint_region->cfr_num_pages = 0;
		/* ... and skip this "zero" disp */
		footprint_region->cfr_vaddr = va + effective_page_size;
	}

	return KERN_SUCCESS;

over_the_edge:
//	printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
	vm_map_corpse_footprint_full++;
	return KERN_RESOURCE_SHORTAGE;
}
22741 
/*
 * vm_map_corpse_footprint_collect_done:
 *	completes the footprint collection by getting rid of any remaining
 *	trailing "zero" dispositions and trimming the unused part of the
 *	kernel buffer
 *
 * Also updates the footprint statistics (count / average / max size).
 */
void
vm_map_corpse_footprint_collect_done(
	vm_map_t        new_map)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	vm_size_t       buf_size, actual_size;
	kern_return_t   kr;

	assert(new_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    new_map->vmmap_corpse_footprint == NULL) {
		/* nothing was collected: nothing to finalize */
		return;
	}

	footprint_header = (struct vm_map_corpse_footprint_header *)
	    new_map->vmmap_corpse_footprint;
	buf_size = footprint_header->cf_size;

	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);

	/* get rid of trailing zeroes in last region */
	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* size actually used: up to the end of the last region's dispositions */
	actual_size = (vm_size_t)(footprint_header->cf_last_region +
	    sizeof(*footprint_region) +
	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));

//	printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
	/* update running average / count / max statistics */
	vm_map_corpse_footprint_size_avg =
	    (((vm_map_corpse_footprint_size_avg *
	    vm_map_corpse_footprint_count) +
	    actual_size) /
	    (vm_map_corpse_footprint_count + 1));
	vm_map_corpse_footprint_count++;
	if (actual_size > vm_map_corpse_footprint_size_max) {
		vm_map_corpse_footprint_size_max = actual_size;
	}

	actual_size = round_page(actual_size);
	if (buf_size > actual_size) {
		/* free the unused tail of the buffer (past the new guard page) */
		kr = vm_deallocate(kernel_map,
		    ((vm_address_t)footprint_header +
		    actual_size +
		    PAGE_SIZE),                 /* trailing guard page */
		    (buf_size - actual_size));
		assertf(kr == KERN_SUCCESS,
		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
		/* turn the first trimmed page into the new guard page */
		kr = vm_protect(kernel_map,
		    ((vm_address_t)footprint_header +
		    actual_size),
		    PAGE_SIZE,
		    FALSE,             /* set_maximum */
		    VM_PROT_NONE);
		assertf(kr == KERN_SUCCESS,
		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
	}

	footprint_header->cf_size = actual_size;
}
22820 
/*
 * vm_map_corpse_footprint_query_page_info:
 *	retrieves the disposition of the page at virtual address "vaddr"
 *	in the forked corpse's VM map
 *
 * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
 *
 * The lookup starts from the hint region recorded by the previous lookup
 * (cf_hint_region) and scans forward; the hint updates are intentionally
 * done without synchronization ("racy") since they only affect lookup
 * speed, not correctness.
 * Pages not covered by any region report a "zero" disposition with
 * KERN_SUCCESS; a map without footprint info reports KERN_INVALID_ARGUMENT.
 */
kern_return_t
vm_map_corpse_footprint_query_page_info(
	vm_map_t        map,
	vm_map_offset_t va,
	int             *disposition_p)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	uint32_t        footprint_region_offset;
	vm_map_offset_t region_start, region_end;
	int             disp_idx;
	kern_return_t   kr;
	int             effective_page_size;
	cf_disp_t       cf_disp;

	if (!map->has_corpse_footprint) {
		*disposition_p = 0;
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	footprint_header = map->vmmap_corpse_footprint;
	if (footprint_header == NULL) {
		*disposition_p = 0;
//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	/* start looking at the hint ("cf_hint_region") */
	footprint_region_offset = footprint_header->cf_hint_region;

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

lookup_again:
	if (footprint_region_offset < sizeof(*footprint_header)) {
		/* hint too low: start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	if (footprint_region_offset >= footprint_header->cf_last_region) {
		/* hint too high: re-start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header + footprint_region_offset);
	region_start = footprint_region->cfr_vaddr;
	region_end = (region_start +
	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
	    effective_page_size));
	if (va < region_start &&
	    footprint_region_offset != sizeof(*footprint_header)) {
		/* our range starts before the hint region */

		/* reset the hint (in a racy way...) */
		footprint_header->cf_hint_region = sizeof(*footprint_header);
		/* lookup "va" again from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
		goto lookup_again;
	}

	/* scan forward, region by region, until "va" is covered or we run out */
	while (va >= region_end) {
		if (footprint_region_offset >= footprint_header->cf_last_region) {
			break;
		}
		/* skip the region's header */
		footprint_region_offset += sizeof(*footprint_region);
		/* skip the region's page dispositions */
		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
		/* align to next word boundary */
		footprint_region_offset =
		    roundup(footprint_region_offset,
		    sizeof(int));
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header + footprint_region_offset);
		region_start = footprint_region->cfr_vaddr;
		region_end = (region_start +
		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
		    effective_page_size));
	}
	if (va < region_start || va >= region_end) {
		/* page not found */
		*disposition_p = 0;
//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_SUCCESS;
		goto done;
	}

	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
	footprint_header->cf_hint_region = footprint_region_offset;

	/* get page disposition for "va" in this region */
	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
	cf_disp = footprint_region->cfr_disposition[disp_idx];
	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
	kr = KERN_SUCCESS;
done:
//	if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
	DTRACE_VM4(footprint_query_page_info,
	    vm_map_t, map,
	    vm_map_offset_t, va,
	    int, *disposition_p,
	    kern_return_t, kr);

	return kr;
}
22934 
/*
 * vm_map_corpse_footprint_destroy:
 *	releases the corpse footprint buffer (including its trailing
 *	guard page) and clears the map's footprint state.
 */
void
vm_map_corpse_footprint_destroy(
	vm_map_t        map)
{
	if (map->has_corpse_footprint &&
	    map->vmmap_corpse_footprint != 0) {
		struct vm_map_corpse_footprint_header *footprint_header;
		vm_size_t buf_size;
		kern_return_t kr;

		footprint_header = map->vmmap_corpse_footprint;
		buf_size = footprint_header->cf_size;
		kr = vm_deallocate(kernel_map,
		    (vm_offset_t) map->vmmap_corpse_footprint,
		    ((vm_size_t) buf_size
		    + PAGE_SIZE));                 /* trailing guard page */
		assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
		map->vmmap_corpse_footprint = 0;
		map->has_corpse_footprint = FALSE;
	}
}
22956 
/*
 * vm_map_copy_footprint_ledgers:
 *	copies any ledger that's relevant to the memory footprint of "old_task"
 *	into the forked corpse's task ("new_task")
 *
 * Each copy goes through vm_map_copy_ledger(), which adjusts the
 * corpse's balance to match the original task's.
 */
void
vm_map_copy_footprint_ledgers(
	task_t  old_task,
	task_t  new_task)
{
	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
}
22988 
22989 /*
22990  * vm_map_copy_ledger:
22991  *	copy a single ledger from "old_task" to "new_task"
22992  */
22993 void
vm_map_copy_ledger(task_t old_task,task_t new_task,int ledger_entry)22994 vm_map_copy_ledger(
22995 	task_t  old_task,
22996 	task_t  new_task,
22997 	int     ledger_entry)
22998 {
22999 	ledger_amount_t old_balance, new_balance, delta;
23000 
23001 	assert(new_task->map->has_corpse_footprint);
23002 	if (!new_task->map->has_corpse_footprint) {
23003 		return;
23004 	}
23005 
23006 	/* turn off sanity checks for the ledger we're about to mess with */
23007 	ledger_disable_panic_on_negative(new_task->ledger,
23008 	    ledger_entry);
23009 
23010 	/* adjust "new_task" to match "old_task" */
23011 	ledger_get_balance(old_task->ledger,
23012 	    ledger_entry,
23013 	    &old_balance);
23014 	ledger_get_balance(new_task->ledger,
23015 	    ledger_entry,
23016 	    &new_balance);
23017 	if (new_balance == old_balance) {
23018 		/* new == old: done */
23019 	} else if (new_balance > old_balance) {
23020 		/* new > old ==> new -= new - old */
23021 		delta = new_balance - old_balance;
23022 		ledger_debit(new_task->ledger,
23023 		    ledger_entry,
23024 		    delta);
23025 	} else {
23026 		/* new < old ==> new += old - new */
23027 		delta = old_balance - new_balance;
23028 		ledger_credit(new_task->ledger,
23029 		    ledger_entry,
23030 		    delta);
23031 	}
23032 }
23033 
/*
 * vm_map_get_pmap:
 * returns the pmap associated with the vm_map
 *
 * Thin public accessor around the vm_map_pmap() macro/inline.
 */
pmap_t
vm_map_get_pmap(vm_map_t map)
{
	return vm_map_pmap(map);
}
23043 
23044 #if CONFIG_MAP_RANGES
/* bitmap of VM memory tags routed to the "heap" user range (see vm_map_range_map_init()) */
static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];

/* user range IDs must match the mach_vm_range IDs they are exchanged with */
static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23049 
/*
 * vm_map_range_map_init:
 *  initializes the VM range ID map to enable index lookup
 *  of user VM ranges based on VM tag from userspace.
 *
 * Sets, in vm_map_user_range_heap_map, the bit of every VM memory tag
 * whose allocations should land in the "heap" range.
 */
static void
vm_map_range_map_init(void)
{
	/*
	 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
	 * - the former is malloc metadata which should be kept separate
	 * - the latter has its own ranges
	 */
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
}
23080 
/*
 * vm_map_range_random_uniform:
 *	picks a random "offmask"-aligned range of (rounded-up) size
 *	"req_size" within [min_addr, max_addr).
 */
static struct mach_vm_range
vm_map_range_random_uniform(
	vm_map_size_t           req_size,
	vm_map_offset_t         min_addr,
	vm_map_offset_t         max_addr,
	vm_map_offset_t         offmask)
{
	vm_map_offset_t random_addr;
	struct mach_vm_range alloc;

	/* align size and bounds to the (offmask + 1) boundary */
	req_size = (req_size + offmask) & ~offmask;
	min_addr = (min_addr + offmask) & ~offmask;
	max_addr = max_addr & ~offmask;

	read_random(&random_addr, sizeof(random_addr));
	/*
	 * NOTE(review): assumes max_addr - min_addr > req_size after
	 * alignment; a zero modulus here would be a division by zero
	 * (and a wrapped one would yield an out-of-bounds offset) --
	 * confirm callers guarantee this.
	 */
	random_addr %= (max_addr - req_size - min_addr);
	random_addr &= ~offmask;

	alloc.min_address = min_addr + random_addr;
	alloc.max_address = min_addr + random_addr + req_size;
	return alloc;
}
23103 
/*
 * vm_map_range_offmask:
 *	returns the alignment mask (low bits set) that VM ranges should
 *	be aligned to in order to avoid partially-used page-table pages,
 *	or 0 when range randomization should be disabled entirely.
 */
static vm_map_offset_t
vm_map_range_offmask(void)
{
	uint32_t pte_depth;

	/*
	 * PTE optimizations
	 *
	 *
	 * 16k pages systems
	 * ~~~~~~~~~~~~~~~~~
	 *
	 * A single L1 (sub-)page covers the address space.
	 * - L2 pages cover 64G,
	 * - L3 pages cover 32M.
	 *
	 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
	 * As a result, we really only need to align the ranges to 32M to avoid
	 * partial L3 pages.
	 *
	 * On macOS, the usage of L2 pages will increase, so as a result we will
	 * want to align ranges to 64G in order to utilize them fully.
	 *
	 *
	 * 4k pages systems
	 * ~~~~~~~~~~~~~~~~
	 *
	 * A single L0 (sub-)page covers the address space.
	 * - L1 pages cover 512G,
	 * - L2 pages cover 1G,
	 * - L3 pages cover 2M.
	 *
	 * The long tail of processes on a system will tend to have a VA usage
	 * (ignoring the shared regions) in the 100s of MB order of magnitude.
	 * This is achievable with a single L1 and a few L2s without
	 * randomization.
	 *
	 * However once randomization is introduced, the system will immediately
	 * need several L1s and many more L2s. As a result:
	 *
	 * - on embedded devices, the cost of these extra pages isn't
	 *   sustainable, and we just disable the feature entirely,
	 *
	 * - on macOS we align ranges to a 512G boundary so that the extra L1
	 *   pages can be used to their full potential.
	 */

	/*
	 * note, this function assumes _non exotic mappings_
	 * which is why it uses the native kernel's PAGE_SHIFT.
	 */
#if XNU_PLATFORM_MacOSX
	pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
#else /* !XNU_PLATFORM_MacOSX */
	pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
#endif /* !XNU_PLATFORM_MacOSX */

	if (pte_depth == 0) {
		/* randomization disabled on 4k-page embedded (see above) */
		return 0;
	}

	/* mask covering "pte_depth" levels of page-table span */
	return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
}
23167 
23168 /*
23169  * vm_map_range_configure:
23170  *	configures the user vm_map ranges by increasing the maximum VA range of
23171  *  the map and carving out a range at the end of VA space (searching backwards
23172  *  in the newly expanded map).
23173  */
23174 kern_return_t
vm_map_range_configure(vm_map_t map)23175 vm_map_range_configure(vm_map_t map)
23176 {
23177 	const vm_map_offset_t offmask = vm_map_range_offmask();
23178 	struct mach_vm_range data_range;
23179 	vm_map_offset_t default_end;
23180 	kern_return_t kr;
23181 
23182 	if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
23183 		/*
23184 		 * No point doing vm ranges in a 32bit address space.
23185 		 */
23186 		return KERN_NOT_SUPPORTED;
23187 	}
23188 
23189 	/* Should not be applying ranges to kernel map or kernel map submaps */
23190 	assert(vm_map_pmap(map) != kernel_pmap);
23191 
23192 #if XNU_PLATFORM_MacOSX
23193 
23194 	/*
23195 	 * on macOS, the address space is a massive 47 bits (128T),
23196 	 * with several carve outs that processes can't use:
23197 	 * - the shared region
23198 	 * - the commpage region
23199 	 * - the GPU carve out (if applicable)
23200 	 *
23201 	 * and when nano-malloc is in use it desires memory at the 96T mark.
23202 	 *
23203 	 * However, their location is architecture dependent:
23204 	 * - On intel, the shared region and commpage are
23205 	 *   at the very end of the usable address space (above +127T),
23206 	 *   and there is no GPU carve out, and pthread wants to place
23207 	 *   threads at the 112T mark (0x70T).
23208 	 *
23209 	 * - On arm64, these are in the same spot as on embedded devices:
23210 	 *   o shared region:   [ 6G,  10G)  [ will likely grow over time ]
23211 	 *   o commpage region: [63G,  64G)
23212 	 *   o GPU carve out:   [64G, 448G)
23213 	 *
23214 	 * This is conveninent because the mappings at the end of the address
23215 	 * space (when they exist) are made by the kernel.
23216 	 *
23217 	 * The policy is to allocate a random 1T for the data heap
23218 	 * in the end of the address-space in the:
23219 	 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
23220 	 * - [0x61, 0x7f) range on ASM (to leave space for Nano malloc).
23221 	 */
23222 
23223 	/* see NANOZONE_SIGNATURE in libmalloc */
23224 #if __x86_64__
23225 	default_end = 0x71ull << 40;
23226 #else
23227 	default_end = 0x61ull << 40;
23228 #endif
23229 	data_range  = vm_map_range_random_uniform(1ull << 40,
23230 	        default_end, 0x7full << 40, offmask);
23231 
23232 #else /* !XNU_PLATFORM_MacOSX */
23233 
23234 	/*
23235 	 * Embedded devices:
23236 	 *
23237 	 *   The default VA Size scales with the device physical memory.
23238 	 *
23239 	 *   Out of that:
23240 	 *   - the "zero" page typically uses 4G + some slide
23241 	 *   - the shared region uses SHARED_REGION_SIZE bytes (4G)
23242 	 *
23243 	 *   Without the use of jumbo or any adjustment to the address space,
23244 	 *   a default VM map typically looks like this:
23245 	 *
23246 	 *       0G -->╒════════════╕
23247 	 *             │  pagezero  │
23248 	 *             │  + slide   │
23249 	 *      ~4G -->╞════════════╡<-- vm_map_min(map)
23250 	 *             │            │
23251 	 *       6G -->├────────────┤
23252 	 *             │   shared   │
23253 	 *             │   region   │
23254 	 *      10G -->├────────────┤
23255 	 *             │            │
23256 	 *   max_va -->├────────────┤<-- vm_map_max(map)
23257 	 *             │            │
23258 	 *             ╎   jumbo    ╎
23259 	 *             ╎            ╎
23260 	 *             │            │
23261 	 *      63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
23262 	 *             │  commpage  │
23263 	 *      64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
23264 	 *             │            │
23265 	 *             ╎    GPU     ╎
23266 	 *             ╎  carveout  ╎
23267 	 *             │            │
23268 	 *     448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
23269 	 *             │            │
23270 	 *             ╎            ╎
23271 	 *             ╎            ╎
23272 	 *             │            │
23273 	 *     512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
23274 	 *
23275 	 *   When this drawing was made, "max_va" was smaller than
23276 	 *   ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
23277 	 *   12G of address space for the zero-page, slide, files,
23278 	 *   binaries, heap ...
23279 	 *
23280 	 *   We will want to make a "heap/data" carve out inside
23281 	 *   the jumbo range of half of that usable space, assuming
23282 	 *   that this is less than a forth of the jumbo range.
23283 	 *
23284 	 *   The assert below intends to catch when max_va grows
23285 	 *   too large for this heuristic.
23286 	 */
23287 
23288 	vm_map_lock_read(map);
23289 	default_end = vm_map_max(map);
23290 	vm_map_unlock_read(map);
23291 
23292 	/*
23293 	 * Check that we're not already jumbo'd,
23294 	 * or our address space was somehow modified.
23295 	 *
23296 	 * If so we cannot guarantee that we can set up the ranges
23297 	 * safely without interfering with the existing map.
23298 	 */
23299 	if (default_end > vm_compute_max_offset(true)) {
23300 		return KERN_NO_SPACE;
23301 	}
23302 
23303 	if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
23304 		/*
23305 		 * an override boot-arg was set, disable user-ranges
23306 		 *
23307 		 * XXX: this is problematic because it means these boot-args
23308 		 *      no longer test the behavior changing the value
23309 		 *      of ARM64_MAX_OFFSET_DEVICE_* would have.
23310 		 */
23311 		return KERN_NOT_SUPPORTED;
23312 	}
23313 
23314 	/* expand the default VM space to the largest possible address */
23315 	vm_map_set_jumbo(map);
23316 
23317 	assert3u(4 * GiB(10), <=, vm_map_max(map) - default_end);
23318 	data_range = vm_map_range_random_uniform(GiB(10),
23319 	    default_end + PAGE_SIZE, vm_map_max(map), offmask);
23320 
23321 #endif /* !XNU_PLATFORM_MacOSX */
23322 
23323 	/*
23324 	 * Poke holes so that ASAN or people listing regions
23325 	 * do not think this space is free.
23326 	 */
23327 
23328 	if (default_end != data_range.min_address) {
23329 		kr = vm_map_enter(map, &default_end,
23330 		    data_range.min_address - default_end,
23331 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
23332 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
23333 		assert(kr == KERN_SUCCESS);
23334 	}
23335 
23336 	if (data_range.max_address != vm_map_max(map)) {
23337 		vm_map_entry_t entry;
23338 		vm_size_t size;
23339 
23340 		vm_map_lock_read(map);
23341 		vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
23342 		if (entry != vm_map_to_entry(map)) {
23343 			size = vm_map_max(map) - data_range.max_address;
23344 		} else {
23345 			size = entry->vme_start - data_range.max_address;
23346 		}
23347 		vm_map_unlock_read(map);
23348 
23349 		kr = vm_map_enter(map, &data_range.max_address, size,
23350 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
23351 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
23352 		assert(kr == KERN_SUCCESS);
23353 	}
23354 
23355 	vm_map_lock(map);
23356 	map->default_range.min_address = vm_map_min(map);
23357 	map->default_range.max_address = default_end;
23358 	map->data_range = data_range;
23359 	map->uses_user_ranges = true;
23360 	vm_map_unlock(map);
23361 
23362 	return KERN_SUCCESS;
23363 }
23364 
23365 /*
23366  * vm_map_range_fork:
23367  *	clones the array of ranges from old_map to new_map in support
23368  *  of a VM map fork.
23369  */
23370 void
vm_map_range_fork(vm_map_t new_map,vm_map_t old_map)23371 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
23372 {
23373 	if (!old_map->uses_user_ranges) {
23374 		/* nothing to do */
23375 		return;
23376 	}
23377 
23378 	new_map->default_range = old_map->default_range;
23379 	new_map->data_range = old_map->data_range;
23380 
23381 	if (old_map->extra_ranges_count) {
23382 		vm_map_user_range_t otable, ntable;
23383 		uint16_t count;
23384 
23385 		otable = old_map->extra_ranges;
23386 		count  = old_map->extra_ranges_count;
23387 		ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
23388 		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
23389 		memcpy(ntable, otable,
23390 		    count * sizeof(struct vm_map_user_range));
23391 
23392 		new_map->extra_ranges_count = count;
23393 		new_map->extra_ranges = ntable;
23394 	}
23395 
23396 	new_map->uses_user_ranges = true;
23397 }
23398 
23399 /*
23400  * vm_map_get_user_range:
23401  *	copy the VM user range for the given VM map and range ID.
23402  */
23403 kern_return_t
vm_map_get_user_range(vm_map_t map,vm_map_range_id_t range_id,mach_vm_range_t range)23404 vm_map_get_user_range(
23405 	vm_map_t                map,
23406 	vm_map_range_id_t       range_id,
23407 	mach_vm_range_t         range)
23408 {
23409 	if (map == NULL || !map->uses_user_ranges || range == NULL) {
23410 		return KERN_INVALID_ARGUMENT;
23411 	}
23412 
23413 	switch (range_id) {
23414 	case UMEM_RANGE_ID_DEFAULT:
23415 		*range = map->default_range;
23416 		return KERN_SUCCESS;
23417 
23418 	case UMEM_RANGE_ID_HEAP:
23419 		*range = map->data_range;
23420 		return KERN_SUCCESS;
23421 
23422 	default:
23423 		return KERN_INVALID_ARGUMENT;
23424 	}
23425 }
23426 
23427 static vm_map_range_id_t
vm_map_user_range_resolve(vm_map_t map,mach_vm_address_t addr,mach_vm_size_t size,mach_vm_range_t range)23428 vm_map_user_range_resolve(
23429 	vm_map_t                map,
23430 	mach_vm_address_t       addr,
23431 	mach_vm_size_t          size,
23432 	mach_vm_range_t         range)
23433 {
23434 	struct mach_vm_range tmp;
23435 
23436 	vm_map_lock_assert_held(map);
23437 
23438 	static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
23439 	static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23440 
23441 	if (mach_vm_range_contains(&map->default_range, addr, size)) {
23442 		if (range) {
23443 			*range = map->default_range;
23444 		}
23445 		return UMEM_RANGE_ID_DEFAULT;
23446 	}
23447 
23448 	if (mach_vm_range_contains(&map->data_range, addr, size)) {
23449 		if (range) {
23450 			*range = map->data_range;
23451 		}
23452 		return UMEM_RANGE_ID_HEAP;
23453 	}
23454 
23455 	for (size_t i = 0; i < map->extra_ranges_count; i++) {
23456 		vm_map_user_range_t r = &map->extra_ranges[i];
23457 
23458 		tmp.min_address = r->vmur_min_address;
23459 		tmp.max_address = r->vmur_max_address;
23460 
23461 		if (mach_vm_range_contains(&tmp, addr, size)) {
23462 			if (range) {
23463 				*range = tmp;
23464 			}
23465 			return r->vmur_range_id;
23466 		}
23467 	}
23468 
23469 	if (range) {
23470 		range->min_address = range->max_address = 0;
23471 	}
23472 	return UMEM_RANGE_ID_DEFAULT;
23473 }
23474 
23475 static int
vm_map_user_range_cmp(const void * e1,const void * e2)23476 vm_map_user_range_cmp(const void *e1, const void *e2)
23477 {
23478 	const struct vm_map_user_range *r1 = e1;
23479 	const struct vm_map_user_range *r2 = e2;
23480 
23481 	if (r1->vmur_min_address != r2->vmur_min_address) {
23482 		return r1->vmur_min_address < r2->vmur_min_address ? -1 : 1;
23483 	}
23484 
23485 	return 0;
23486 }
23487 
23488 static int
mach_vm_range_recipe_v1_cmp(const void * e1,const void * e2)23489 mach_vm_range_recipe_v1_cmp(const void *e1, const void *e2)
23490 {
23491 	const mach_vm_range_recipe_v1_t *r1 = e1;
23492 	const mach_vm_range_recipe_v1_t *r2 = e2;
23493 
23494 	if (r1->range.min_address != r2->range.min_address) {
23495 		return r1->range.min_address < r2->range.min_address ? -1 : 1;
23496 	}
23497 
23498 	return 0;
23499 }
23500 
23501 /*!
23502  * @function mach_vm_range_create_v1()
23503  *
23504  * @brief
23505  * Handle the backend for mach_vm_range_create() for the
23506  * MACH_VM_RANGE_FLAVOR_V1 flavor.
23507  *
23508  * @description
23509  * This call allows to create "ranges" in the map of a task
23510  * that have special semantics/policies around placement of
23511  * new allocations (in the vm_map_locate_space() sense).
23512  *
23513  * @returns
23514  * - KERN_SUCCESS on success
23515  * - KERN_INVALID_ARGUMENT for incorrect arguments
23516  * - KERN_NO_SPACE if the maximum amount of ranges would be exceeded
23517  * - KERN_MEMORY_PRESENT if any of the requested ranges
23518  *   overlaps with existing ranges or allocations in the map.
23519  */
23520 static kern_return_t
mach_vm_range_create_v1(vm_map_t map,mach_vm_range_recipe_v1_t * recipe,uint32_t new_count)23521 mach_vm_range_create_v1(
23522 	vm_map_t                map,
23523 	mach_vm_range_recipe_v1_t *recipe,
23524 	uint32_t                new_count)
23525 {
23526 	const vm_offset_t mask = VM_MAP_PAGE_MASK(map);
23527 	vm_map_user_range_t table;
23528 	kern_return_t kr = KERN_SUCCESS;
23529 	uint16_t count;
23530 
23531 	struct mach_vm_range void1 = {
23532 		.min_address = map->default_range.max_address,
23533 		.max_address = map->data_range.min_address,
23534 	};
23535 	struct mach_vm_range void2 = {
23536 		.min_address = map->data_range.max_address,
23537 		.max_address = vm_map_max(map),
23538 	};
23539 
23540 	qsort(recipe, new_count, sizeof(mach_vm_range_recipe_v1_t),
23541 	    mach_vm_range_recipe_v1_cmp);
23542 
23543 	/*
23544 	 * Step 1: Validate that the recipes have no intersections.
23545 	 */
23546 
23547 	for (size_t i = 0; i < new_count; i++) {
23548 		mach_vm_range_t r = &recipe[i].range;
23549 		mach_vm_size_t s = mach_vm_range_size(r);
23550 
23551 		if (recipe[i].flags) {
23552 			return KERN_INVALID_ARGUMENT;
23553 		}
23554 
23555 		static_assert(UMEM_RANGE_ID_FIXED == MACH_VM_RANGE_FIXED);
23556 		switch (recipe[i].range_tag) {
23557 		case MACH_VM_RANGE_FIXED:
23558 			break;
23559 		default:
23560 			return KERN_INVALID_ARGUMENT;
23561 		}
23562 
23563 		if (!VM_MAP_PAGE_ALIGNED(r->min_address, mask) ||
23564 		    !VM_MAP_PAGE_ALIGNED(r->max_address, mask)) {
23565 			return KERN_INVALID_ARGUMENT;
23566 		}
23567 
23568 		if (!mach_vm_range_contains(&void1, r->min_address, s) &&
23569 		    !mach_vm_range_contains(&void2, r->min_address, s)) {
23570 			return KERN_INVALID_ARGUMENT;
23571 		}
23572 
23573 		if (i > 0 && recipe[i - 1].range.max_address >
23574 		    recipe[i].range.min_address) {
23575 			return KERN_INVALID_ARGUMENT;
23576 		}
23577 	}
23578 
23579 	vm_map_lock(map);
23580 
23581 	table = map->extra_ranges;
23582 	count = map->extra_ranges_count;
23583 
23584 	if (count + new_count > VM_MAP_EXTRA_RANGES_MAX) {
23585 		kr = KERN_NO_SPACE;
23586 		goto out_unlock;
23587 	}
23588 
23589 	/*
23590 	 * Step 2: Check that there is no intersection with existing ranges.
23591 	 */
23592 
23593 	for (size_t i = 0, j = 0; i < new_count && j < count;) {
23594 		mach_vm_range_t     r1 = &recipe[i].range;
23595 		vm_map_user_range_t r2 = &table[j];
23596 
23597 		if (r1->max_address <= r2->vmur_min_address) {
23598 			i++;
23599 		} else if (r2->vmur_max_address <= r1->min_address) {
23600 			j++;
23601 		} else {
23602 			kr = KERN_MEMORY_PRESENT;
23603 			goto out_unlock;
23604 		}
23605 	}
23606 
23607 	/*
23608 	 * Step 4: commit the new ranges.
23609 	 */
23610 
23611 	static_assert(VM_MAP_EXTRA_RANGES_MAX * sizeof(struct vm_map_user_range) <=
23612 	    KALLOC_SAFE_ALLOC_SIZE);
23613 
23614 	table = krealloc_data(table,
23615 	    count * sizeof(struct vm_map_user_range),
23616 	    (count + new_count) * sizeof(struct vm_map_user_range),
23617 	    Z_ZERO | Z_WAITOK | Z_NOFAIL);
23618 
23619 	for (size_t i = 0; i < new_count; i++) {
23620 		static_assert(MACH_VM_MAX_ADDRESS < (1ull << 56));
23621 
23622 		table[count + i] = (struct vm_map_user_range){
23623 			.vmur_min_address = recipe[i].range.min_address,
23624 			.vmur_max_address = recipe[i].range.max_address,
23625 			.vmur_range_id    = (vm_map_range_id_t)recipe[i].range_tag,
23626 		};
23627 	}
23628 
23629 	qsort(table, count + new_count,
23630 	    sizeof(struct vm_map_user_range), vm_map_user_range_cmp);
23631 
23632 	map->extra_ranges_count += new_count;
23633 	map->extra_ranges = table;
23634 
23635 out_unlock:
23636 	vm_map_unlock(map);
23637 
23638 	if (kr == KERN_SUCCESS) {
23639 		for (size_t i = 0; i < new_count; i++) {
23640 			vm_map_kernel_flags_t vmk_flags = {
23641 				.vmf_fixed = true,
23642 				.vmf_overwrite = true,
23643 				.vmkf_overwrite_immutable = true,
23644 				.vm_tag = recipe[i].vm_tag,
23645 			};
23646 			__assert_only kern_return_t kr2;
23647 
23648 			kr2 = vm_map_enter(map, &recipe[i].range.min_address,
23649 			    mach_vm_range_size(&recipe[i].range),
23650 			    0, vmk_flags, VM_OBJECT_NULL, 0, FALSE,
23651 			    VM_PROT_NONE, VM_PROT_ALL,
23652 			    VM_INHERIT_DEFAULT);
23653 			assert(kr2 == KERN_SUCCESS);
23654 		}
23655 	}
23656 	return kr;
23657 }
23658 
23659 kern_return_t
mach_vm_range_create(vm_map_t map,mach_vm_range_flavor_t flavor,mach_vm_range_recipes_raw_t recipe,natural_t size)23660 mach_vm_range_create(
23661 	vm_map_t                map,
23662 	mach_vm_range_flavor_t  flavor,
23663 	mach_vm_range_recipes_raw_t recipe,
23664 	natural_t               size)
23665 {
23666 	if (map != current_map()) {
23667 		return KERN_INVALID_ARGUMENT;
23668 	}
23669 
23670 	if (!map->uses_user_ranges) {
23671 		return KERN_NOT_SUPPORTED;
23672 	}
23673 
23674 	if (size == 0) {
23675 		return KERN_SUCCESS;
23676 	}
23677 
23678 	if (flavor == MACH_VM_RANGE_FLAVOR_V1) {
23679 		mach_vm_range_recipe_v1_t *array;
23680 
23681 		if (size % sizeof(mach_vm_range_recipe_v1_t)) {
23682 			return KERN_INVALID_ARGUMENT;
23683 		}
23684 
23685 		size /= sizeof(mach_vm_range_recipe_v1_t);
23686 		if (size > VM_MAP_EXTRA_RANGES_MAX) {
23687 			return KERN_NO_SPACE;
23688 		}
23689 
23690 		array = (mach_vm_range_recipe_v1_t *)recipe;
23691 		return mach_vm_range_create_v1(map, array, size);
23692 	}
23693 
23694 	return KERN_INVALID_ARGUMENT;
23695 }
23696 
23697 #else /* !CONFIG_MAP_RANGES */
23698 
/*
 * mach_vm_range_create:
 *	stub used when CONFIG_MAP_RANGES is disabled; user VM ranges are
 *	not available on this configuration, so the call always fails.
 */
kern_return_t
mach_vm_range_create(
	vm_map_t                map,
	mach_vm_range_flavor_t  flavor,
	mach_vm_range_recipes_raw_t recipe,
	natural_t               size)
{
#pragma unused(map, flavor, recipe, size)
	return KERN_NOT_SUPPORTED;
}
23709 
23710 #endif /* !CONFIG_MAP_RANGES */
23711 
/*
 * vm_map_kernel_flags_update_range_id:
 *	pick a default range ID for an allocation when the caller did not
 *	specify one explicitly.
 */
void
vm_map_kernel_flags_update_range_id(vm_map_kernel_flags_t *vmkf, vm_map_t map)
{
	if (map == kernel_map) {
		/* kernel allocations with no explicit range go to the data range */
		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
		}
#if CONFIG_MAP_RANGES
	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT &&
	    bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
		/*
		 * user allocations whose VM tag is registered in the heap
		 * bitmap are steered into the data/heap range
		 */
		vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
#endif /* CONFIG_MAP_RANGES */
	}
}
23727 
23728 /*
23729  * vm_map_entry_has_device_pager:
23730  * Check if the vm map entry specified by the virtual address has a device pager.
23731  * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
23732  */
23733 boolean_t
vm_map_entry_has_device_pager(vm_map_t map,vm_map_offset_t vaddr)23734 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
23735 {
23736 	vm_map_entry_t entry;
23737 	vm_object_t object;
23738 	boolean_t result;
23739 
23740 	if (map == NULL) {
23741 		return FALSE;
23742 	}
23743 
23744 	vm_map_lock(map);
23745 	while (TRUE) {
23746 		if (!vm_map_lookup_entry(map, vaddr, &entry)) {
23747 			result = FALSE;
23748 			break;
23749 		}
23750 		if (entry->is_sub_map) {
23751 			// Check the submap
23752 			vm_map_t submap = VME_SUBMAP(entry);
23753 			assert(submap != NULL);
23754 			vm_map_lock(submap);
23755 			vm_map_unlock(map);
23756 			map = submap;
23757 			continue;
23758 		}
23759 		object = VME_OBJECT(entry);
23760 		if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
23761 			result = TRUE;
23762 			break;
23763 		}
23764 		result = FALSE;
23765 		break;
23766 	}
23767 
23768 	vm_map_unlock(map);
23769 	return result;
23770 }
23771 
23772 
23773 #if MACH_ASSERT
23774 
extern int pmap_ledgers_panic;
extern int pmap_ledgers_panic_leeway;

/*
 * Expands to the six drift counters kept per ledger: how many checked
 * pmaps were over (positive balance) / under (negative balance), the
 * running totals, and the worst single-pmap excursion in each direction.
 */
#define LEDGER_DRIFT(__LEDGER)                    \
	int             __LEDGER##_over;          \
	ledger_amount_t __LEDGER##_over_total;    \
	ledger_amount_t __LEDGER##_over_max;      \
	int             __LEDGER##_under;         \
	ledger_amount_t __LEDGER##_under_total;   \
	ledger_amount_t __LEDGER##_under_max

/*
 * Global accumulator of ledger imbalances observed by
 * vm_map_pmap_check_ledgers() (MACH_ASSERT builds only).
 */
struct {
	uint64_t        num_pmaps_checked;

	LEDGER_DRIFT(phys_footprint);
	LEDGER_DRIFT(internal);
	LEDGER_DRIFT(internal_compressed);
	LEDGER_DRIFT(external);
	LEDGER_DRIFT(reusable);
	LEDGER_DRIFT(iokit_mapped);
	LEDGER_DRIFT(alternate_accounting);
	LEDGER_DRIFT(alternate_accounting_compressed);
	LEDGER_DRIFT(page_table);
	LEDGER_DRIFT(purgeable_volatile);
	LEDGER_DRIFT(purgeable_nonvolatile);
	LEDGER_DRIFT(purgeable_volatile_compressed);
	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
	LEDGER_DRIFT(tagged_nofootprint);
	LEDGER_DRIFT(tagged_footprint);
	LEDGER_DRIFT(tagged_nofootprint_compressed);
	LEDGER_DRIFT(tagged_footprint_compressed);
	LEDGER_DRIFT(network_volatile);
	LEDGER_DRIFT(network_nonvolatile);
	LEDGER_DRIFT(network_volatile_compressed);
	LEDGER_DRIFT(network_nonvolatile_compressed);
	LEDGER_DRIFT(media_nofootprint);
	LEDGER_DRIFT(media_footprint);
	LEDGER_DRIFT(media_nofootprint_compressed);
	LEDGER_DRIFT(media_footprint_compressed);
	LEDGER_DRIFT(graphics_nofootprint);
	LEDGER_DRIFT(graphics_footprint);
	LEDGER_DRIFT(graphics_nofootprint_compressed);
	LEDGER_DRIFT(graphics_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint);
	LEDGER_DRIFT(neural_footprint);
	LEDGER_DRIFT(neural_nofootprint_compressed);
	LEDGER_DRIFT(neural_footprint_compressed);
} pmap_ledgers_drift;
23823 
/*
 * vm_map_pmap_check_ledgers:
 *	at pmap teardown, verify that every task ledger balances back to
 *	zero; log each non-zero balance, fold it into pmap_ledgers_drift,
 *	and panic (or just print) depending on pmap_ledgers_panic and the
 *	configured leeway.
 */
void
vm_map_pmap_check_ledgers(
	pmap_t          pmap,
	ledger_t        ledger,
	int             pid,
	char            *procname)
{
	ledger_amount_t bal;
	boolean_t       do_panic;

	do_panic = FALSE;

	pmap_ledgers_drift.num_pmaps_checked++;

/*
 * Fetch __LEDGER's balance; if non-zero, decide whether it warrants a
 * panic (honoring per-ledger panic_on_negative and the global leeway),
 * log it, and record the drift statistics.
 */
#define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
MACRO_BEGIN                                                             \
	int panic_on_negative = TRUE;                                   \
	ledger_get_balance(ledger,                                      \
	                   task_ledgers.__LEDGER,                       \
	                   &bal);                                       \
	ledger_get_panic_on_negative(ledger,                            \
	                             task_ledgers.__LEDGER,             \
	                             &panic_on_negative);               \
	if (bal != 0) {                                                 \
	        if (panic_on_negative ||                                \
	            (pmap_ledgers_panic &&                              \
	             pmap_ledgers_panic_leeway > 0 &&                   \
	             (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
	              bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
	                do_panic = TRUE;                                \
	        }                                                       \
	        printf("LEDGER BALANCE proc %d (%s) "                   \
	               "\"%s\" = %lld\n",                               \
	               pid, procname, #__LEDGER, bal);                  \
	        if (bal > 0) {                                          \
	                pmap_ledgers_drift.__LEDGER##_over++;           \
	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
	                }                                               \
	        } else if (bal < 0) {                                   \
	                pmap_ledgers_drift.__LEDGER##_under++;          \
	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
	                }                                               \
	        }                                                       \
	}                                                               \
MACRO_END

	/* check every ledger tracked in pmap_ledgers_drift */
	LEDGER_CHECK_BALANCE(phys_footprint);
	LEDGER_CHECK_BALANCE(internal);
	LEDGER_CHECK_BALANCE(internal_compressed);
	LEDGER_CHECK_BALANCE(external);
	LEDGER_CHECK_BALANCE(reusable);
	LEDGER_CHECK_BALANCE(iokit_mapped);
	LEDGER_CHECK_BALANCE(alternate_accounting);
	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
	LEDGER_CHECK_BALANCE(page_table);
	LEDGER_CHECK_BALANCE(purgeable_volatile);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(tagged_nofootprint);
	LEDGER_CHECK_BALANCE(tagged_footprint);
	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
	LEDGER_CHECK_BALANCE(network_volatile);
	LEDGER_CHECK_BALANCE(network_nonvolatile);
	LEDGER_CHECK_BALANCE(network_volatile_compressed);
	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(media_nofootprint);
	LEDGER_CHECK_BALANCE(media_footprint);
	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(media_footprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_nofootprint);
	LEDGER_CHECK_BALANCE(graphics_footprint);
	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint);
	LEDGER_CHECK_BALANCE(neural_footprint);
	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(neural_footprint_compressed);

	if (do_panic) {
		if (pmap_ledgers_panic) {
			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
			    pmap, pid, procname);
		} else {
			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
			    pmap, pid, procname);
		}
	}
}
23918 
/*
 * vm_map_pmap_set_process:
 *	forward the owning process identity (pid + name) to the map's pmap
 *	for diagnostics (MACH_ASSERT builds only).
 */
void
vm_map_pmap_set_process(
	vm_map_t map,
	int pid,
	char *procname)
{
	pmap_set_process(vm_map_pmap(map), pid, procname);
}
23927 
23928 #endif /* MACH_ASSERT */
23929