xref: /xnu-8792.41.9/osfmk/vm/vm_map.c (revision 5c2921b07a2480ab43ec66f5b9e41cb872bc554f)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_map.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Virtual memory mapping module.
64  */
65 
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68 
69 #include <vm/vm_options.h>
70 
71 #include <libkern/OSAtomic.h>
72 
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83 
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 
91 #include <vm/cpm.h>
92 #include <vm/vm_compressor.h>
93 #include <vm/vm_compressor_pager.h>
94 #include <vm/vm_init.h>
95 #include <vm/vm_fault.h>
96 #include <vm/vm_map_internal.h>
97 #include <vm/vm_object.h>
98 #include <vm/vm_page.h>
99 #include <vm/vm_pageout.h>
100 #include <vm/pmap.h>
101 #include <vm/vm_kern.h>
102 #include <ipc/ipc_port.h>
103 #include <kern/sched_prim.h>
104 #include <kern/misc_protos.h>
105 
106 #include <mach/vm_map_server.h>
107 #include <mach/mach_host_server.h>
108 #include <vm/vm_protos.h>
109 #include <vm/vm_purgeable_internal.h>
110 #include <vm/vm_reclaim_internal.h>
111 
112 #include <vm/vm_protos.h>
113 #include <vm/vm_shared_region.h>
114 #include <vm/vm_map_store.h>
115 
116 #include <san/kasan.h>
117 
118 #include <sys/resource.h>
119 #include <sys/codesign.h>
120 #include <sys/code_signing.h>
121 #include <sys/mman.h>
122 #include <sys/reboot.h>
123 #include <sys/kdebug_triage.h>
124 
125 #include <libkern/section_keywords.h>
126 
127 #if DEVELOPMENT || DEBUG
128 extern int proc_selfcsflags(void);
129 int panic_on_unsigned_execute = 0;
130 int panic_on_mlock_failure = 0;
131 #endif /* DEVELOPMENT || DEBUG */
132 
133 #if MACH_ASSERT
134 int debug4k_filter = 0;
135 char debug4k_proc_name[1024] = "";
136 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
137 int debug4k_panic_on_misaligned_sharing = 0;
138 const char *debug4k_category_name[] = {
139 	"error",        /* 0 */
140 	"life",         /* 1 */
141 	"load",         /* 2 */
142 	"fault",        /* 3 */
143 	"copy",         /* 4 */
144 	"share",        /* 5 */
145 	"adjust",       /* 6 */
146 	"pmap",         /* 7 */
147 	"mementry",     /* 8 */
148 	"iokit",        /* 9 */
149 	"upl",          /* 10 */
150 	"exc",          /* 11 */
151 	"vfs"           /* 12 */
152 };
153 #endif /* MACH_ASSERT */
154 int debug4k_no_cow_copyin = 0;
155 
156 
157 #if __arm64__
158 extern const int fourk_binary_compatibility_unsafe;
159 extern const int fourk_binary_compatibility_allow_wx;
160 #endif /* __arm64__ */
161 extern int proc_selfpid(void);
162 extern char *proc_name_address(void *p);
163 
164 #if VM_MAP_DEBUG_APPLE_PROTECT
165 int vm_map_debug_apple_protect = 0;
166 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
167 #if VM_MAP_DEBUG_FOURK
168 int vm_map_debug_fourk = 0;
169 #endif /* VM_MAP_DEBUG_FOURK */
170 
171 #if DEBUG || DEVELOPMENT
172 static TUNABLE(bool, vm_map_executable_immutable,
173     "vm_map_executable_immutable", true);
174 #else
175 #define vm_map_executable_immutable true
176 #endif
177 
178 #if CONFIG_MAP_RANGES
179 static TUNABLE(bool, vm_map_user_ranges, "vm_map_user_ranges", true);
180 static SECURITY_READ_ONLY_LATE(uint8_t) vm_map_range_id_map[VM_MEMORY_COUNT];
181 #endif
182 
183 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
184 
185 extern u_int32_t random(void);  /* from <libkern/libkern.h> */
186 /* Internal prototypes
187  */
188 
189 typedef struct vm_map_zap {
190 	vm_map_entry_t          vmz_head;
191 	vm_map_entry_t         *vmz_tail;
192 } *vm_map_zap_t;
193 
194 #define VM_MAP_ZAP_DECLARE(zap) \
195 	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
196 
197 static vm_map_entry_t   vm_map_entry_insert(
198 	vm_map_t                map,
199 	vm_map_entry_t          insp_entry,
200 	vm_map_offset_t         start,
201 	vm_map_offset_t         end,
202 	vm_object_t             object,
203 	vm_object_offset_t      offset,
204 	vm_map_kernel_flags_t   vmk_flags,
205 	boolean_t               needs_copy,
206 	vm_prot_t               cur_protection,
207 	vm_prot_t               max_protection,
208 	vm_inherit_t            inheritance,
209 	boolean_t               no_cache,
210 	boolean_t               permanent,
211 	unsigned int            superpage_size,
212 	boolean_t               clear_map_aligned,
213 	int                     alias);
214 
215 static void vm_map_simplify_range(
216 	vm_map_t        map,
217 	vm_map_offset_t start,
218 	vm_map_offset_t end);   /* forward */
219 
220 static boolean_t        vm_map_range_check(
221 	vm_map_t        map,
222 	vm_map_offset_t start,
223 	vm_map_offset_t end,
224 	vm_map_entry_t  *entry);
225 
226 static void vm_map_submap_pmap_clean(
227 	vm_map_t        map,
228 	vm_map_offset_t start,
229 	vm_map_offset_t end,
230 	vm_map_t        sub_map,
231 	vm_map_offset_t offset);
232 
233 static void             vm_map_pmap_enter(
234 	vm_map_t                map,
235 	vm_map_offset_t         addr,
236 	vm_map_offset_t         end_addr,
237 	vm_object_t             object,
238 	vm_object_offset_t      offset,
239 	vm_prot_t               protection);
240 
241 static void             _vm_map_clip_end(
242 	struct vm_map_header    *map_header,
243 	vm_map_entry_t          entry,
244 	vm_map_offset_t         end);
245 
246 static void             _vm_map_clip_start(
247 	struct vm_map_header    *map_header,
248 	vm_map_entry_t          entry,
249 	vm_map_offset_t         start);
250 
251 static kmem_return_t vm_map_delete(
252 	vm_map_t        map,
253 	vm_map_offset_t start,
254 	vm_map_offset_t end,
255 	vmr_flags_t     flags,
256 	kmem_guard_t    guard,
257 	vm_map_zap_t    zap);
258 
259 static void             vm_map_copy_insert(
260 	vm_map_t        map,
261 	vm_map_entry_t  after_where,
262 	vm_map_copy_t   copy);
263 
264 static kern_return_t    vm_map_copy_overwrite_unaligned(
265 	vm_map_t        dst_map,
266 	vm_map_entry_t  entry,
267 	vm_map_copy_t   copy,
268 	vm_map_address_t start,
269 	boolean_t       discard_on_success);
270 
271 static kern_return_t    vm_map_copy_overwrite_aligned(
272 	vm_map_t        dst_map,
273 	vm_map_entry_t  tmp_entry,
274 	vm_map_copy_t   copy,
275 	vm_map_offset_t start,
276 	pmap_t          pmap);
277 
278 static kern_return_t    vm_map_copyin_kernel_buffer(
279 	vm_map_t        src_map,
280 	vm_map_address_t src_addr,
281 	vm_map_size_t   len,
282 	boolean_t       src_destroy,
283 	vm_map_copy_t   *copy_result);  /* OUT */
284 
285 static kern_return_t    vm_map_copyout_kernel_buffer(
286 	vm_map_t        map,
287 	vm_map_address_t *addr, /* IN/OUT */
288 	vm_map_copy_t   copy,
289 	vm_map_size_t   copy_size,
290 	boolean_t       overwrite,
291 	boolean_t       consume_on_success);
292 
293 static void             vm_map_fork_share(
294 	vm_map_t        old_map,
295 	vm_map_entry_t  old_entry,
296 	vm_map_t        new_map);
297 
298 static boolean_t        vm_map_fork_copy(
299 	vm_map_t        old_map,
300 	vm_map_entry_t  *old_entry_p,
301 	vm_map_t        new_map,
302 	int             vm_map_copyin_flags);
303 
304 static kern_return_t    vm_map_wire_nested(
305 	vm_map_t                   map,
306 	vm_map_offset_t            start,
307 	vm_map_offset_t            end,
308 	vm_prot_t                  caller_prot,
309 	vm_tag_t                   tag,
310 	boolean_t                  user_wire,
311 	pmap_t                     map_pmap,
312 	vm_map_offset_t            pmap_addr,
313 	ppnum_t                    *physpage_p);
314 
315 static kern_return_t    vm_map_unwire_nested(
316 	vm_map_t                   map,
317 	vm_map_offset_t            start,
318 	vm_map_offset_t            end,
319 	boolean_t                  user_wire,
320 	pmap_t                     map_pmap,
321 	vm_map_offset_t            pmap_addr);
322 
323 static kern_return_t    vm_map_overwrite_submap_recurse(
324 	vm_map_t                   dst_map,
325 	vm_map_offset_t            dst_addr,
326 	vm_map_size_t              dst_size);
327 
328 static kern_return_t    vm_map_copy_overwrite_nested(
329 	vm_map_t                   dst_map,
330 	vm_map_offset_t            dst_addr,
331 	vm_map_copy_t              copy,
332 	boolean_t                  interruptible,
333 	pmap_t                     pmap,
334 	boolean_t                  discard_on_success);
335 
336 static kern_return_t    vm_map_remap_extract(
337 	vm_map_t                map,
338 	vm_map_offset_t         addr,
339 	vm_map_size_t           size,
340 	boolean_t               copy,
341 	struct vm_map_header    *map_header,
342 	vm_prot_t               *cur_protection,
343 	vm_prot_t               *max_protection,
344 	vm_inherit_t            inheritance,
345 	vm_map_kernel_flags_t   vmk_flags);
346 
347 static kern_return_t    vm_map_remap_range_allocate(
348 	vm_map_t                map,
349 	vm_map_address_t        *address,
350 	vm_map_size_t           size,
351 	vm_map_offset_t         mask,
352 	int                     flags,
353 	vm_map_kernel_flags_t   vmk_flags,
354 	vm_tag_t                tag,
355 	vm_map_entry_t          *map_entry,
356 	vm_map_zap_t            zap_list);
357 
358 static void             vm_map_region_look_for_page(
359 	vm_map_t                   map,
360 	vm_map_offset_t            va,
361 	vm_object_t                object,
362 	vm_object_offset_t         offset,
363 	int                        max_refcnt,
364 	unsigned short             depth,
365 	vm_region_extended_info_t  extended,
366 	mach_msg_type_number_t count);
367 
368 static int              vm_map_region_count_obj_refs(
369 	vm_map_entry_t             entry,
370 	vm_object_t                object);
371 
372 
373 static kern_return_t    vm_map_willneed(
374 	vm_map_t        map,
375 	vm_map_offset_t start,
376 	vm_map_offset_t end);
377 
378 static kern_return_t    vm_map_reuse_pages(
379 	vm_map_t        map,
380 	vm_map_offset_t start,
381 	vm_map_offset_t end);
382 
383 static kern_return_t    vm_map_reusable_pages(
384 	vm_map_t        map,
385 	vm_map_offset_t start,
386 	vm_map_offset_t end);
387 
388 static kern_return_t    vm_map_can_reuse(
389 	vm_map_t        map,
390 	vm_map_offset_t start,
391 	vm_map_offset_t end);
392 
393 #if MACH_ASSERT
394 static kern_return_t    vm_map_pageout(
395 	vm_map_t        map,
396 	vm_map_offset_t start,
397 	vm_map_offset_t end);
398 #endif /* MACH_ASSERT */
399 
400 kern_return_t vm_map_corpse_footprint_collect(
401 	vm_map_t        old_map,
402 	vm_map_entry_t  old_entry,
403 	vm_map_t        new_map);
404 void vm_map_corpse_footprint_collect_done(
405 	vm_map_t        new_map);
406 void vm_map_corpse_footprint_destroy(
407 	vm_map_t        map);
408 kern_return_t vm_map_corpse_footprint_query_page_info(
409 	vm_map_t        map,
410 	vm_map_offset_t va,
411 	int             *disposition_p);
412 void vm_map_footprint_query_page_info(
413 	vm_map_t        map,
414 	vm_map_entry_t  map_entry,
415 	vm_map_offset_t curr_s_offset,
416 	int             *disposition_p);
417 
418 #if CONFIG_MAP_RANGES
419 static void vm_map_range_map_init(void);
420 #endif /* CONFIG_MAP_RANGES */
421 
422 pid_t find_largest_process_vm_map_entries(void);
423 
424 extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code,
425     mach_exception_data_type_t subcode);
426 
427 /*
428  * Macros to copy a vm_map_entry. We must be careful to correctly
429  * manage the wired page count. vm_map_entry_copy() creates a new
430  * map entry to the same memory - the wired count in the new entry
431  * must be set to zero. vm_map_entry_copy_full() creates a new
432  * entry that is identical to the old entry.  This preserves the
433  * wire count; it's used for map splitting and zone changing in
434  * vm_map_copyout.
435  */
436 
437 static inline void
vm_map_entry_copy_pmap_cs_assoc(vm_map_t map __unused,vm_map_entry_t new __unused,vm_map_entry_t old __unused)438 vm_map_entry_copy_pmap_cs_assoc(
439 	vm_map_t map __unused,
440 	vm_map_entry_t new __unused,
441 	vm_map_entry_t old __unused)
442 {
443 	/* when pmap_cs is not enabled, assert as a sanity check */
444 	assert(new->pmap_cs_associated == FALSE);
445 }
446 
447 /*
448  * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
449  * But for security reasons on some platforms, we don't want the
450  * new mapping to be "used for jit", so we reset the flag here.
451  */
452 static inline void
vm_map_entry_copy_code_signing(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old __unused)453 vm_map_entry_copy_code_signing(
454 	vm_map_t map,
455 	vm_map_entry_t new,
456 	vm_map_entry_t old __unused)
457 {
458 	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
459 		assert(new->used_for_jit == old->used_for_jit);
460 	} else {
461 		new->used_for_jit = FALSE;
462 	}
463 }
464 
/*
 * vm_map_entry_copy_full:
 *	Make NEW an exact copy of OLD, preserving the wired count
 *	(unlike vm_map_entry_copy(), which resets it).  Used for map
 *	splitting and zone changing in vm_map_copyout.
 */
static inline void
vm_map_entry_copy_full(
	vm_map_entry_t new,
	vm_map_entry_t old)
{
#if MAP_ENTRY_CREATION_DEBUG
	/*
	 * Drop NEW's existing backtrace reference and take one on OLD's;
	 * the struct assignment below then transfers the btref field itself.
	 */
	btref_put(new->vme_creation_bt);
	btref_retain(old->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(new->vme_insertion_bt);
	btref_retain(old->vme_insertion_bt);
#endif
	*new = *old;
}
480 
481 static inline void
vm_map_entry_copy(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old)482 vm_map_entry_copy(
483 	vm_map_t map,
484 	vm_map_entry_t new,
485 	vm_map_entry_t old)
486 {
487 	vm_map_entry_copy_full(new, old);
488 
489 	new->is_shared = FALSE;
490 	new->needs_wakeup = FALSE;
491 	new->in_transition = FALSE;
492 	new->wired_count = 0;
493 	new->user_wired_count = 0;
494 	new->vme_permanent = FALSE;
495 	vm_map_entry_copy_code_signing(map, new, old);
496 	vm_map_entry_copy_pmap_cs_assoc(map, new, old);
497 	if (new->iokit_acct) {
498 		assertf(!new->use_pmap, "old %p new %p\n", old, new);
499 		new->iokit_acct = FALSE;
500 		new->use_pmap = TRUE;
501 	}
502 	new->vme_resilient_codesign = FALSE;
503 	new->vme_resilient_media = FALSE;
504 	new->vme_atomic = FALSE;
505 	new->vme_no_copy_on_read = FALSE;
506 }
507 
508 /*
509  * Normal lock_read_to_write() returns FALSE/0 on failure.
510  * These functions evaluate to zero on success and non-zero value on failure.
511  */
512 __attribute__((always_inline))
513 int
vm_map_lock_read_to_write(vm_map_t map)514 vm_map_lock_read_to_write(vm_map_t map)
515 {
516 	if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
517 		DTRACE_VM(vm_map_lock_upgrade);
518 		return 0;
519 	}
520 	return 1;
521 }
522 
523 __attribute__((always_inline))
524 boolean_t
vm_map_try_lock(vm_map_t map)525 vm_map_try_lock(vm_map_t map)
526 {
527 	if (lck_rw_try_lock_exclusive(&(map)->lock)) {
528 		DTRACE_VM(vm_map_lock_w);
529 		return TRUE;
530 	}
531 	return FALSE;
532 }
533 
534 __attribute__((always_inline))
535 boolean_t
vm_map_try_lock_read(vm_map_t map)536 vm_map_try_lock_read(vm_map_t map)
537 {
538 	if (lck_rw_try_lock_shared(&(map)->lock)) {
539 		DTRACE_VM(vm_map_lock_r);
540 		return TRUE;
541 	}
542 	return FALSE;
543 }
544 
/*!
 * @function kdp_vm_map_is_acquired_exclusive
 *
 * @abstract
 * Checks if vm map is acquired exclusive.
 *
 * @discussion
 * NOT SAFE: To be used only by kernel debugger.
 *
 * @param map map to check
 *
 * @returns TRUE if the map is acquired exclusively.
 */
boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)
{
	/* defer to the kdp-safe rw-lock inspection routine */
	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
}
563 
564 /*
565  * Routines to get the page size the caller should
566  * use while inspecting the target address space.
567  * Use the "_safely" variant if the caller is dealing with a user-provided
568  * array whose size depends on the page size, to avoid any overflow or
569  * underflow of a user-allocated buffer.
570  */
571 int
vm_self_region_page_shift_safely(vm_map_t target_map)572 vm_self_region_page_shift_safely(
573 	vm_map_t target_map)
574 {
575 	int effective_page_shift = 0;
576 
577 	if (PAGE_SIZE == (4096)) {
578 		/* x86_64 and 4k watches: always use 4k */
579 		return PAGE_SHIFT;
580 	}
581 	/* did caller provide an explicit page size for this thread to use? */
582 	effective_page_shift = thread_self_region_page_shift();
583 	if (effective_page_shift) {
584 		/* use the explicitly-provided page size */
585 		return effective_page_shift;
586 	}
587 	/* no explicit page size: use the caller's page size... */
588 	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
589 	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
590 		/* page size match: safe to use */
591 		return effective_page_shift;
592 	}
593 	/* page size mismatch */
594 	return -1;
595 }
596 int
vm_self_region_page_shift(vm_map_t target_map)597 vm_self_region_page_shift(
598 	vm_map_t target_map)
599 {
600 	int effective_page_shift;
601 
602 	effective_page_shift = vm_self_region_page_shift_safely(target_map);
603 	if (effective_page_shift == -1) {
604 		/* no safe value but OK to guess for caller */
605 		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
606 		    VM_MAP_PAGE_SHIFT(target_map));
607 	}
608 	return effective_page_shift;
609 }
610 
611 
612 /*
613  *	Decide if we want to allow processes to execute from their data or stack areas.
614  *	override_nx() returns true if we do.  Data/stack execution can be enabled independently
615  *	for 32 and 64 bit processes.  Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
616  *	or allow_stack_exec to enable data execution for that type of data area for that particular
617  *	ABI (or both by or'ing the flags together).  These are initialized in the architecture
618  *	specific pmap files since the default behavior varies according to architecture.  The
619  *	main reason it varies is because of the need to provide binary compatibility with old
620  *	applications that were written before these restrictions came into being.  In the old
621  *	days, an app could execute anything it could read, but this has slowly been tightened
622  *	up over time.  The default behavior is:
623  *
624  *	32-bit PPC apps		may execute from both stack and data areas
 *	32-bit Intel apps	may execute from data areas but not stack
626  *	64-bit PPC/Intel apps	may not execute from either data or stack
627  *
628  *	An application on any architecture may override these defaults by explicitly
629  *	adding PROT_EXEC permission to the page in question with the mprotect(2)
630  *	system call.  This code here just determines what happens when an app tries to
631  *      execute from a page that lacks execute permission.
632  *
633  *	Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
634  *	default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
635  *	a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
636  *	execution from data areas for a particular binary even if the arch normally permits it. As
637  *	a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
638  *	to support some complicated use cases, notably browsers with out-of-process plugins that
639  *	are not all NX-safe.
640  */
641 
642 extern int allow_data_exec, allow_stack_exec;
643 
644 int
override_nx(vm_map_t map,uint32_t user_tag)645 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
646 {
647 	int current_abi;
648 
649 	if (map->pmap == kernel_pmap) {
650 		return FALSE;
651 	}
652 
653 	/*
654 	 * Determine if the app is running in 32 or 64 bit mode.
655 	 */
656 
657 	if (vm_map_is_64bit(map)) {
658 		current_abi = VM_ABI_64;
659 	} else {
660 		current_abi = VM_ABI_32;
661 	}
662 
663 	/*
664 	 * Determine if we should allow the execution based on whether it's a
665 	 * stack or data area and the current architecture.
666 	 */
667 
668 	if (user_tag == VM_MEMORY_STACK) {
669 		return allow_stack_exec & current_abi;
670 	}
671 
672 	return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
673 }
674 
675 
676 /*
677  *	Virtual memory maps provide for the mapping, protection,
678  *	and sharing of virtual memory objects.  In addition,
679  *	this module provides for an efficient virtual copy of
680  *	memory from one map to another.
681  *
682  *	Synchronization is required prior to most operations.
683  *
684  *	Maps consist of an ordered doubly-linked list of simple
685  *	entries; a single hint is used to speed up lookups.
686  *
687  *	Sharing maps have been deleted from this version of Mach.
688  *	All shared objects are now mapped directly into the respective
689  *	maps.  This requires a change in the copy on write strategy;
690  *	the asymmetric (delayed) strategy is used for shared temporary
691  *	objects instead of the symmetric (shadow) strategy.  All maps
692  *	are now "top level" maps (either task map, kernel map or submap
693  *	of the kernel map).
694  *
 *	Since portions of maps are specified by start/end addresses,
696  *	which may not align with existing map entries, all
697  *	routines merely "clip" entries to these start/end values.
698  *	[That is, an entry is split into two, bordering at a
699  *	start or end value.]  Note that these clippings may not
700  *	always be necessary (as the two resulting entries are then
701  *	not changed); however, the clipping is done for convenience.
702  *	No attempt is currently made to "glue back together" two
703  *	abutting entries.
704  *
705  *	The symmetric (shadow) copy strategy implements virtual copy
706  *	by copying VM object references from one map to
707  *	another, and then marking both regions as copy-on-write.
708  *	It is important to note that only one writeable reference
709  *	to a VM object region exists in any map when this strategy
710  *	is used -- this means that shadow object creation can be
711  *	delayed until a write operation occurs.  The symmetric (delayed)
712  *	strategy allows multiple maps to have writeable references to
713  *	the same region of a vm object, and hence cannot delay creating
714  *	its copy objects.  See vm_object_copy_quickly() in vm_object.c.
715  *	Copying of permanent objects is completely different; see
716  *	vm_object_copy_strategically() in vm_object.c.
717  */
718 
719 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
720 
721 #define VM_MAP_ZONE_NAME "maps"
722 #define VM_MAP_ZFLAGS ( \
723 	ZC_NOENCRYPT | \
724 	ZC_VM_LP64)
725 
726 #define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
727 #define VM_MAP_ENTRY_ZFLAGS ( \
728 	ZC_NOENCRYPT | \
729 	ZC_CACHING | \
730 	ZC_KASAN_NOQUARANTINE | \
731 	ZC_VM_LP64)
732 
733 #define VM_MAP_HOLES_ZONE_NAME "VM map holes"
734 #define VM_MAP_HOLES_ZFLAGS ( \
735 	ZC_NOENCRYPT | \
736 	ZC_CACHING | \
737 	ZC_KASAN_NOQUARANTINE | \
738 	ZC_VM_LP64)
739 
/*
 * Asserts that a vm_map_copy object is coming from the
 * vm_map_copy_zone to ensure that it isn't a fake constructed
 * anywhere else.
 */
void
vm_map_copy_require(struct vm_map_copy *copy)
{
	/* panics if "copy" was not allocated from the vm_map_copy zone */
	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
}
750 
/*
 *	vm_map_require:
 *
 *	Ensures that the argument is memory allocated from the genuine
 *	vm map zone. (See zone_id_require_allow_foreign).
 */
void
vm_map_require(vm_map_t map)
{
	/* panics if "map" was not allocated from the vm_map zone */
	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
}
762 
763 #define VM_MAP_EARLY_COUNT_MAX         16
764 static __startup_data vm_offset_t      map_data;
765 static __startup_data vm_size_t        map_data_size;
766 static __startup_data vm_offset_t      kentry_data;
767 static __startup_data vm_size_t        kentry_data_size;
768 static __startup_data vm_offset_t      map_holes_data;
769 static __startup_data vm_size_t        map_holes_data_size;
770 static __startup_data vm_map_t        *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
771 static __startup_data uint32_t         early_map_count;
772 
773 #if XNU_TARGET_OS_OSX
774 #define         NO_COALESCE_LIMIT  ((1024 * 128) - 1)
775 #else /* XNU_TARGET_OS_OSX */
776 #define         NO_COALESCE_LIMIT  0
777 #endif /* XNU_TARGET_OS_OSX */
778 
779 /* Skip acquiring locks if we're in the midst of a kernel core dump */
780 unsigned int not_in_kdp = 1;
781 
782 unsigned int vm_map_set_cache_attr_count = 0;
783 
784 kern_return_t
vm_map_set_cache_attr(vm_map_t map,vm_map_offset_t va)785 vm_map_set_cache_attr(
786 	vm_map_t        map,
787 	vm_map_offset_t va)
788 {
789 	vm_map_entry_t  map_entry;
790 	vm_object_t     object;
791 	kern_return_t   kr = KERN_SUCCESS;
792 
793 	vm_map_lock_read(map);
794 
795 	if (!vm_map_lookup_entry(map, va, &map_entry) ||
796 	    map_entry->is_sub_map) {
797 		/*
798 		 * that memory is not properly mapped
799 		 */
800 		kr = KERN_INVALID_ARGUMENT;
801 		goto done;
802 	}
803 	object = VME_OBJECT(map_entry);
804 
805 	if (object == VM_OBJECT_NULL) {
806 		/*
807 		 * there should be a VM object here at this point
808 		 */
809 		kr = KERN_INVALID_ARGUMENT;
810 		goto done;
811 	}
812 	vm_object_lock(object);
813 	object->set_cache_attr = TRUE;
814 	vm_object_unlock(object);
815 
816 	vm_map_set_cache_attr_count++;
817 done:
818 	vm_map_unlock_read(map);
819 
820 	return kr;
821 }
822 
823 
824 #if CONFIG_CODE_DECRYPTION
825 /*
826  * vm_map_apple_protected:
827  * This remaps the requested part of the object with an object backed by
828  * the decrypting pager.
829  * crypt_info contains entry points and session data for the crypt module.
830  * The crypt_info block will be copied by vm_map_apple_protected. The data structures
831  * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
832  */
833 kern_return_t
vm_map_apple_protected(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_object_offset_t crypto_backing_offset,struct pager_crypt_info * crypt_info,uint32_t cryptid)834 vm_map_apple_protected(
835 	vm_map_t                map,
836 	vm_map_offset_t         start,
837 	vm_map_offset_t         end,
838 	vm_object_offset_t      crypto_backing_offset,
839 	struct pager_crypt_info *crypt_info,
840 	uint32_t                cryptid)
841 {
842 	boolean_t       map_locked;
843 	kern_return_t   kr;
844 	vm_map_entry_t  map_entry;
845 	struct vm_map_entry tmp_entry;
846 	memory_object_t unprotected_mem_obj;
847 	vm_object_t     protected_object;
848 	vm_map_offset_t map_addr;
849 	vm_map_offset_t start_aligned, end_aligned;
850 	vm_object_offset_t      crypto_start, crypto_end;
851 	int             vm_flags;
852 	vm_map_kernel_flags_t vmk_flags;
853 	boolean_t       cache_pager;
854 
855 	vm_flags = 0;
856 	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
857 
858 	map_locked = FALSE;
859 	unprotected_mem_obj = MEMORY_OBJECT_NULL;
860 
861 	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
862 	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
863 	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
864 	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
865 
866 #if __arm64__
867 	/*
868 	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
869 	 * so we might have to loop and establish up to 3 mappings:
870 	 *
871 	 * + the first 16K-page, which might overlap with the previous
872 	 *   4K-aligned mapping,
873 	 * + the center,
874 	 * + the last 16K-page, which might overlap with the next
875 	 *   4K-aligned mapping.
876 	 * Each of these mapping might be backed by a vnode pager (if
877 	 * properly page-aligned) or a "fourk_pager", itself backed by a
878 	 * vnode pager (if 4K-aligned but not page-aligned).
879 	 */
880 #endif /* __arm64__ */
881 
882 	map_addr = start_aligned;
883 	for (map_addr = start_aligned;
884 	    map_addr < end;
885 	    map_addr = tmp_entry.vme_end) {
886 		vm_map_lock(map);
887 		map_locked = TRUE;
888 
889 		/* lookup the protected VM object */
890 		if (!vm_map_lookup_entry(map,
891 		    map_addr,
892 		    &map_entry) ||
893 		    map_entry->is_sub_map ||
894 		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
895 			/* that memory is not properly mapped */
896 			kr = KERN_INVALID_ARGUMENT;
897 			goto done;
898 		}
899 
		/* ensure mapped memory is mapped as executable,
		 *  except for the model decryption flow */
902 		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
903 		    !(map_entry->protection & VM_PROT_EXECUTE)) {
904 			kr = KERN_INVALID_ARGUMENT;
905 			goto done;
906 		}
907 
908 		/* get the protected object to be decrypted */
909 		protected_object = VME_OBJECT(map_entry);
910 		if (protected_object == VM_OBJECT_NULL) {
911 			/* there should be a VM object here at this point */
912 			kr = KERN_INVALID_ARGUMENT;
913 			goto done;
914 		}
915 		/* ensure protected object stays alive while map is unlocked */
916 		vm_object_reference(protected_object);
917 
918 		/* limit the map entry to the area we want to cover */
919 		vm_map_clip_start(map, map_entry, start_aligned);
920 		vm_map_clip_end(map, map_entry, end_aligned);
921 
922 		tmp_entry = *map_entry;
923 		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
924 		vm_map_unlock(map);
925 		map_locked = FALSE;
926 
927 		/*
928 		 * This map entry might be only partially encrypted
929 		 * (if not fully "page-aligned").
930 		 */
931 		crypto_start = 0;
932 		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
933 		if (tmp_entry.vme_start < start) {
934 			if (tmp_entry.vme_start != start_aligned) {
935 				kr = KERN_INVALID_ADDRESS;
936 			}
937 			crypto_start += (start - tmp_entry.vme_start);
938 		}
939 		if (tmp_entry.vme_end > end) {
940 			if (tmp_entry.vme_end != end_aligned) {
941 				kr = KERN_INVALID_ADDRESS;
942 			}
943 			crypto_end -= (tmp_entry.vme_end - end);
944 		}
945 
946 		/*
947 		 * This "extra backing offset" is needed to get the decryption
948 		 * routine to use the right key.  It adjusts for the possibly
949 		 * relative offset of an interposed "4K" pager...
950 		 */
951 		if (crypto_backing_offset == (vm_object_offset_t) -1) {
952 			crypto_backing_offset = VME_OFFSET(&tmp_entry);
953 		}
954 
955 		cache_pager = TRUE;
956 #if XNU_TARGET_OS_OSX
957 		if (vm_map_is_alien(map)) {
958 			cache_pager = FALSE;
959 		}
960 #endif /* XNU_TARGET_OS_OSX */
961 
962 		/*
963 		 * Lookup (and create if necessary) the protected memory object
964 		 * matching that VM object.
965 		 * If successful, this also grabs a reference on the memory object,
966 		 * to guarantee that it doesn't go away before we get a chance to map
967 		 * it.
968 		 */
969 		unprotected_mem_obj = apple_protect_pager_setup(
970 			protected_object,
971 			VME_OFFSET(&tmp_entry),
972 			crypto_backing_offset,
973 			crypt_info,
974 			crypto_start,
975 			crypto_end,
976 			cache_pager);
977 
978 		/* release extra ref on protected object */
979 		vm_object_deallocate(protected_object);
980 
981 		if (unprotected_mem_obj == NULL) {
982 			kr = KERN_FAILURE;
983 			goto done;
984 		}
985 
986 		vm_flags = VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE;
987 		/* can overwrite an immutable mapping */
988 		vmk_flags.vmkf_overwrite_immutable = TRUE;
989 #if __arm64__
990 		if (tmp_entry.used_for_jit &&
991 		    (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT ||
992 		    PAGE_SHIFT != FOURK_PAGE_SHIFT) &&
993 		    fourk_binary_compatibility_unsafe &&
994 		    fourk_binary_compatibility_allow_wx) {
995 			printf("** FOURK_COMPAT [%d]: "
996 			    "allowing write+execute at 0x%llx\n",
997 			    proc_selfpid(), tmp_entry.vme_start);
998 			vmk_flags.vmkf_map_jit = TRUE;
999 		}
1000 #endif /* __arm64__ */
1001 
1002 		/* map this memory object in place of the current one */
1003 		map_addr = tmp_entry.vme_start;
1004 		kr = vm_map_enter_mem_object(map,
1005 		    &map_addr,
1006 		    (tmp_entry.vme_end -
1007 		    tmp_entry.vme_start),
1008 		    (mach_vm_offset_t) 0,
1009 		    vm_flags,
1010 		    vmk_flags,
1011 		    VM_KERN_MEMORY_NONE,
1012 		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1013 		    0,
1014 		    TRUE,
1015 		    tmp_entry.protection,
1016 		    tmp_entry.max_protection,
1017 		    tmp_entry.inheritance);
1018 		assertf(kr == KERN_SUCCESS,
1019 		    "kr = 0x%x\n", kr);
1020 		assertf(map_addr == tmp_entry.vme_start,
1021 		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1022 		    (uint64_t)map_addr,
1023 		    (uint64_t) tmp_entry.vme_start,
1024 		    &tmp_entry);
1025 
1026 #if VM_MAP_DEBUG_APPLE_PROTECT
1027 		if (vm_map_debug_apple_protect) {
1028 			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1029 			    " backing:[object:%p,offset:0x%llx,"
1030 			    "crypto_backing_offset:0x%llx,"
1031 			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1032 			    map,
1033 			    (uint64_t) map_addr,
1034 			    (uint64_t) (map_addr + (tmp_entry.vme_end -
1035 			    tmp_entry.vme_start)),
1036 			    unprotected_mem_obj,
1037 			    protected_object,
1038 			    VME_OFFSET(&tmp_entry),
1039 			    crypto_backing_offset,
1040 			    crypto_start,
1041 			    crypto_end);
1042 		}
1043 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1044 
1045 		/*
1046 		 * Release the reference obtained by
1047 		 * apple_protect_pager_setup().
1048 		 * The mapping (if it succeeded) is now holding a reference on
1049 		 * the memory object.
1050 		 */
1051 		memory_object_deallocate(unprotected_mem_obj);
1052 		unprotected_mem_obj = MEMORY_OBJECT_NULL;
1053 
1054 		/* continue with next map entry */
1055 		crypto_backing_offset += (tmp_entry.vme_end -
1056 		    tmp_entry.vme_start);
1057 		crypto_backing_offset -= crypto_start;
1058 	}
1059 	kr = KERN_SUCCESS;
1060 
1061 done:
1062 	if (map_locked) {
1063 		vm_map_unlock(map);
1064 	}
1065 	return kr;
1066 }
1067 #endif  /* CONFIG_CODE_DECRYPTION */
1068 
1069 
/* Lock group and attributes shared by all VM map locks. */
LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);

/*
 * Policy knob consumed in vm_map_init(): when non-zero, mappings whose
 * tag is in vm_memory_malloc_no_cow_mask are treated specially
 * (presumably created without copy-on-write — see the mask construction
 * in vm_map_init()).  Off by default on macOS, on elsewhere; overridable
 * via the "malloc_no_cow" boot-arg.
 */
#if XNU_TARGET_OS_OSX
int malloc_no_cow = 0;
#else /* XNU_TARGET_OS_OSX */
int malloc_no_cow = 1;
#endif /* XNU_TARGET_OS_OSX */
/* Bitmask of VM_MEMORY_MALLOC* tags affected; populated in vm_map_init(). */
uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
#if DEBUG
/* "vm_check_map_sanity" boot-arg: enables extra map consistency checking. */
int vm_check_map_sanity = 0;
#endif
1083 
1084 /*
1085  *	vm_map_init:
1086  *
1087  *	Initialize the vm_map module.  Must be called before
1088  *	any other vm_map routines.
1089  *
1090  *	Map and entry structures are allocated from zones -- we must
1091  *	initialize those zones.
1092  *
1093  *	There are three zones of interest:
1094  *
1095  *	vm_map_zone:		used to allocate maps.
1096  *	vm_map_entry_zone:	used to allocate map entries.
1097  *
1098  *	LP32:
1099  *	vm_map_entry_reserved_zone:     fallback zone for kernel map entries
1100  *
1101  *	The kernel allocates map entries from a special zone that is initially
1102  *	"crammed" with memory.  It would be difficult (perhaps impossible) for
 *	the kernel to allocate more memory to an entry zone when it became
1104  *	empty since the very act of allocating memory implies the creation
1105  *	of a new entry.
1106  */
1107 __startup_func
1108 void
vm_map_init(void)1109 vm_map_init(void)
1110 {
1111 
1112 #if MACH_ASSERT
1113 	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1114 	    sizeof(debug4k_filter));
1115 #endif /* MACH_ASSERT */
1116 
1117 	zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1118 	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1119 
1120 	/*
1121 	 * Don't quarantine because we always need elements available
1122 	 * Disallow GC on this zone... to aid the GC.
1123 	 */
1124 	zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1125 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1126 	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1127 		z->z_elems_rsv = (uint16_t)(32 *
1128 		(ml_early_cpu_max_number() + 1));
1129 	});
1130 
1131 	zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1132 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1133 	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1134 		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_size(z));
1135 	});
1136 
1137 	zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1138 	    ZC_NOENCRYPT | ZC_CACHING, ZONE_ID_VM_MAP_COPY, NULL);
1139 
1140 	/*
1141 	 * Add the stolen memory to zones, adjust zone size and stolen counts.
1142 	 */
1143 	zone_cram_early(vm_map_zone, map_data, map_data_size);
1144 	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1145 	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1146 	printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1147 	    vm_map_zone->z_elems_free,
1148 	    vm_map_entry_zone->z_elems_free,
1149 	    vm_map_holes_zone->z_elems_free);
1150 
1151 	/*
1152 	 * Since these are covered by zones, remove them from stolen page accounting.
1153 	 */
1154 	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1155 
1156 #if VM_MAP_DEBUG_APPLE_PROTECT
1157 	PE_parse_boot_argn("vm_map_debug_apple_protect",
1158 	    &vm_map_debug_apple_protect,
1159 	    sizeof(vm_map_debug_apple_protect));
1160 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1161 #if VM_MAP_DEBUG_APPLE_FOURK
1162 	PE_parse_boot_argn("vm_map_debug_fourk",
1163 	    &vm_map_debug_fourk,
1164 	    sizeof(vm_map_debug_fourk));
1165 #endif /* VM_MAP_DEBUG_FOURK */
1166 
1167 	PE_parse_boot_argn("malloc_no_cow",
1168 	    &malloc_no_cow,
1169 	    sizeof(malloc_no_cow));
1170 	if (malloc_no_cow) {
1171 		vm_memory_malloc_no_cow_mask = 0ULL;
1172 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1173 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1174 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1175 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1176 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1177 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1178 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1179 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1180 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1181 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1182 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1183 		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1184 		    &vm_memory_malloc_no_cow_mask,
1185 		    sizeof(vm_memory_malloc_no_cow_mask));
1186 	}
1187 
1188 #if CONFIG_MAP_RANGES
1189 	vm_map_range_map_init();
1190 #endif /* CONFIG_MAP_RANGES */
1191 
1192 #if DEBUG
1193 	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1194 	if (vm_check_map_sanity) {
1195 		kprintf("VM sanity checking enabled\n");
1196 	} else {
1197 		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1198 	}
1199 #endif /* DEBUG */
1200 
1201 #if DEVELOPMENT || DEBUG
1202 	PE_parse_boot_argn("panic_on_unsigned_execute",
1203 	    &panic_on_unsigned_execute,
1204 	    sizeof(panic_on_unsigned_execute));
1205 	PE_parse_boot_argn("panic_on_mlock_failure",
1206 	    &panic_on_mlock_failure,
1207 	    sizeof(panic_on_mlock_failure));
1208 #endif /* DEVELOPMENT || DEBUG */
1209 }
1210 
__startup_func
static void
vm_map_steal_memory(void)
{
	/*
	 * We need to reserve enough memory to support bootstrapping VM maps
	 * and the zone subsystem.
	 *
	 * The VM Maps that need to function before zones can support them
	 * are the ones registered with vm_map_will_allocate_early_map(),
	 * which are:
	 * - the kernel map
	 * - the various submaps used by zones (pgz, meta, ...)
	 *
	 * We also need enough entries and holes to support them
	 * until zone_metadata_init() is called, which is when
	 * the zone allocator becomes capable of expanding dynamically.
	 *
	 * We need:
	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
	 * - To allow for 3-4 entries per map, but the kernel map
	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
	 *   to describe the submaps, so double it (and make it 8x too)
	 * - To allow for holes between entries,
	 *   hence needs the same budget as entries
	 */
	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
	    VM_MAP_EARLY_COUNT_MAX);

	/* 8x: see the budget rationale above */
	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	/*
	 * Steal a contiguous range of memory so that a simple range check
	 * can validate early addresses being freed/crammed to these
	 * zones
	 */
	map_data       = zone_early_mem_init(map_data_size + kentry_data_size +
	    map_holes_data_size);
	kentry_data    = map_data + map_data_size;
	map_holes_data = kentry_data + kentry_data_size;
}
STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1260 
1261 __startup_func
1262 static void
vm_kernel_boostraped(void)1263 vm_kernel_boostraped(void)
1264 {
1265 	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1266 	    vm_map_zone->z_elems_free,
1267 	    vm_map_entry_zone->z_elems_free,
1268 	    vm_map_holes_zone->z_elems_free);
1269 }
1270 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1271 
/*
 *	vm_map_disable_hole_optimization:
 *
 *	Tear down the map's hole list, freeing every hole element, and
 *	switch the map back to the "first_free" hint scheme.
 */
void
vm_map_disable_hole_optimization(vm_map_t map)
{
	vm_map_entry_t  head_entry, hole_entry, next_hole_entry;

	if (map->holelistenabled) {
		/* the hole list is circular, headed by map->holes_list */
		head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);

		while (hole_entry != NULL) {
			next_hole_entry = hole_entry->vme_next;

			/* unlink before freeing back to the holes zone */
			hole_entry->vme_next = NULL;
			hole_entry->vme_prev = NULL;
			zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);

			/* stop once we wrap around to the head */
			if (next_hole_entry == head_entry) {
				hole_entry = NULL;
			} else {
				hole_entry = next_hole_entry;
			}
		}

		map->holes_list = NULL;
		map->holelistenabled = FALSE;

		/* fall back to tracking free space via first_free */
		map->first_free = vm_map_first_entry(map);
		SAVE_HINT_HOLE_WRITE(map, NULL);
	}
}
1301 
1302 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1303 vm_kernel_map_is_kernel(vm_map_t map)
1304 {
1305 	return map->pmap == kernel_pmap;
1306 }
1307 
1308 /*
1309  *	vm_map_create:
1310  *
1311  *	Creates and returns a new empty VM map with
1312  *	the given physical map structure, and having
1313  *	the given lower and upper address bounds.
1314  */
1315 
1316 extern vm_map_t vm_map_create_external(
1317 	pmap_t                  pmap,
1318 	vm_map_offset_t         min_off,
1319 	vm_map_offset_t         max_off,
1320 	boolean_t               pageable);
1321 
1322 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1323 vm_map_create_external(
1324 	pmap_t                  pmap,
1325 	vm_map_offset_t         min,
1326 	vm_map_offset_t         max,
1327 	boolean_t               pageable)
1328 {
1329 	vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1330 
1331 	if (pageable) {
1332 		options |= VM_MAP_CREATE_PAGEABLE;
1333 	}
1334 	return vm_map_create_options(pmap, min, max, options);
1335 }
1336 
1337 __startup_func
1338 void
vm_map_will_allocate_early_map(vm_map_t * owner)1339 vm_map_will_allocate_early_map(vm_map_t *owner)
1340 {
1341 	if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1342 		panic("VM_MAP_EARLY_COUNT_MAX is too low");
1343 	}
1344 
1345 	early_map_owners[early_map_count++] = owner;
1346 }
1347 
1348 __startup_func
1349 void
vm_map_relocate_early_maps(vm_offset_t delta)1350 vm_map_relocate_early_maps(vm_offset_t delta)
1351 {
1352 	for (uint32_t i = 0; i < early_map_count; i++) {
1353 		vm_address_t addr = (vm_address_t)*early_map_owners[i];
1354 
1355 		*early_map_owners[i] = (vm_map_t)(addr + delta);
1356 	}
1357 
1358 	early_map_count = ~0u;
1359 }
1360 
1361 /*
1362  *	Routine:	vm_map_relocate_early_elem
1363  *
1364  *	Purpose:
1365  *		Early zone elements are allocated in a temporary part
1366  *		of the address space.
1367  *
1368  *		Once the zones live in their final place, the early
1369  *		VM maps, map entries and map holes need to be relocated.
1370  *
1371  *		It involves rewriting any vm_map_t, vm_map_entry_t or
1372  *		pointers to vm_map_links. Other pointers to other types
1373  *		are fine.
1374  *
1375  *		Fortunately, pointers to those types are self-contained
1376  *		in those zones, _except_ for pointers to VM maps,
1377  *		which are tracked during early boot and fixed with
1378  *		vm_map_relocate_early_maps().
1379  */
__startup_func
void
vm_map_relocate_early_elem(
	uint32_t                zone_id,
	vm_offset_t             new_addr,
	vm_offset_t             delta)
{
/* Slide the named pointer field of the element at new_addr by "delta",
 * leaving NULL fields untouched. */
#define relocate(type_t, field)  ({ \
	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field;   \
	if (*__field) {                                                        \
	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta);  \
	}                                                                      \
})

	/* only the three early-boot zones can hold relocatable elements */
	switch (zone_id) {
	case ZONE_ID_VM_MAP:
	case ZONE_ID_VM_MAP_ENTRY:
	case ZONE_ID_VM_MAP_HOLES:
		break;

	default:
		panic("Unexpected zone ID %d", zone_id);
	}

	if (zone_id == ZONE_ID_VM_MAP) {
		relocate(vm_map_t, hdr.links.prev);
		relocate(vm_map_t, hdr.links.next);
		/* early maps all use the kernel pmap (see the early-map checks) */
		((vm_map_t)new_addr)->pmap = kernel_pmap;
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
#endif /* VM_MAP_STORE_USE_RB */
		relocate(vm_map_t, hint);
		relocate(vm_map_t, hole_hint);
		relocate(vm_map_t, first_free);
		return;
	}

	/* entries and holes both start with a vm_map_links (prev/next) */
	relocate(struct vm_map_links *, prev);
	relocate(struct vm_map_links *, next);

	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_entry_t, store.entry.rbe_left);
		relocate(vm_map_entry_t, store.entry.rbe_right);
		relocate(vm_map_entry_t, store.entry.rbe_parent);
#endif /* VM_MAP_STORE_USE_RB */
		if (((vm_map_entry_t)new_addr)->is_sub_map) {
			/* no object to relocate because we haven't made any */
			((vm_map_entry_t)new_addr)->vme_submap +=
			    delta >> VME_SUBMAP_SHIFT;
		}
#if MAP_ENTRY_CREATION_DEBUG
		relocate(vm_map_entry_t, vme_creation_maphdr);
#endif /* MAP_ENTRY_CREATION_DEBUG */
	}

#undef relocate
}
1438 
/*
 *	vm_map_create_options:
 *
 *	Allocate and initialize an empty VM map covering [min, max) on top
 *	of "pmap", configured per "options" (pageable entries, corpse
 *	footprint, hole list enable/disable, never-faults).
 *	Returns the new map with a single reference.
 */
vm_map_t
vm_map_create_options(
	pmap_t                  pmap,
	vm_map_offset_t         min,
	vm_map_offset_t         max,
	vm_map_create_options_t options)
{
	vm_map_t result;

#if DEBUG || DEVELOPMENT
	/* before zalloc is up, only registered early maps may be created */
	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
		if (early_map_count != ~0u && early_map_count !=
		    zone_count_allocated(vm_map_zone) + 1) {
			panic("allocating %dth early map, owner not known",
			    zone_count_allocated(vm_map_zone) + 1);
		}
		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
			panic("allocating %dth early map for non kernel pmap",
			    early_map_count);
		}
	}
#endif /* DEBUG || DEVELOPMENT */

	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);

	/* empty map: the entry list is just the header pointing at itself */
	vm_map_first_entry(result) = vm_map_to_entry(result);
	vm_map_last_entry(result)  = vm_map_to_entry(result);

	vm_map_store_init(&result->hdr);
	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
	vm_map_set_page_shift(result, PAGE_SHIFT);

	result->size_limit = RLIM_INFINITY;             /* default unlimited */
	result->data_limit = RLIM_INFINITY;             /* default unlimited */
	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
	result->pmap = pmap;
	result->min_offset = min;
	result->max_offset = max;
	result->first_free = vm_map_to_entry(result);
	result->hint = vm_map_to_entry(result);

	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
		assert(pmap == kernel_pmap);
		result->never_faults = true;
	}

	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
		result->has_corpse_footprint = true;
	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
		struct vm_map_links *hole_entry;

		/* seed the hole list with one hole spanning the whole map */
		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
		hole_entry->start = min;
#if defined(__arm64__)
		hole_entry->end = result->max_offset;
#else
		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
#endif
		result->holes_list = result->hole_hint = hole_entry;
		/* circular list of one element */
		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
		result->holelistenabled = true;
	}

	vm_map_lock_init(result);

	return result;
}
1508 
1509 /*
1510  * Adjusts a submap that was made by kmem_suballoc()
1511  * before it knew where it would be mapped,
1512  * so that it has the right min/max offsets.
1513  *
1514  * We do not need to hold any locks:
1515  * only the caller knows about this map,
1516  * and it is not published on any entry yet.
1517  */
static void
vm_map_adjust_offsets(
	vm_map_t                map,
	vm_map_offset_t         min_off,
	vm_map_offset_t         max_off)
{
	/*
	 * Sanity: the map must be a pristine kmem_suballoc() product —
	 * zero-based, same span, empty, with exactly two references.
	 */
	assert(map->min_offset == 0);
	assert(map->max_offset == max_off - min_off);
	assert(map->hdr.nentries == 0);
	assert(os_ref_get_count_raw(&map->map_refcnt) == 2);

	map->min_offset = min_off;
	map->max_offset = max_off;

	if (map->holelistenabled) {
		struct vm_map_links *hole = map->holes_list;

		/* keep the single initial hole in sync with the new bounds */
		hole->start = min_off;
#if defined(__arm64__)
		hole->end = max_off;
#else
		hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
#endif
	}
}
1543 
1544 
1545 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1546 vm_map_adjusted_size(vm_map_t map)
1547 {
1548 	struct vm_reserved_region *regions = NULL;
1549 	size_t num_regions = 0;
1550 	mach_vm_size_t  reserved_size = 0, map_size = 0;
1551 
1552 	if (map == NULL || (map->size == 0)) {
1553 		return 0;
1554 	}
1555 
1556 	map_size = map->size;
1557 
1558 	if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1559 		/*
1560 		 * No special reserved regions or not an exotic map or the task
1561 		 * is terminating and these special regions might have already
1562 		 * been deallocated.
1563 		 */
1564 		return map_size;
1565 	}
1566 
1567 	num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1568 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1569 
1570 	while (num_regions) {
1571 		reserved_size += regions[--num_regions].vmrr_size;
1572 	}
1573 
1574 	/*
1575 	 * There are a few places where the map is being switched out due to
1576 	 * 'termination' without that bit being set (e.g. exec and corpse purging).
1577 	 * In those cases, we could have the map's regions being deallocated on
1578 	 * a core while some accounting process is trying to get the map's size.
1579 	 * So this assert can't be enabled till all those places are uniform in
1580 	 * their use of the 'map->terminated' bit.
1581 	 *
1582 	 * assert(map_size >= reserved_size);
1583 	 */
1584 
1585 	return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1586 }
1587 
1588 /*
1589  *	vm_map_entry_create:	[ internal use only ]
1590  *
1591  *	Allocates a VM map entry for insertion in the
1592  *	given map (or map copy).  No fields are filled.
1593  *
1594  *	The VM entry will be zero initialized, except for:
1595  *	- behavior set to VM_BEHAVIOR_DEFAULT
1596  *	- inheritance set to VM_INHERIT_DEFAULT
1597  */
#define vm_map_entry_create(map)    _vm_map_entry_create(&(map)->hdr)

/* Same allocator, but for entries destined for a vm_map_copy. */
#define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1601 
/*
 * Allocate a zero-initialized map entry from the entry zone,
 * set its inheritance to VM_INHERIT_DEFAULT, and register it with
 * the map store layer.  May block (Z_WAITOK).
 */
static vm_map_entry_t
_vm_map_entry_create(
	struct vm_map_header    *map_header __unused)
{
	vm_map_entry_t entry = NULL;

	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);

	/*
	 * Help the compiler with what we know to be true,
	 * so that the further bitfields inits have good codegen.
	 *
	 * See rdar://87041299
	 */
	__builtin_assume(entry->vme_object_value == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);

	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
	    "VME_ALIAS_MASK covers tags");

	static_assert(VM_BEHAVIOR_DEFAULT == 0,
	    "can skip zeroing of the behavior field");
	/* VM_INHERIT_DEFAULT is not 0, so it must be set explicitly */
	entry->inheritance = VM_INHERIT_DEFAULT;

	vm_map_store_update((vm_map_t) NULL, entry, VM_MAP_ENTRY_CREATE);

#if MAP_ENTRY_CREATION_DEBUG
	/* remember who created this entry, for debugging */
	entry->vme_creation_maphdr = map_header;
	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
	    BTREF_GET_NOWAIT);
#endif
	return entry;
}
1636 
1637 /*
1638  *	vm_map_entry_dispose:	[ internal use only ]
1639  *
1640  *	Inverse of vm_map_entry_create.
1641  *
1642  *      write map lock held so no need to
1643  *	do anything special to insure correctness
1644  *      of the stores
1645  */
static void
vm_map_entry_dispose(
	vm_map_entry_t          entry)
{
#if MAP_ENTRY_CREATION_DEBUG
	/* drop the backtrace reference taken in _vm_map_entry_create() */
	btref_put(entry->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(entry->vme_insertion_bt);
#endif
	zfree(vm_map_entry_zone, entry);
}

/* Copy entries come from the same zone, so disposal is identical. */
#define vm_map_copy_entry_dispose(copy_entry) \
	vm_map_entry_dispose(copy_entry)
1661 
1662 static vm_map_entry_t
vm_map_zap_first_entry(vm_map_zap_t list)1663 vm_map_zap_first_entry(
1664 	vm_map_zap_t            list)
1665 {
1666 	return list->vmz_head;
1667 }
1668 
/*
 * Return the last entry of a non-empty zap list.
 * The tail pointer points at the last entry's vme_next field,
 * so the entry is recovered with __container_of.
 */
static vm_map_entry_t
vm_map_zap_last_entry(
	vm_map_zap_t            list)
{
	assert(vm_map_zap_first_entry(list));
	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
}
1676 
1677 static void
vm_map_zap_append(vm_map_zap_t list,vm_map_entry_t entry)1678 vm_map_zap_append(
1679 	vm_map_zap_t            list,
1680 	vm_map_entry_t          entry)
1681 {
1682 	entry->vme_next = VM_MAP_ENTRY_NULL;
1683 	*list->vmz_tail = entry;
1684 	list->vmz_tail = &entry->vme_next;
1685 }
1686 
1687 static vm_map_entry_t
vm_map_zap_pop(vm_map_zap_t list)1688 vm_map_zap_pop(
1689 	vm_map_zap_t            list)
1690 {
1691 	vm_map_entry_t head = list->vmz_head;
1692 
1693 	if (head != VM_MAP_ENTRY_NULL &&
1694 	    (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1695 		list->vmz_tail = &list->vmz_head;
1696 	}
1697 
1698 	return head;
1699 }
1700 
1701 static void
vm_map_zap_dispose(vm_map_zap_t list)1702 vm_map_zap_dispose(
1703 	vm_map_zap_t            list)
1704 {
1705 	vm_map_entry_t          entry;
1706 
1707 	while ((entry = vm_map_zap_pop(list))) {
1708 		if (entry->is_sub_map) {
1709 			vm_map_deallocate(VME_SUBMAP(entry));
1710 		} else {
1711 			vm_object_deallocate(VME_OBJECT(entry));
1712 		}
1713 
1714 		vm_map_entry_dispose(entry);
1715 	}
1716 }
1717 
#if MACH_ASSERT
/* Off by default; when set, first_free hints are actually validated. */
static boolean_t first_free_check = FALSE;
boolean_t
first_free_is_valid(
	vm_map_t        map)
{
	if (first_free_check) {
		return first_free_is_valid_store(map);
	}

	/* checking disabled: report the hint as valid */
	return TRUE;
}
#endif /* MACH_ASSERT */
1731 
1732 
/* Link/unlink an entry into/out of a vm_map_copy's entry list
 * (copy-header variants of the map store link/unlink operations). */
#define vm_map_copy_entry_link(copy, after_where, entry)                \
	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))

#define vm_map_copy_entry_unlink(copy, entry)                           \
	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1738 
1739 /*
1740  *	vm_map_destroy:
1741  *
1742  *	Actually destroy a map.
1743  */
void
vm_map_destroy(
	vm_map_t        map)
{
	/* final cleanup: this is not allowed to fail */
	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;

	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);

	map->terminated = true;
	/* clean up regular map entries */
	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
	    KMEM_GUARD_NONE, &zap);
	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
	    KMEM_GUARD_NONE, &zap);

	vm_map_disable_hole_optimization(map);
	vm_map_corpse_footprint_destroy(map);

	vm_map_unlock(map);

	/* drop the deleted entries' references outside the map lock */
	vm_map_zap_dispose(&zap);

	/* both deletions above must have emptied the map */
	assert(map->hdr.nentries == 0);

	if (map->pmap) {
		pmap_destroy(map->pmap);
	}

	lck_rw_destroy(&map->lock, &vm_map_lck_grp);

	zfree_id(ZONE_ID_VM_MAP, map);
}
1780 
1781 /*
1782  * Returns pid of the task with the largest number of VM map entries.
1783  * Used in the zone-map-exhaustion jetsam path.
1784  */
1785 pid_t
find_largest_process_vm_map_entries(void)1786 find_largest_process_vm_map_entries(void)
1787 {
1788 	pid_t victim_pid = -1;
1789 	int max_vm_map_entries = 0;
1790 	task_t task = TASK_NULL;
1791 	queue_head_t *task_list = &tasks;
1792 
1793 	lck_mtx_lock(&tasks_threads_lock);
1794 	queue_iterate(task_list, task, task_t, tasks) {
1795 		if (task == kernel_task || !task->active) {
1796 			continue;
1797 		}
1798 
1799 		vm_map_t task_map = task->map;
1800 		if (task_map != VM_MAP_NULL) {
1801 			int task_vm_map_entries = task_map->hdr.nentries;
1802 			if (task_vm_map_entries > max_vm_map_entries) {
1803 				max_vm_map_entries = task_vm_map_entries;
1804 				victim_pid = pid_from_task(task);
1805 			}
1806 		}
1807 	}
1808 	lck_mtx_unlock(&tasks_threads_lock);
1809 
1810 	printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1811 	return victim_pid;
1812 }
1813 
1814 
1815 /*
1816  *	vm_map_lookup_entry:	[ internal use only ]
1817  *
1818  *	Calls into the vm map store layer to find the map
1819  *	entry containing (or immediately preceding) the
1820  *	specified address in the given map; the entry is returned
1821  *	in the "entry" parameter.  The boolean
1822  *	result indicates whether the address is
1823  *	actually contained in the map.
1824  */
boolean_t
vm_map_lookup_entry(
	vm_map_t        map,
	vm_map_offset_t address,
	vm_map_entry_t  *entry)         /* OUT */
{
#if CONFIG_KERNEL_TBI
	/* strip TBI tag bits so the store lookup sees a canonical address */
	if (VM_KERNEL_ADDRESS(address)) {
		address = VM_KERNEL_STRIP_UPTR(address);
	}
#endif /* CONFIG_KERNEL_TBI */
#if CONFIG_PROB_GZALLOC
	if (map->pmap == kernel_pmap) {
		assertf(!pgz_owned(address),
		    "it is the responsibility of callers to unguard PGZ addresses");
	}
#endif /* CONFIG_PROB_GZALLOC */
	/* delegate to the map store layer (RB tree / linked list) */
	return vm_map_store_lookup_entry( map, address, entry );
}
1844 
1845 boolean_t
vm_map_lookup_entry_or_next(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)1846 vm_map_lookup_entry_or_next(
1847 	vm_map_t        map,
1848 	vm_map_offset_t address,
1849 	vm_map_entry_t  *entry)         /* OUT */
1850 {
1851 	if (vm_map_lookup_entry(map, address, entry)) {
1852 		return true;
1853 	}
1854 
1855 	*entry = (*entry)->vme_next;
1856 	return false;
1857 }
1858 
#if CONFIG_PROB_GZALLOC
/*
 * Variant of vm_map_lookup_entry() without the PGZ ownership assertion,
 * for callers that may legitimately pass PGZ-guarded addresses.
 */
boolean_t
vm_map_lookup_entry_allow_pgz(
	vm_map_t        map,
	vm_map_offset_t address,
	vm_map_entry_t  *entry)         /* OUT */
{
#if CONFIG_KERNEL_TBI
	/* strip TBI tag bits so the store lookup sees a canonical address */
	if (VM_KERNEL_ADDRESS(address)) {
		address = VM_KERNEL_STRIP_UPTR(address);
	}
#endif /* CONFIG_KERNEL_TBI */
	return vm_map_store_lookup_entry( map, address, entry );
}
#endif /* CONFIG_PROB_GZALLOC */
1874 
1875 #if !ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
1876 /*
1877  *	Routine:	vm_map_adjust_direction
1878  *	Purpose:
1879  *			Overrides direction to reduce fragmentation. Allocate small
1880  *			allocations from the end and large allocations from the right.
1881  */
1882 static void
vm_map_adjust_direction(vm_map_kernel_flags_t * vmk_flags,vm_map_size_t size)1883 vm_map_adjust_direction(
1884 	vm_map_kernel_flags_t *vmk_flags,
1885 	vm_map_size_t          size)
1886 {
1887 	if (size < KMEM_SMALLMAP_THRESHOLD) {
1888 		vmk_flags->vmkf_last_free = true;
1889 	} else {
1890 		vmk_flags->vmkf_last_free = false;
1891 	}
1892 }
1893 #endif /* !ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
1894 
1895 /*
1896  *	Routine:	vm_map_range_invalid_panic
1897  *	Purpose:
1898  *			Panic on detection of an invalid range id.
1899  */
1900 __abortlike
1901 static void
vm_map_range_invalid_panic(vm_map_t map,vm_map_range_id_t range_id)1902 vm_map_range_invalid_panic(
1903 	vm_map_t                map,
1904 	vm_map_range_id_t       range_id)
1905 {
1906 	panic("invalid range ID (%u) for map %p", range_id, map);
1907 }
1908 
1909 /*
1910  *	Routine:	vm_map_get_range
1911  *	Purpose:
1912  *			Adjust bounds based on security policy.
1913  */
1914 static struct mach_vm_range
vm_map_get_range(vm_map_t map,vm_map_address_t * address,vm_map_kernel_flags_t * vmk_flags,vm_map_size_t size)1915 vm_map_get_range(
1916 	vm_map_t                map,
1917 	vm_map_address_t       *address,
1918 	vm_map_kernel_flags_t  *vmk_flags,
1919 	vm_map_size_t           size)
1920 {
1921 	struct mach_vm_range effective_range = {};
1922 	vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
1923 
1924 	if (map == kernel_map) {
1925 		effective_range = kmem_ranges[range_id];
1926 
1927 		if (startup_phase >= STARTUP_SUB_KMEM) {
1928 			/*
1929 			 * Hint provided by caller is zeroed as the range is restricted to a
1930 			 * subset of the entire kernel_map VA, which could put the hint outside
1931 			 * the range, causing vm_map_store_find_space to fail.
1932 			 */
1933 			*address = 0ull;
1934 			/*
1935 			 * Ensure that range_id passed in by the caller is within meaningful
1936 			 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
1937 			 * to fail as the corresponding range is invalid. Range id larger than
1938 			 * KMEM_RANGE_ID_MAX will lead to an OOB access.
1939 			 */
1940 			if ((range_id == KMEM_RANGE_ID_NONE) ||
1941 			    (range_id > KMEM_RANGE_ID_MAX)) {
1942 				vm_map_range_invalid_panic(map, range_id);
1943 			}
1944 #if ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
1945 			/*
1946 			 * Each allocation front looks like [ S | L | S ]
1947 			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
1948 			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
1949 			 * use the entire range. Two small allocations from different fronts
1950 			 * (left and right) can only meet when memory in the that range is
1951 			 * entirely exhausted.
1952 			 */
1953 			if (size >= KMEM_SMALLMAP_THRESHOLD) {
1954 				effective_range = kmem_large_ranges[range_id];
1955 			}
1956 #else /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
1957 			vm_map_adjust_direction(vmk_flags, size);
1958 #endif /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
1959 		}
1960 #if CONFIG_MAP_RANGES
1961 	} else if (map->uses_user_ranges) {
1962 		if (range_id > UMEM_RANGE_ID_MAX) {
1963 			vm_map_range_invalid_panic(map, range_id);
1964 		}
1965 
1966 		effective_range = map->user_range[range_id];
1967 #endif /* CONFIG_MAP_RANGES */
1968 	} else {
1969 		/*
1970 		 * If minimum is 0, bump it up by PAGE_SIZE.  We want to limit
1971 		 * allocations of PAGEZERO to explicit requests since its
1972 		 * normal use is to catch dereferences of NULL and many
1973 		 * applications also treat pointers with a value of 0 as
1974 		 * special and suddenly having address 0 contain useable
1975 		 * memory would tend to confuse those applications.
1976 		 */
1977 		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
1978 		effective_range.max_address = map->max_offset;
1979 	}
1980 
1981 	return effective_range;
1982 }
1983 
1984 /*
1985  *	Routine:	vm_map_locate_space
1986  *	Purpose:
1987  *		Finds a range in the specified virtual address map,
1988  *		returning the start of that range,
1989  *		as well as the entry right before it.
1990  */
1991 kern_return_t
vm_map_locate_space(vm_map_t map,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_offset_t * start_inout,vm_map_entry_t * entry_out)1992 vm_map_locate_space(
1993 	vm_map_t                map,
1994 	vm_map_size_t           size,
1995 	vm_map_offset_t         mask,
1996 	vm_map_kernel_flags_t   vmk_flags,
1997 	vm_map_offset_t        *start_inout,
1998 	vm_map_entry_t         *entry_out)
1999 {
2000 	struct mach_vm_range effective_range = {};
2001 	vm_map_size_t   guard_offset;
2002 	vm_map_offset_t hint, limit;
2003 	vm_map_entry_t  entry;
2004 
2005 	/*
2006 	 * Only supported by vm_map_enter() with a fixed address.
2007 	 */
2008 	assert(!vmk_flags.vmkf_beyond_max);
2009 
2010 	if (__improbable(map->wait_for_space)) {
2011 		/*
2012 		 * support for "wait_for_space" is minimal,
2013 		 * its only consumer is the ipc_kernel_copy_map.
2014 		 */
2015 		assert(!map->holelistenabled &&
2016 		    !vmk_flags.vmkf_last_free &&
2017 		    !vmk_flags.vmkf_keep_map_locked &&
2018 		    !vmk_flags.vmkf_map_jit &&
2019 		    !vmk_flags.vmkf_random_address &&
2020 		    *start_inout <= map->min_offset);
2021 	} else if (vmk_flags.vmkf_last_free) {
2022 		assert(!vmk_flags.vmkf_map_jit &&
2023 		    !vmk_flags.vmkf_random_address);
2024 	}
2025 
2026 	if (vmk_flags.vmkf_guard_before) {
2027 		guard_offset = VM_MAP_PAGE_SIZE(map);
2028 		assert(size > guard_offset);
2029 		size -= guard_offset;
2030 	} else {
2031 		assert(size != 0);
2032 		guard_offset = 0;
2033 	}
2034 
2035 	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size);
2036 #if XNU_TARGET_OS_OSX
2037 	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2038 		assert(map != kernel_map);
2039 		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2040 	}
2041 #endif /* XNU_TARGET_OS_OSX */
2042 
2043 again:
2044 	if (vmk_flags.vmkf_last_free) {
2045 		hint = *start_inout;
2046 
2047 		if (hint == 0 || hint > effective_range.max_address) {
2048 			hint = effective_range.max_address;
2049 		}
2050 		if (hint <= effective_range.min_address) {
2051 			return KERN_NO_SPACE;
2052 		}
2053 		limit = effective_range.min_address;
2054 	} else {
2055 		hint = *start_inout;
2056 
2057 		if (vmk_flags.vmkf_map_jit) {
2058 			if (map->jit_entry_exists &&
2059 			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2060 				return KERN_INVALID_ARGUMENT;
2061 			}
2062 			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2063 				vmk_flags.vmkf_random_address = true;
2064 			}
2065 		}
2066 
2067 		if (vmk_flags.vmkf_random_address) {
2068 			kern_return_t kr;
2069 
2070 			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2071 			if (kr != KERN_SUCCESS) {
2072 				return kr;
2073 			}
2074 		}
2075 #if XNU_TARGET_OS_OSX
2076 		else if ((hint == 0 || hint == vm_map_min(map)) &&
2077 		    !map->disable_vmentry_reuse &&
2078 		    map->vmmap_high_start != 0) {
2079 			hint = map->vmmap_high_start;
2080 		}
2081 #endif /* XNU_TARGET_OS_OSX */
2082 
2083 		if (hint < effective_range.min_address) {
2084 			hint = effective_range.min_address;
2085 		}
2086 		if (effective_range.max_address <= hint) {
2087 			return KERN_NO_SPACE;
2088 		}
2089 
2090 		limit = effective_range.max_address;
2091 	}
2092 	entry = vm_map_store_find_space(map,
2093 	    hint, limit, vmk_flags.vmkf_last_free,
2094 	    guard_offset, size, mask,
2095 	    start_inout);
2096 
2097 	if (__improbable(entry == NULL)) {
2098 		if (map->wait_for_space &&
2099 		    guard_offset + size <=
2100 		    effective_range.max_address - effective_range.min_address) {
2101 			assert_wait((event_t)map, THREAD_ABORTSAFE);
2102 			vm_map_unlock(map);
2103 			thread_block(THREAD_CONTINUE_NULL);
2104 			vm_map_lock(map);
2105 			goto again;
2106 		}
2107 		return KERN_NO_SPACE;
2108 	}
2109 
2110 	if (entry_out) {
2111 		*entry_out = entry;
2112 	}
2113 	return KERN_SUCCESS;
2114 }
2115 
2116 
2117 /*
2118  *	Routine:	vm_map_find_space
2119  *	Purpose:
2120  *		Allocate a range in the specified virtual address map,
2121  *		returning the entry allocated for that range.
2122  *		Used by kmem_alloc, etc.
2123  *
2124  *		The map must be NOT be locked. It will be returned locked
2125  *		on KERN_SUCCESS, unlocked on failure.
2126  *
2127  *		If an entry is allocated, the object/offset fields
2128  *		are initialized to zero.
2129  */
2130 kern_return_t
vm_map_find_space(vm_map_t map,vm_map_offset_t hint_address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_entry_t * o_entry)2131 vm_map_find_space(
2132 	vm_map_t                map,
2133 	vm_map_offset_t         hint_address,
2134 	vm_map_size_t           size,
2135 	vm_map_offset_t         mask,
2136 	vm_map_kernel_flags_t   vmk_flags,
2137 	vm_map_entry_t          *o_entry)       /* OUT */
2138 {
2139 	vm_map_entry_t          new_entry, entry;
2140 	kern_return_t           kr;
2141 
2142 	if (size == 0) {
2143 		return KERN_INVALID_ARGUMENT;
2144 	}
2145 
2146 	new_entry = vm_map_entry_create(map);
2147 	new_entry->use_pmap = true;
2148 	new_entry->protection = VM_PROT_DEFAULT;
2149 	new_entry->max_protection = VM_PROT_ALL;
2150 
2151 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2152 		new_entry->map_aligned = true;
2153 	}
2154 	if (vmk_flags.vmkf_permanent) {
2155 		new_entry->vme_permanent = true;
2156 	}
2157 
2158 	vm_map_lock(map);
2159 
2160 	kr = vm_map_locate_space(map, size, mask, vmk_flags,
2161 	    &hint_address, &entry);
2162 	if (kr != KERN_SUCCESS) {
2163 		vm_map_unlock(map);
2164 		vm_map_entry_dispose(new_entry);
2165 		return kr;
2166 	}
2167 	new_entry->vme_start = hint_address;
2168 	new_entry->vme_end = hint_address + size;
2169 
2170 	/*
2171 	 *	At this point,
2172 	 *
2173 	 *	- new_entry's "vme_start" and "vme_end" should define
2174 	 *	  the endpoints of the available new range,
2175 	 *
2176 	 *	- and "entry" should refer to the region before
2177 	 *	  the new range,
2178 	 *
2179 	 *	- and the map should still be locked.
2180 	 */
2181 
2182 	assert(page_aligned(new_entry->vme_start));
2183 	assert(page_aligned(new_entry->vme_end));
2184 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2185 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2186 
2187 	/*
2188 	 *	Insert the new entry into the list
2189 	 */
2190 
2191 	vm_map_store_entry_link(map, entry, new_entry, VM_MAP_KERNEL_FLAGS_NONE);
2192 	map->size += size;
2193 
2194 	/*
2195 	 *	Update the lookup hint
2196 	 */
2197 	SAVE_HINT_MAP_WRITE(map, new_entry);
2198 
2199 	*o_entry = new_entry;
2200 	return KERN_SUCCESS;
2201 }
2202 
/* when set, vm_map_pmap_enter() logs each page it enters into the pmap */
int vm_map_pmap_enter_print = FALSE;
/* debug knob; not read within this file's visible code — see callers */
int vm_map_pmap_enter_enable = FALSE;
2205 
2206 /*
2207  *	Routine:	vm_map_pmap_enter [internal only]
2208  *
2209  *	Description:
2210  *		Force pages from the specified object to be entered into
2211  *		the pmap at the specified address if they are present.
2212  *		As soon as a page not found in the object the scan ends.
2213  *
2214  *	Returns:
2215  *		Nothing.
2216  *
2217  *	In/out conditions:
2218  *		The source map should not be locked on entry.
2219  */
2220 __unused static void
vm_map_pmap_enter(vm_map_t map,vm_map_offset_t addr,vm_map_offset_t end_addr,vm_object_t object,vm_object_offset_t offset,vm_prot_t protection)2221 vm_map_pmap_enter(
2222 	vm_map_t                map,
2223 	vm_map_offset_t         addr,
2224 	vm_map_offset_t         end_addr,
2225 	vm_object_t             object,
2226 	vm_object_offset_t      offset,
2227 	vm_prot_t               protection)
2228 {
2229 	int                     type_of_fault;
2230 	kern_return_t           kr;
2231 	struct vm_object_fault_info fault_info = {};
2232 
2233 	if (map->pmap == 0) {
2234 		return;
2235 	}
2236 
2237 	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2238 
2239 	while (addr < end_addr) {
2240 		vm_page_t       m;
2241 
2242 
2243 		/*
2244 		 * TODO:
2245 		 * From vm_map_enter(), we come into this function without the map
2246 		 * lock held or the object lock held.
2247 		 * We haven't taken a reference on the object either.
2248 		 * We should do a proper lookup on the map to make sure
2249 		 * that things are sane before we go locking objects that
2250 		 * could have been deallocated from under us.
2251 		 */
2252 
2253 		vm_object_lock(object);
2254 
2255 		m = vm_page_lookup(object, offset);
2256 
2257 		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
2258 		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
2259 			vm_object_unlock(object);
2260 			return;
2261 		}
2262 
2263 		if (vm_map_pmap_enter_print) {
2264 			printf("vm_map_pmap_enter:");
2265 			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2266 			    map, (unsigned long long)addr, object, (unsigned long long)offset);
2267 		}
2268 		type_of_fault = DBG_CACHE_HIT_FAULT;
2269 		kr = vm_fault_enter(m, map->pmap,
2270 		    addr,
2271 		    PAGE_SIZE, 0,
2272 		    protection, protection,
2273 		    VM_PAGE_WIRED(m),
2274 		    FALSE,                 /* change_wiring */
2275 		    VM_KERN_MEMORY_NONE,                 /* tag - not wiring */
2276 		    &fault_info,
2277 		    NULL,                  /* need_retry */
2278 		    &type_of_fault);
2279 
2280 		vm_object_unlock(object);
2281 
2282 		offset += PAGE_SIZE_64;
2283 		addr += PAGE_SIZE;
2284 	}
2285 }
2286 
#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
/*
 *	Routine:	vm_map_random_address_for_size
 *	Purpose:
 *		Pick a random, page-aligned address within the map's
 *		effective range that fronts a hole large enough for
 *		"size" bytes.  Returns KERN_NO_SPACE if the range is too
 *		small or no suitable address is found within
 *		MAX_TRIES_TO_GET_RANDOM_ADDRESS attempts.
 *		The map must be locked by the caller (entries are examined).
 */
kern_return_t
vm_map_random_address_for_size(
	vm_map_t                map,
	vm_map_offset_t        *address,
	vm_map_size_t           size,
	vm_map_kernel_flags_t   vmk_flags)
{
	kern_return_t   kr = KERN_SUCCESS;
	int             tries = 0;
	vm_map_offset_t random_addr = 0;
	vm_map_offset_t hole_end;

	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
	vm_map_size_t   vm_hole_size = 0;
	vm_map_size_t   addr_space_size;
	struct mach_vm_range effective_range = vm_map_get_range(map, address, &vmk_flags, size);

	addr_space_size = effective_range.max_address - effective_range.min_address;
	if (size >= addr_space_size) {
		return KERN_NO_SPACE;
	}
	/* shrink the window so addr + size stays inside the range */
	addr_space_size -= size;

	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));

	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		/* early_random() is the only entropy source before zalloc is up */
		if (startup_phase < STARTUP_SUB_ZALLOC) {
			random_addr = (vm_map_offset_t)early_random();
		} else {
			random_addr = (vm_map_offset_t)random();
		}
		/* spread entropy across page-granular addresses in the window */
		random_addr <<= VM_MAP_PAGE_SHIFT(map);
		random_addr = vm_map_trunc_page(
			effective_range.min_address + (random_addr % addr_space_size),
			VM_MAP_PAGE_MASK(map));

#if CONFIG_PROB_GZALLOC
		/*
		 * NOTE(review): this "continue" skips the tries++ below, so
		 * PGZ-owned candidates do not count against the retry budget;
		 * confirm this cannot spin unboundedly.
		 */
		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
			continue;
		}
#endif /* CONFIG_PROB_GZALLOC */

		/* only accept addresses that fall in a hole big enough for size */
		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
			if (prev_entry == vm_map_to_entry(map)) {
				next_entry = vm_map_first_entry(map);
			} else {
				next_entry = prev_entry->vme_next;
			}
			if (next_entry == vm_map_to_entry(map)) {
				hole_end = vm_map_max(map);
			} else {
				hole_end = next_entry->vme_start;
			}
			vm_hole_size = hole_end - random_addr;
			if (vm_hole_size >= size) {
				*address = random_addr;
				break;
			}
		}
		tries++;
	}

	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		kr = KERN_NO_SPACE;
	}
	return kr;
}
2356 
2357 static boolean_t
vm_memory_malloc_no_cow(int alias)2358 vm_memory_malloc_no_cow(
2359 	int alias)
2360 {
2361 	uint64_t alias_mask;
2362 
2363 	if (alias > 63) {
2364 		return FALSE;
2365 	}
2366 
2367 	alias_mask = 1ULL << alias;
2368 	if (alias_mask & vm_memory_malloc_no_cow_mask) {
2369 		return TRUE;
2370 	}
2371 	return FALSE;
2372 }
2373 
/* counters tracking mappings denied by the RLIMIT_AS / RLIMIT_DATA rlimits */
uint64_t vm_map_enter_RLIMIT_AS_count = 0;
uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
/*
 *	Routine:	vm_map_enter
 *
 *	Description:
 *		Allocate a range in the specified virtual address map.
 *		The resulting range will refer to memory defined by
 *		the given memory object and offset into that object.
 *
 *		Arguments are as defined in the vm_map call.
 */
/* counters for restoring zapped mappings after a failed overwrite */
static unsigned int vm_map_enter_restore_successes = 0;
static unsigned int vm_map_enter_restore_failures = 0;
2388 kern_return_t
vm_map_enter(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t alias,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)2389 vm_map_enter(
2390 	vm_map_t                map,
2391 	vm_map_offset_t         *address,       /* IN/OUT */
2392 	vm_map_size_t           size,
2393 	vm_map_offset_t         mask,
2394 	int                     flags,
2395 	vm_map_kernel_flags_t   vmk_flags,
2396 	vm_tag_t                alias,
2397 	vm_object_t             object,
2398 	vm_object_offset_t      offset,
2399 	boolean_t               needs_copy,
2400 	vm_prot_t               cur_protection,
2401 	vm_prot_t               max_protection,
2402 	vm_inherit_t            inheritance)
2403 {
2404 	vm_map_entry_t          entry, new_entry;
2405 	vm_map_offset_t         start, tmp_start, tmp_offset;
2406 	vm_map_offset_t         end, tmp_end;
2407 	vm_map_offset_t         tmp2_start, tmp2_end;
2408 	vm_map_offset_t         step;
2409 	kern_return_t           result = KERN_SUCCESS;
2410 	boolean_t               map_locked = FALSE;
2411 	boolean_t               pmap_empty = TRUE;
2412 	boolean_t               new_mapping_established = FALSE;
2413 	boolean_t               keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2414 	boolean_t               anywhere = ((flags & VM_FLAGS_ANYWHERE) != 0);
2415 	boolean_t               purgable = ((flags & VM_FLAGS_PURGABLE) != 0);
2416 	boolean_t               overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0);
2417 	boolean_t               no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0);
2418 	const boolean_t         is_submap = vmk_flags.vmkf_submap;
2419 	boolean_t               permanent = (((flags & VM_FLAGS_PERMANENT) != 0) || vmk_flags.vmkf_permanent);
2420 	const boolean_t         no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2421 	const boolean_t         entry_for_jit = vmk_flags.vmkf_map_jit;
2422 	boolean_t               iokit_acct = vmk_flags.vmkf_iokit_acct;
2423 	boolean_t               resilient_codesign = ((flags & VM_FLAGS_RESILIENT_CODESIGN) != 0);
2424 	boolean_t               resilient_media = ((flags & VM_FLAGS_RESILIENT_MEDIA) != 0);
2425 	boolean_t               entry_for_tpro = ((flags & VM_FLAGS_TPRO) != 0);
2426 	unsigned int            superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT);
2427 	vm_tag_t                user_alias;
2428 	kern_return_t           kr;
2429 	boolean_t               clear_map_aligned = FALSE;
2430 	vm_map_size_t           chunk_size = 0;
2431 	vm_object_t             caller_object;
2432 	VM_MAP_ZAP_DECLARE(zap_old_list);
2433 	VM_MAP_ZAP_DECLARE(zap_new_list);
2434 
2435 	caller_object = object;
2436 
2437 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2438 
2439 	if (flags & VM_FLAGS_4GB_CHUNK) {
2440 #if defined(__LP64__)
2441 		chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2442 #else /* __LP64__ */
2443 		chunk_size = ANON_CHUNK_SIZE;
2444 #endif /* __LP64__ */
2445 	} else {
2446 		chunk_size = ANON_CHUNK_SIZE;
2447 	}
2448 
2449 	if (superpage_size) {
2450 		switch (superpage_size) {
2451 			/*
2452 			 * Note that the current implementation only supports
2453 			 * a single size for superpages, SUPERPAGE_SIZE, per
2454 			 * architecture. As soon as more sizes are supposed
2455 			 * to be supported, SUPERPAGE_SIZE has to be replaced
2456 			 * with a lookup of the size depending on superpage_size.
2457 			 */
2458 #ifdef __x86_64__
2459 		case SUPERPAGE_SIZE_ANY:
2460 			/* handle it like 2 MB and round up to page size */
2461 			size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2462 			OS_FALLTHROUGH;
2463 		case SUPERPAGE_SIZE_2MB:
2464 			break;
2465 #endif
2466 		default:
2467 			return KERN_INVALID_ARGUMENT;
2468 		}
2469 		mask = SUPERPAGE_SIZE - 1;
2470 		if (size & (SUPERPAGE_SIZE - 1)) {
2471 			return KERN_INVALID_ARGUMENT;
2472 		}
2473 		inheritance = VM_INHERIT_NONE;  /* fork() children won't inherit superpages */
2474 	}
2475 
2476 
2477 	if ((cur_protection & VM_PROT_WRITE) &&
2478 	    (cur_protection & VM_PROT_EXECUTE) &&
2479 #if XNU_TARGET_OS_OSX
2480 	    map->pmap != kernel_pmap &&
2481 	    (cs_process_global_enforcement() ||
2482 	    (vmk_flags.vmkf_cs_enforcement_override
2483 	    ? vmk_flags.vmkf_cs_enforcement
2484 	    : (vm_map_cs_enforcement(map)
2485 #if __arm64__
2486 	    || !VM_MAP_IS_EXOTIC(map)
2487 #endif /* __arm64__ */
2488 	    ))) &&
2489 #endif /* XNU_TARGET_OS_OSX */
2490 	    (VM_MAP_POLICY_WX_FAIL(map) ||
2491 	    VM_MAP_POLICY_WX_STRIP_X(map)) &&
2492 	    !entry_for_jit) {
2493 		boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2494 
2495 		DTRACE_VM3(cs_wx,
2496 		    uint64_t, 0,
2497 		    uint64_t, 0,
2498 		    vm_prot_t, cur_protection);
2499 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2500 		    proc_selfpid(),
2501 		    (get_bsdtask_info(current_task())
2502 		    ? proc_name_address(get_bsdtask_info(current_task()))
2503 		    : "?"),
2504 		    __FUNCTION__,
2505 		    (vm_protect_wx_fail ? "failing" : "turning off execute"));
2506 		cur_protection &= ~VM_PROT_EXECUTE;
2507 		if (vm_protect_wx_fail) {
2508 			return KERN_PROTECTION_FAILURE;
2509 		}
2510 	}
2511 
2512 	/*
2513 	 * If the task has requested executable lockdown,
2514 	 * deny any new executable mapping.
2515 	 */
2516 	if (map->map_disallow_new_exec == TRUE) {
2517 		if (cur_protection & VM_PROT_EXECUTE) {
2518 			return KERN_PROTECTION_FAILURE;
2519 		}
2520 	}
2521 
2522 	if (resilient_codesign) {
2523 		assert(!is_submap);
2524 		int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2525 		if ((cur_protection | max_protection) & reject_prot) {
2526 			return KERN_PROTECTION_FAILURE;
2527 		}
2528 	}
2529 
2530 	if (resilient_media) {
2531 		assert(!is_submap);
2532 //		assert(!needs_copy);
2533 		if (object != VM_OBJECT_NULL &&
2534 		    !object->internal) {
2535 			/*
2536 			 * This mapping is directly backed by an external
2537 			 * memory manager (e.g. a vnode pager for a file):
2538 			 * we would not have any safe place to inject
2539 			 * a zero-filled page if an actual page is not
2540 			 * available, without possibly impacting the actual
2541 			 * contents of the mapped object (e.g. the file),
2542 			 * so we can't provide any media resiliency here.
2543 			 */
2544 			return KERN_INVALID_ARGUMENT;
2545 		}
2546 	}
2547 
2548 	if (is_submap) {
2549 		vm_map_t submap;
2550 		if (purgable) {
2551 			/* submaps can not be purgeable */
2552 			return KERN_INVALID_ARGUMENT;
2553 		}
2554 		if (object == VM_OBJECT_NULL) {
2555 			/* submaps can not be created lazily */
2556 			return KERN_INVALID_ARGUMENT;
2557 		}
2558 		submap = (vm_map_t) object;
2559 		if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
2560 			/* page size mismatch */
2561 			return KERN_INVALID_ARGUMENT;
2562 		}
2563 	}
2564 	if (vmk_flags.vmkf_already) {
2565 		/*
2566 		 * VM_FLAGS_ALREADY says that it's OK if the same mapping
2567 		 * is already present.  For it to be meaningul, the requested
2568 		 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
2569 		 * we shouldn't try and remove what was mapped there first
2570 		 * (!VM_FLAGS_OVERWRITE).
2571 		 */
2572 		if ((flags & VM_FLAGS_ANYWHERE) ||
2573 		    (flags & VM_FLAGS_OVERWRITE)) {
2574 			return KERN_INVALID_ARGUMENT;
2575 		}
2576 	}
2577 
2578 	if (size == 0 ||
2579 	    (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
2580 		*address = 0;
2581 		return KERN_INVALID_ARGUMENT;
2582 	}
2583 
2584 	if (map->pmap == kernel_pmap) {
2585 		user_alias = VM_KERN_MEMORY_NONE;
2586 	} else {
2587 		user_alias = alias;
2588 	}
2589 
2590 	if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
2591 		chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
2592 	}
2593 
2594 #define RETURN(value)   { result = value; goto BailOut; }
2595 
2596 	assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
2597 	assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
2598 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
2599 		assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
2600 		assertf(page_aligned(size), "0x%llx", (uint64_t)size);
2601 	}
2602 
2603 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2604 	    !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
2605 		/*
2606 		 * In most cases, the caller rounds the size up to the
2607 		 * map's page size.
2608 		 * If we get a size that is explicitly not map-aligned here,
2609 		 * we'll have to respect the caller's wish and mark the
2610 		 * mapping as "not map-aligned" to avoid tripping the
2611 		 * map alignment checks later.
2612 		 */
2613 		clear_map_aligned = TRUE;
2614 	}
2615 	if (!anywhere &&
2616 	    VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2617 	    !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
2618 		/*
2619 		 * We've been asked to map at a fixed address and that
2620 		 * address is not aligned to the map's specific alignment.
2621 		 * The caller should know what it's doing (i.e. most likely
2622 		 * mapping some fragmented copy map, transferring memory from
2623 		 * a VM map with a different alignment), so clear map_aligned
2624 		 * for this new VM map entry and proceed.
2625 		 */
2626 		clear_map_aligned = TRUE;
2627 	}
2628 
2629 	/*
2630 	 * Only zero-fill objects are allowed to be purgable.
2631 	 * LP64todo - limit purgable objects to 32-bits for now
2632 	 */
2633 	if (purgable &&
2634 	    (offset != 0 ||
2635 	    (object != VM_OBJECT_NULL &&
2636 	    (object->vo_size != size ||
2637 	    object->purgable == VM_PURGABLE_DENY))
2638 #if __LP64__
2639 	    || size > ANON_MAX_SIZE
2640 #endif
2641 	    )) {
2642 		return KERN_INVALID_ARGUMENT;
2643 	}
2644 
2645 	start = *address;
2646 
2647 	if (anywhere) {
2648 		vm_map_lock(map);
2649 		map_locked = TRUE;
2650 
2651 		if (flags & VM_FLAGS_RANDOM_ADDR) {
2652 			vmk_flags.vmkf_random_address = true;
2653 		}
2654 
2655 		result = vm_map_locate_space(map, size, mask, vmk_flags,
2656 		    &start, &entry);
2657 		if (result != KERN_SUCCESS) {
2658 			goto BailOut;
2659 		}
2660 
2661 		*address = start;
2662 		end = start + size;
2663 		assert(VM_MAP_PAGE_ALIGNED(*address,
2664 		    VM_MAP_PAGE_MASK(map)));
2665 	} else {
2666 		vm_map_offset_t effective_min_offset, effective_max_offset;
2667 
2668 		effective_min_offset = map->min_offset;
2669 		effective_max_offset = map->max_offset;
2670 
2671 		if (vmk_flags.vmkf_beyond_max) {
2672 			/*
2673 			 * Allow an insertion beyond the map's max offset.
2674 			 */
2675 			effective_max_offset = 0x00000000FFFFF000ULL;
2676 			if (vm_map_is_64bit(map)) {
2677 				effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2678 			}
2679 #if XNU_TARGET_OS_OSX
2680 		} else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2681 			effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2682 #endif /* XNU_TARGET_OS_OSX */
2683 		}
2684 
2685 		if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2686 		    !overwrite &&
2687 		    user_alias == VM_MEMORY_REALLOC) {
2688 			/*
2689 			 * Force realloc() to switch to a new allocation,
2690 			 * to prevent 4k-fragmented virtual ranges.
2691 			 */
2692 //			DEBUG4K_ERROR("no realloc in place");
2693 			return KERN_NO_SPACE;
2694 		}
2695 
2696 		/*
2697 		 *	Verify that:
2698 		 *		the address doesn't itself violate
2699 		 *		the mask requirement.
2700 		 */
2701 
2702 		vm_map_lock(map);
2703 		map_locked = TRUE;
2704 		if ((start & mask) != 0) {
2705 			RETURN(KERN_NO_SPACE);
2706 		}
2707 
2708 		/*
2709 		 *	...	the address is within bounds
2710 		 */
2711 
2712 		end = start + size;
2713 
2714 		if ((start < effective_min_offset) ||
2715 		    (end > effective_max_offset) ||
2716 		    (start >= end)) {
2717 			RETURN(KERN_INVALID_ADDRESS);
2718 		}
2719 
2720 		if (overwrite) {
2721 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
2722 			kern_return_t remove_kr;
2723 
2724 			/*
2725 			 * Fixed mapping and "overwrite" flag: attempt to
2726 			 * remove all existing mappings in the specified
2727 			 * address range, saving them in our "zap_old_list".
2728 			 *
2729 			 * This avoids releasing the VM map lock in
2730 			 * vm_map_entry_delete() and allows atomicity
2731 			 * when we want to replace some mappings with a new one.
2732 			 * It also allows us to restore the old VM mappings if the
2733 			 * new mapping fails.
2734 			 */
2735 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2736 
2737 			if (vmk_flags.vmkf_overwrite_immutable) {
2738 				/* we can overwrite immutable mappings */
2739 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2740 			}
2741 			if (vmk_flags.vmkf_remap_prot_copy) {
2742 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2743 			}
2744 			remove_kr = vm_map_delete(map, start, end, remove_flags,
2745 			    KMEM_GUARD_NONE, &zap_old_list).kmr_return;
2746 			if (remove_kr) {
2747 				/* XXX FBDP restore zap_old_list? */
2748 				RETURN(remove_kr);
2749 			}
2750 		}
2751 
2752 		/*
2753 		 *	...	the starting address isn't allocated
2754 		 */
2755 
2756 		if (vm_map_lookup_entry(map, start, &entry)) {
2757 			if (!(vmk_flags.vmkf_already)) {
2758 				RETURN(KERN_NO_SPACE);
2759 			}
2760 			/*
2761 			 * Check if what's already there is what we want.
2762 			 */
2763 			tmp_start = start;
2764 			tmp_offset = offset;
2765 			if (entry->vme_start < start) {
2766 				tmp_start -= start - entry->vme_start;
2767 				tmp_offset -= start - entry->vme_start;
2768 			}
2769 			for (; entry->vme_start < end;
2770 			    entry = entry->vme_next) {
2771 				/*
2772 				 * Check if the mapping's attributes
2773 				 * match the existing map entry.
2774 				 */
2775 				if (entry == vm_map_to_entry(map) ||
2776 				    entry->vme_start != tmp_start ||
2777 				    entry->is_sub_map != is_submap ||
2778 				    VME_OFFSET(entry) != tmp_offset ||
2779 				    entry->needs_copy != needs_copy ||
2780 				    entry->protection != cur_protection ||
2781 				    entry->max_protection != max_protection ||
2782 				    entry->inheritance != inheritance ||
2783 				    entry->iokit_acct != iokit_acct ||
2784 				    VME_ALIAS(entry) != alias) {
2785 					/* not the same mapping ! */
2786 					RETURN(KERN_NO_SPACE);
2787 				}
2788 				/*
2789 				 * Check if the same object is being mapped.
2790 				 */
2791 				if (is_submap) {
2792 					if (VME_SUBMAP(entry) !=
2793 					    (vm_map_t) object) {
2794 						/* not the same submap */
2795 						RETURN(KERN_NO_SPACE);
2796 					}
2797 				} else {
2798 					if (VME_OBJECT(entry) != object) {
2799 						/* not the same VM object... */
2800 						vm_object_t obj2;
2801 
2802 						obj2 = VME_OBJECT(entry);
2803 						if ((obj2 == VM_OBJECT_NULL ||
2804 						    obj2->internal) &&
2805 						    (object == VM_OBJECT_NULL ||
2806 						    object->internal)) {
2807 							/*
2808 							 * ... but both are
2809 							 * anonymous memory,
2810 							 * so equivalent.
2811 							 */
2812 						} else {
2813 							RETURN(KERN_NO_SPACE);
2814 						}
2815 					}
2816 				}
2817 
2818 				tmp_offset += entry->vme_end - entry->vme_start;
2819 				tmp_start += entry->vme_end - entry->vme_start;
2820 				if (entry->vme_end >= end) {
2821 					/* reached the end of our mapping */
2822 					break;
2823 				}
2824 			}
2825 			/* it all matches:  let's use what's already there ! */
2826 			RETURN(KERN_MEMORY_PRESENT);
2827 		}
2828 
2829 		/*
2830 		 *	...	the next region doesn't overlap the
2831 		 *		end point.
2832 		 */
2833 
2834 		if ((entry->vme_next != vm_map_to_entry(map)) &&
2835 		    (entry->vme_next->vme_start < end)) {
2836 			RETURN(KERN_NO_SPACE);
2837 		}
2838 	}
2839 
2840 	/*
2841 	 *	At this point,
2842 	 *		"start" and "end" should define the endpoints of the
2843 	 *			available new range, and
2844 	 *		"entry" should refer to the region before the new
2845 	 *			range, and
2846 	 *
2847 	 *		the map should be locked.
2848 	 */
2849 
2850 	/*
2851 	 *	See whether we can avoid creating a new entry (and object) by
2852 	 *	extending one of our neighbors.  [So far, we only attempt to
2853 	 *	extend from below.]  Note that we can never extend/join
2854 	 *	purgable objects because they need to remain distinct
2855 	 *	entities in order to implement their "volatile object"
2856 	 *	semantics.
2857 	 */
2858 
2859 	if (purgable ||
2860 	    entry_for_jit ||
2861 	    entry_for_tpro ||
2862 	    vm_memory_malloc_no_cow(user_alias)) {
2863 		if (object == VM_OBJECT_NULL) {
2864 			object = vm_object_allocate(size);
2865 			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2866 			object->true_share = FALSE;
2867 			if (purgable) {
2868 				task_t owner;
2869 				object->purgable = VM_PURGABLE_NONVOLATILE;
2870 				if (map->pmap == kernel_pmap) {
2871 					/*
2872 					 * Purgeable mappings made in a kernel
2873 					 * map are "owned" by the kernel itself
2874 					 * rather than the current user task
2875 					 * because they're likely to be used by
2876 					 * more than this user task (see
2877 					 * execargs_purgeable_allocate(), for
2878 					 * example).
2879 					 */
2880 					owner = kernel_task;
2881 				} else {
2882 					owner = current_task();
2883 				}
2884 				assert(object->vo_owner == NULL);
2885 				assert(object->resident_page_count == 0);
2886 				assert(object->wired_page_count == 0);
2887 				vm_object_lock(object);
2888 				vm_purgeable_nonvolatile_enqueue(object, owner);
2889 				vm_object_unlock(object);
2890 			}
2891 			offset = (vm_object_offset_t)0;
2892 		}
2893 	} else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
2894 		/* no coalescing if address space uses sub-pages */
2895 	} else if ((is_submap == FALSE) &&
2896 	    (object == VM_OBJECT_NULL) &&
2897 	    (entry != vm_map_to_entry(map)) &&
2898 	    (entry->vme_end == start) &&
2899 	    (!entry->is_shared) &&
2900 	    (!entry->is_sub_map) &&
2901 	    (!entry->in_transition) &&
2902 	    (!entry->needs_wakeup) &&
2903 	    (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
2904 	    (entry->protection == cur_protection) &&
2905 	    (entry->max_protection == max_protection) &&
2906 	    (entry->inheritance == inheritance) &&
2907 	    ((user_alias == VM_MEMORY_REALLOC) ||
2908 	    (VME_ALIAS(entry) == alias)) &&
2909 	    (entry->no_cache == no_cache) &&
2910 	    (entry->vme_permanent == permanent) &&
2911 	    /* no coalescing for immutable executable mappings */
2912 	    !((entry->protection & VM_PROT_EXECUTE) &&
2913 	    entry->vme_permanent) &&
2914 	    (!entry->superpage_size && !superpage_size) &&
2915 	    /*
2916 	     * No coalescing if not map-aligned, to avoid propagating
2917 	     * that condition any further than needed:
2918 	     */
2919 	    (!entry->map_aligned || !clear_map_aligned) &&
2920 	    (!entry->zero_wired_pages) &&
2921 	    (!entry->used_for_jit && !entry_for_jit) &&
2922 	    (!entry->pmap_cs_associated) &&
2923 	    (entry->iokit_acct == iokit_acct) &&
2924 	    (!entry->vme_resilient_codesign) &&
2925 	    (!entry->vme_resilient_media) &&
2926 	    (!entry->vme_atomic) &&
2927 	    (entry->vme_no_copy_on_read == no_copy_on_read) &&
2928 
2929 	    ((entry->vme_end - entry->vme_start) + size <=
2930 	    (user_alias == VM_MEMORY_REALLOC ?
2931 	    ANON_CHUNK_SIZE :
2932 	    NO_COALESCE_LIMIT)) &&
2933 
2934 	    (entry->wired_count == 0)) {        /* implies user_wired_count == 0 */
2935 		if (vm_object_coalesce(VME_OBJECT(entry),
2936 		    VM_OBJECT_NULL,
2937 		    VME_OFFSET(entry),
2938 		    (vm_object_offset_t) 0,
2939 		    (vm_map_size_t)(entry->vme_end - entry->vme_start),
2940 		    (vm_map_size_t)(end - entry->vme_end))) {
2941 			/*
2942 			 *	Coalesced the two objects - can extend
2943 			 *	the previous map entry to include the
2944 			 *	new range.
2945 			 */
2946 			map->size += (end - entry->vme_end);
2947 			assert(entry->vme_start < end);
2948 			assert(VM_MAP_PAGE_ALIGNED(end,
2949 			    VM_MAP_PAGE_MASK(map)));
2950 			if (__improbable(vm_debug_events)) {
2951 				DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
2952 			}
2953 			entry->vme_end = end;
2954 			if (map->holelistenabled) {
2955 				vm_map_store_update_first_free(map, entry, TRUE);
2956 			} else {
2957 				vm_map_store_update_first_free(map, map->first_free, TRUE);
2958 			}
2959 			new_mapping_established = TRUE;
2960 			RETURN(KERN_SUCCESS);
2961 		}
2962 	}
2963 
2964 	step = superpage_size ? SUPERPAGE_SIZE : (end - start);
2965 	new_entry = NULL;
2966 
2967 	if (vmk_flags.vmkf_submap_adjust) {
2968 		vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
2969 		offset = start;
2970 	}
2971 
2972 	for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
2973 		tmp2_end = tmp2_start + step;
2974 		/*
2975 		 *	Create a new entry
2976 		 *
2977 		 * XXX FBDP
2978 		 * The reserved "page zero" in each process's address space can
2979 		 * be arbitrarily large.  Splitting it into separate objects and
2980 		 * therefore different VM map entries serves no purpose and just
2981 		 * slows down operations on the VM map, so let's not split the
2982 		 * allocation into chunks if the max protection is NONE.  That
2983 		 * memory should never be accessible, so it will never get to the
2984 		 * default pager.
2985 		 */
2986 		tmp_start = tmp2_start;
2987 		if (!is_submap &&
2988 		    object == VM_OBJECT_NULL &&
2989 		    size > chunk_size &&
2990 		    max_protection != VM_PROT_NONE &&
2991 		    superpage_size == 0) {
2992 			tmp_end = tmp_start + chunk_size;
2993 		} else {
2994 			tmp_end = tmp2_end;
2995 		}
2996 		do {
2997 			if (!is_submap &&
2998 			    object != VM_OBJECT_NULL &&
2999 			    object->internal &&
3000 			    offset + (tmp_end - tmp_start) > object->vo_size) {
3001 //				printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3002 				DTRACE_VM5(vm_map_enter_overmap,
3003 				    vm_map_t, map,
3004 				    vm_map_address_t, tmp_start,
3005 				    vm_map_address_t, tmp_end,
3006 				    vm_object_offset_t, offset,
3007 				    vm_object_size_t, object->vo_size);
3008 			}
3009 			new_entry = vm_map_entry_insert(map,
3010 			    entry, tmp_start, tmp_end,
3011 			    object, offset, vmk_flags,
3012 			    needs_copy,
3013 			    cur_protection, max_protection,
3014 			    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3015 			    VM_INHERIT_NONE : inheritance),
3016 			    no_cache,
3017 			    permanent,
3018 			    superpage_size,
3019 			    clear_map_aligned,
3020 			    alias);
3021 
3022 			assert((object != kernel_object) || (VM_KERN_MEMORY_NONE != alias));
3023 
3024 			if (resilient_codesign) {
3025 				int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3026 				if (!((cur_protection | max_protection) & reject_prot)) {
3027 					new_entry->vme_resilient_codesign = TRUE;
3028 				}
3029 			}
3030 
3031 			if (resilient_media &&
3032 			    (object == VM_OBJECT_NULL ||
3033 			    object->internal)) {
3034 				new_entry->vme_resilient_media = TRUE;
3035 			}
3036 
3037 			assert(!new_entry->iokit_acct);
3038 			if (!is_submap &&
3039 			    object != VM_OBJECT_NULL &&
3040 			    (object->purgable != VM_PURGABLE_DENY ||
3041 			    object->vo_ledger_tag)) {
3042 				assert(new_entry->use_pmap);
3043 				assert(!new_entry->iokit_acct);
3044 				/*
3045 				 * Turn off pmap accounting since
3046 				 * purgeable (or tagged) objects have their
3047 				 * own ledgers.
3048 				 */
3049 				new_entry->use_pmap = FALSE;
3050 			} else if (!is_submap &&
3051 			    iokit_acct &&
3052 			    object != VM_OBJECT_NULL &&
3053 			    object->internal) {
3054 				/* alternate accounting */
3055 				assert(!new_entry->iokit_acct);
3056 				assert(new_entry->use_pmap);
3057 				new_entry->iokit_acct = TRUE;
3058 				new_entry->use_pmap = FALSE;
3059 				DTRACE_VM4(
3060 					vm_map_iokit_mapped_region,
3061 					vm_map_t, map,
3062 					vm_map_offset_t, new_entry->vme_start,
3063 					vm_map_offset_t, new_entry->vme_end,
3064 					int, VME_ALIAS(new_entry));
3065 				vm_map_iokit_mapped_region(
3066 					map,
3067 					(new_entry->vme_end -
3068 					new_entry->vme_start));
3069 			} else if (!is_submap) {
3070 				assert(!new_entry->iokit_acct);
3071 				assert(new_entry->use_pmap);
3072 			}
3073 
3074 			if (is_submap) {
3075 				vm_map_t        submap;
3076 				boolean_t       submap_is_64bit;
3077 				boolean_t       use_pmap;
3078 
3079 				assert(new_entry->is_sub_map);
3080 				assert(!new_entry->use_pmap);
3081 				assert(!new_entry->iokit_acct);
3082 				submap = (vm_map_t) object;
3083 				submap_is_64bit = vm_map_is_64bit(submap);
3084 				use_pmap = vmk_flags.vmkf_nested_pmap;
3085 #ifndef NO_NESTED_PMAP
3086 				if (use_pmap && submap->pmap == NULL) {
3087 					ledger_t ledger = map->pmap->ledger;
3088 					/* we need a sub pmap to nest... */
3089 					submap->pmap = pmap_create_options(ledger, 0,
3090 					    submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3091 					if (submap->pmap == NULL) {
3092 						/* let's proceed without nesting... */
3093 					}
3094 #if defined(__arm64__)
3095 					else {
3096 						pmap_set_nested(submap->pmap);
3097 					}
3098 #endif
3099 				}
3100 				if (use_pmap && submap->pmap != NULL) {
3101 					if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3102 						DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3103 						kr = KERN_FAILURE;
3104 					} else {
3105 						kr = pmap_nest(map->pmap,
3106 						    submap->pmap,
3107 						    tmp_start,
3108 						    tmp_end - tmp_start);
3109 					}
3110 					if (kr != KERN_SUCCESS) {
3111 						printf("vm_map_enter: "
3112 						    "pmap_nest(0x%llx,0x%llx) "
3113 						    "error 0x%x\n",
3114 						    (long long)tmp_start,
3115 						    (long long)tmp_end,
3116 						    kr);
3117 					} else {
3118 						/* we're now nested ! */
3119 						new_entry->use_pmap = TRUE;
3120 						pmap_empty = FALSE;
3121 					}
3122 				}
3123 #endif /* NO_NESTED_PMAP */
3124 			}
3125 			entry = new_entry;
3126 
3127 			if (superpage_size) {
3128 				vm_page_t pages, m;
3129 				vm_object_t sp_object;
3130 				vm_object_offset_t sp_offset;
3131 
3132 				VME_OFFSET_SET(entry, 0);
3133 
3134 				/* allocate one superpage */
3135 				kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3136 				if (kr != KERN_SUCCESS) {
3137 					/* deallocate whole range... */
3138 					new_mapping_established = TRUE;
3139 					/* ... but only up to "tmp_end" */
3140 					size -= end - tmp_end;
3141 					RETURN(kr);
3142 				}
3143 
3144 				/* create one vm_object per superpage */
3145 				sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3146 				sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3147 				sp_object->phys_contiguous = TRUE;
3148 				sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3149 				VME_OBJECT_SET(entry, sp_object, false, 0);
3150 				assert(entry->use_pmap);
3151 
3152 				/* enter the base pages into the object */
3153 				vm_object_lock(sp_object);
3154 				for (sp_offset = 0;
3155 				    sp_offset < SUPERPAGE_SIZE;
3156 				    sp_offset += PAGE_SIZE) {
3157 					m = pages;
3158 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3159 					pages = NEXT_PAGE(m);
3160 					*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3161 					vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3162 				}
3163 				vm_object_unlock(sp_object);
3164 			}
3165 		} while (tmp_end != tmp2_end &&
3166 		    (tmp_start = tmp_end) &&
3167 		    (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3168 		    tmp_end + chunk_size : tmp2_end));
3169 	}
3170 
3171 	new_mapping_established = TRUE;
3172 
3173 BailOut:
3174 	assert(map_locked == TRUE);
3175 
3176 	/*
3177 	 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3178 	 * If we have identified and possibly established the new mapping(s),
3179 	 * make sure we did not go beyond the address space limit.
3180 	 */
3181 	if (result == KERN_SUCCESS) {
3182 		if (map->size_limit != RLIM_INFINITY &&
3183 		    map->size > map->size_limit) {
3184 			/*
3185 			 * Establishing the requested mappings would exceed
3186 			 * the process's RLIMIT_AS limit: fail with
3187 			 * KERN_NO_SPACE.
3188 			 */
3189 			result = KERN_NO_SPACE;
3190 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3191 			    proc_selfpid(),
3192 			    (get_bsdtask_info(current_task())
3193 			    ? proc_name_address(get_bsdtask_info(current_task()))
3194 			    : "?"),
3195 			    __FUNCTION__,
3196 			    (uint64_t) map->size,
3197 			    (uint64_t) map->size_limit);
3198 			DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3199 			    vm_map_size_t, map->size,
3200 			    uint64_t, map->size_limit);
3201 			vm_map_enter_RLIMIT_AS_count++;
3202 		} else if (map->data_limit != RLIM_INFINITY &&
3203 		    map->size > map->data_limit) {
3204 			/*
3205 			 * Establishing the requested mappings would exceed
3206 			 * the process's RLIMIT_DATA limit: fail with
3207 			 * KERN_NO_SPACE.
3208 			 */
3209 			result = KERN_NO_SPACE;
3210 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3211 			    proc_selfpid(),
3212 			    (get_bsdtask_info(current_task())
3213 			    ? proc_name_address(get_bsdtask_info(current_task()))
3214 			    : "?"),
3215 			    __FUNCTION__,
3216 			    (uint64_t) map->size,
3217 			    (uint64_t) map->data_limit);
3218 			DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3219 			    vm_map_size_t, map->size,
3220 			    uint64_t, map->data_limit);
3221 			vm_map_enter_RLIMIT_DATA_count++;
3222 		}
3223 	}
3224 
3225 	if (result == KERN_SUCCESS) {
3226 		vm_prot_t pager_prot;
3227 		memory_object_t pager;
3228 
3229 #if DEBUG
3230 		if (pmap_empty &&
3231 		    !(vmk_flags.vmkf_no_pmap_check)) {
3232 			assert(pmap_is_empty(map->pmap,
3233 			    *address,
3234 			    *address + size));
3235 		}
3236 #endif /* DEBUG */
3237 
3238 		/*
3239 		 * For "named" VM objects, let the pager know that the
3240 		 * memory object is being mapped.  Some pagers need to keep
3241 		 * track of this, to know when they can reclaim the memory
3242 		 * object, for example.
3243 		 * VM calls memory_object_map() for each mapping (specifying
3244 		 * the protection of each mapping) and calls
3245 		 * memory_object_last_unmap() when all the mappings are gone.
3246 		 */
3247 		pager_prot = max_protection;
3248 		if (needs_copy) {
3249 			/*
3250 			 * Copy-On-Write mapping: won't modify
3251 			 * the memory object.
3252 			 */
3253 			pager_prot &= ~VM_PROT_WRITE;
3254 		}
3255 		if (!is_submap &&
3256 		    object != VM_OBJECT_NULL &&
3257 		    object->named &&
3258 		    object->pager != MEMORY_OBJECT_NULL) {
3259 			vm_object_lock(object);
3260 			pager = object->pager;
3261 			if (object->named &&
3262 			    pager != MEMORY_OBJECT_NULL) {
3263 				assert(object->pager_ready);
3264 				vm_object_mapping_wait(object, THREAD_UNINT);
3265 				vm_object_mapping_begin(object);
3266 				vm_object_unlock(object);
3267 
3268 				kr = memory_object_map(pager, pager_prot);
3269 				assert(kr == KERN_SUCCESS);
3270 
3271 				vm_object_lock(object);
3272 				vm_object_mapping_end(object);
3273 			}
3274 			vm_object_unlock(object);
3275 		}
3276 	}
3277 
3278 	assert(map_locked == TRUE);
3279 
3280 	if (new_mapping_established) {
3281 		/*
3282 		 * If we release the map lock for any reason below,
3283 		 * another thread could deallocate our new mapping,
3284 		 * releasing the caller's reference on "caller_object",
3285 		 * which was transferred to the mapping.
3286 		 * If this was the only reference, the object could be
3287 		 * destroyed.
3288 		 *
3289 		 * We need to take an extra reference on "caller_object"
3290 		 * to keep it alive if we need to return the caller's
3291 		 * reference to the caller in case of failure.
3292 		 */
3293 		if (is_submap) {
3294 			vm_map_reference((vm_map_t)caller_object);
3295 		} else {
3296 			vm_object_reference(caller_object);
3297 		}
3298 	}
3299 
3300 	if (!keep_map_locked) {
3301 		vm_map_unlock(map);
3302 		map_locked = FALSE;
3303 		entry = VM_MAP_ENTRY_NULL;
3304 		new_entry = VM_MAP_ENTRY_NULL;
3305 	}
3306 
3307 	/*
3308 	 * We can't hold the map lock if we enter this block.
3309 	 */
3310 
3311 	if (result == KERN_SUCCESS) {
3312 		/*	Wire down the new entry if the user
3313 		 *	requested all new map entries be wired.
3314 		 */
3315 		if ((map->wiring_required) || (superpage_size)) {
3316 			assert(!keep_map_locked);
3317 			pmap_empty = FALSE; /* pmap won't be empty */
3318 			kr = vm_map_wire_kernel(map, start, end,
3319 			    cur_protection, VM_KERN_MEMORY_MLOCK,
3320 			    TRUE);
3321 			result = kr;
3322 		}
3323 
3324 	}
3325 
3326 	if (result != KERN_SUCCESS) {
3327 		if (new_mapping_established) {
3328 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3329 
3330 			/*
3331 			 * We have to get rid of the new mappings since we
3332 			 * won't make them available to the user.
3333 			 * Try and do that atomically, to minimize the risk
3334 			 * that someone else create new mappings that range.
3335 			 */
3336 			if (!map_locked) {
3337 				vm_map_lock(map);
3338 				map_locked = TRUE;
3339 			}
3340 			remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3341 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3342 			if (permanent) {
3343 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3344 			}
3345 			(void) vm_map_delete(map,
3346 			    *address, *address + size,
3347 			    remove_flags,
3348 			    KMEM_GUARD_NONE, &zap_new_list);
3349 		}
3350 
3351 		if (vm_map_zap_first_entry(&zap_old_list)) {
3352 			vm_map_entry_t entry1, entry2;
3353 
3354 			/*
3355 			 * The new mapping failed.  Attempt to restore
3356 			 * the old mappings, saved in the "zap_old_map".
3357 			 */
3358 			if (!map_locked) {
3359 				vm_map_lock(map);
3360 				map_locked = TRUE;
3361 			}
3362 
3363 			/* first check if the coast is still clear */
3364 			start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3365 			end   = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3366 
3367 			if (vm_map_lookup_entry(map, start, &entry1) ||
3368 			    vm_map_lookup_entry(map, end, &entry2) ||
3369 			    entry1 != entry2) {
3370 				/*
3371 				 * Part of that range has already been
3372 				 * re-mapped:  we can't restore the old
3373 				 * mappings...
3374 				 */
3375 				vm_map_enter_restore_failures++;
3376 			} else {
3377 				/*
3378 				 * Transfer the saved map entries from
3379 				 * "zap_old_map" to the original "map",
3380 				 * inserting them all after "entry1".
3381 				 */
3382 				while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3383 					vm_map_size_t entry_size;
3384 
3385 					entry_size = (entry2->vme_end -
3386 					    entry2->vme_start);
3387 					vm_map_store_entry_link(map, entry1, entry2,
3388 					    VM_MAP_KERNEL_FLAGS_NONE);
3389 					map->size += entry_size;
3390 					entry1 = entry2;
3391 				}
3392 				if (map->wiring_required) {
3393 					/*
3394 					 * XXX TODO: we should rewire the
3395 					 * old pages here...
3396 					 */
3397 				}
3398 				vm_map_enter_restore_successes++;
3399 			}
3400 		}
3401 	}
3402 
3403 	/*
3404 	 * The caller is responsible for releasing the lock if it requested to
3405 	 * keep the map locked.
3406 	 */
3407 	if (map_locked && !keep_map_locked) {
3408 		vm_map_unlock(map);
3409 	}
3410 
3411 	vm_map_zap_dispose(&zap_old_list);
3412 	vm_map_zap_dispose(&zap_new_list);
3413 
3414 	if (new_mapping_established) {
3415 		/*
3416 		 * The caller had a reference on "caller_object" and we
3417 		 * transferred that reference to the mapping.
3418 		 * We also took an extra reference on "caller_object" to keep
3419 		 * it alive while the map was unlocked.
3420 		 */
3421 		if (result == KERN_SUCCESS) {
3422 			/*
3423 			 * On success, the caller's reference on the object gets
3424 			 * transferred to the mapping.
3425 			 * Release our extra reference.
3426 			 */
3427 			if (is_submap) {
3428 				vm_map_deallocate((vm_map_t)caller_object);
3429 			} else {
3430 				vm_object_deallocate(caller_object);
3431 			}
3432 		} else {
3433 			/*
3434 			 * On error, the caller expects to still have a
3435 			 * reference on the object it gave us.
3436 			 * Let's use our extra reference for that.
3437 			 */
3438 		}
3439 	}
3440 
3441 	return result;
3442 
3443 #undef  RETURN
3444 }
3445 
3446 #if __arm64__
3447 extern const struct memory_object_pager_ops fourk_pager_ops;
3448 kern_return_t
vm_map_enter_fourk(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t alias,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)3449 vm_map_enter_fourk(
3450 	vm_map_t                map,
3451 	vm_map_offset_t         *address,       /* IN/OUT */
3452 	vm_map_size_t           size,
3453 	vm_map_offset_t         mask,
3454 	int                     flags,
3455 	vm_map_kernel_flags_t   vmk_flags,
3456 	vm_tag_t                alias,
3457 	vm_object_t             object,
3458 	vm_object_offset_t      offset,
3459 	boolean_t               needs_copy,
3460 	vm_prot_t               cur_protection,
3461 	vm_prot_t               max_protection,
3462 	vm_inherit_t            inheritance)
3463 {
3464 	vm_map_entry_t          entry, new_entry;
3465 	vm_map_offset_t         start, fourk_start;
3466 	vm_map_offset_t         end, fourk_end;
3467 	vm_map_size_t           fourk_size;
3468 	kern_return_t           result = KERN_SUCCESS;
3469 	boolean_t               map_locked = FALSE;
3470 	boolean_t               pmap_empty = TRUE;
3471 	boolean_t               new_mapping_established = FALSE;
3472 	boolean_t               keep_map_locked = vmk_flags.vmkf_keep_map_locked;
3473 	boolean_t               anywhere = ((flags & VM_FLAGS_ANYWHERE) != 0);
3474 	boolean_t               purgable = ((flags & VM_FLAGS_PURGABLE) != 0);
3475 	boolean_t               overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0);
3476 	boolean_t               no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0);
3477 	const boolean_t         is_submap = vmk_flags.vmkf_submap;
3478 	boolean_t               permanent = (((flags & VM_FLAGS_PERMANENT) != 0) || vmk_flags.vmkf_permanent);
3479 	const boolean_t         entry_for_jit = vmk_flags.vmkf_map_jit;
3480 //	boolean_t		iokit_acct = vmk_flags.vmkf_iokit_acct;
3481 	unsigned int            superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT);
3482 	vm_map_offset_t         effective_min_offset, effective_max_offset;
3483 	kern_return_t           kr;
3484 	boolean_t               clear_map_aligned = FALSE;
3485 	memory_object_t         fourk_mem_obj;
3486 	vm_object_t             fourk_object;
3487 	vm_map_offset_t         fourk_pager_offset;
3488 	int                     fourk_pager_index_start, fourk_pager_index_num;
3489 	int                     cur_idx;
3490 	boolean_t               fourk_copy;
3491 	vm_object_t             copy_object;
3492 	vm_object_offset_t      copy_offset;
3493 	VM_MAP_ZAP_DECLARE(zap_list);
3494 
3495 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
3496 		panic("%s:%d", __FUNCTION__, __LINE__);
3497 	}
3498 	fourk_mem_obj = MEMORY_OBJECT_NULL;
3499 	fourk_object = VM_OBJECT_NULL;
3500 
3501 	if (superpage_size) {
3502 		return KERN_NOT_SUPPORTED;
3503 	}
3504 
3505 	if ((cur_protection & VM_PROT_WRITE) &&
3506 	    (cur_protection & VM_PROT_EXECUTE) &&
3507 #if XNU_TARGET_OS_OSX
3508 	    map->pmap != kernel_pmap &&
3509 	    (vm_map_cs_enforcement(map)
3510 #if __arm64__
3511 	    || !VM_MAP_IS_EXOTIC(map)
3512 #endif /* __arm64__ */
3513 	    ) &&
3514 #endif /* XNU_TARGET_OS_OSX */
3515 	    !entry_for_jit) {
3516 		DTRACE_VM3(cs_wx,
3517 		    uint64_t, 0,
3518 		    uint64_t, 0,
3519 		    vm_prot_t, cur_protection);
3520 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
3521 		    "turning off execute\n",
3522 		    proc_selfpid(),
3523 		    (get_bsdtask_info(current_task())
3524 		    ? proc_name_address(get_bsdtask_info(current_task()))
3525 		    : "?"),
3526 		    __FUNCTION__);
3527 		cur_protection &= ~VM_PROT_EXECUTE;
3528 	}
3529 
3530 	/*
3531 	 * If the task has requested executable lockdown,
3532 	 * deny any new executable mapping.
3533 	 */
3534 	if (map->map_disallow_new_exec == TRUE) {
3535 		if (cur_protection & VM_PROT_EXECUTE) {
3536 			return KERN_PROTECTION_FAILURE;
3537 		}
3538 	}
3539 
3540 	if (is_submap) {
3541 		return KERN_NOT_SUPPORTED;
3542 	}
3543 	if (vmk_flags.vmkf_already) {
3544 		return KERN_NOT_SUPPORTED;
3545 	}
3546 	if (purgable || entry_for_jit) {
3547 		return KERN_NOT_SUPPORTED;
3548 	}
3549 
3550 	effective_min_offset = map->min_offset;
3551 
3552 	if (vmk_flags.vmkf_beyond_max) {
3553 		return KERN_NOT_SUPPORTED;
3554 	} else {
3555 		effective_max_offset = map->max_offset;
3556 	}
3557 
3558 	if (size == 0 ||
3559 	    (offset & FOURK_PAGE_MASK) != 0) {
3560 		*address = 0;
3561 		return KERN_INVALID_ARGUMENT;
3562 	}
3563 
3564 #define RETURN(value)   { result = value; goto BailOut; }
3565 
3566 	assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK));
3567 	assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK));
3568 
3569 	if (!anywhere && overwrite) {
3570 		return KERN_NOT_SUPPORTED;
3571 	}
3572 
3573 	fourk_start = *address;
3574 	fourk_size = size;
3575 	fourk_end = fourk_start + fourk_size;
3576 
3577 	start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map));
3578 	end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map));
3579 	size = end - start;
3580 
3581 	if (anywhere) {
3582 		return KERN_NOT_SUPPORTED;
3583 	} else {
3584 		/*
3585 		 *	Verify that:
3586 		 *		the address doesn't itself violate
3587 		 *		the mask requirement.
3588 		 */
3589 
3590 		vm_map_lock(map);
3591 		map_locked = TRUE;
3592 		if ((start & mask) != 0) {
3593 			RETURN(KERN_NO_SPACE);
3594 		}
3595 
3596 		/*
3597 		 *	...	the address is within bounds
3598 		 */
3599 
3600 		end = start + size;
3601 
3602 		if ((start < effective_min_offset) ||
3603 		    (end > effective_max_offset) ||
3604 		    (start >= end)) {
3605 			RETURN(KERN_INVALID_ADDRESS);
3606 		}
3607 
3608 		/*
3609 		 *	...	the starting address isn't allocated
3610 		 */
3611 		if (vm_map_lookup_entry(map, start, &entry)) {
3612 			vm_object_t cur_object, shadow_object;
3613 
3614 			/*
3615 			 * We might already have some 4K mappings
3616 			 * in a 16K page here.
3617 			 */
3618 
3619 			if (entry->vme_end - entry->vme_start
3620 			    != SIXTEENK_PAGE_SIZE) {
3621 				RETURN(KERN_NO_SPACE);
3622 			}
3623 			if (entry->is_sub_map) {
3624 				RETURN(KERN_NO_SPACE);
3625 			}
3626 			if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
3627 				RETURN(KERN_NO_SPACE);
3628 			}
3629 
3630 			/* go all the way down the shadow chain */
3631 			cur_object = VME_OBJECT(entry);
3632 			vm_object_lock(cur_object);
3633 			while (cur_object->shadow != VM_OBJECT_NULL) {
3634 				shadow_object = cur_object->shadow;
3635 				vm_object_lock(shadow_object);
3636 				vm_object_unlock(cur_object);
3637 				cur_object = shadow_object;
3638 				shadow_object = VM_OBJECT_NULL;
3639 			}
3640 			if (cur_object->internal ||
3641 			    cur_object->pager == NULL) {
3642 				vm_object_unlock(cur_object);
3643 				RETURN(KERN_NO_SPACE);
3644 			}
3645 			if (cur_object->pager->mo_pager_ops
3646 			    != &fourk_pager_ops) {
3647 				vm_object_unlock(cur_object);
3648 				RETURN(KERN_NO_SPACE);
3649 			}
3650 			fourk_object = cur_object;
3651 			fourk_mem_obj = fourk_object->pager;
3652 
3653 			/* keep the "4K" object alive */
3654 			vm_object_reference_locked(fourk_object);
3655 			memory_object_reference(fourk_mem_obj);
3656 			vm_object_unlock(fourk_object);
3657 
3658 			/* merge permissions */
3659 			entry->protection |= cur_protection;
3660 			entry->max_protection |= max_protection;
3661 
3662 			if ((entry->protection & VM_PROT_WRITE) &&
3663 			    (entry->protection & VM_PROT_ALLEXEC) &&
3664 			    fourk_binary_compatibility_unsafe &&
3665 			    fourk_binary_compatibility_allow_wx) {
3666 				/* write+execute: need to be "jit" */
3667 				entry->used_for_jit = TRUE;
3668 			}
3669 			goto map_in_fourk_pager;
3670 		}
3671 
3672 		/*
3673 		 *	...	the next region doesn't overlap the
3674 		 *		end point.
3675 		 */
3676 
3677 		if ((entry->vme_next != vm_map_to_entry(map)) &&
3678 		    (entry->vme_next->vme_start < end)) {
3679 			RETURN(KERN_NO_SPACE);
3680 		}
3681 	}
3682 
3683 	/*
3684 	 *	At this point,
3685 	 *		"start" and "end" should define the endpoints of the
3686 	 *			available new range, and
3687 	 *		"entry" should refer to the region before the new
3688 	 *			range, and
3689 	 *
3690 	 *		the map should be locked.
3691 	 */
3692 
3693 	/* create a new "4K" pager */
3694 	fourk_mem_obj = fourk_pager_create();
3695 	fourk_object = fourk_pager_to_vm_object(fourk_mem_obj);
3696 	assert(fourk_object);
3697 
3698 	/* keep the "4K" object alive */
3699 	vm_object_reference(fourk_object);
3700 
3701 	/* create a "copy" object, to map the "4K" object copy-on-write */
3702 	fourk_copy = TRUE;
3703 	result = vm_object_copy_strategically(fourk_object,
3704 	    0,
3705 	    end - start,
3706 	    &copy_object,
3707 	    &copy_offset,
3708 	    &fourk_copy);
3709 	assert(result == KERN_SUCCESS);
3710 	assert(copy_object != VM_OBJECT_NULL);
3711 	assert(copy_offset == 0);
3712 
3713 	/* map the "4K" pager's copy object */
3714 	new_entry = vm_map_entry_insert(map,
3715 	    entry,
3716 	    vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)),
3717 	    vm_map_round_page(end, VM_MAP_PAGE_MASK(map)),
3718 	    copy_object,
3719 	    0,                      /* offset */
3720 	    vmk_flags,
3721 	    FALSE,                  /* needs_copy */
3722 	    cur_protection, max_protection,
3723 	    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3724 	    VM_INHERIT_NONE : inheritance),
3725 	    no_cache,
3726 	    permanent,
3727 	    superpage_size,
3728 	    clear_map_aligned,
3729 	    alias);
3730 	entry = new_entry;
3731 
3732 #if VM_MAP_DEBUG_FOURK
3733 	if (vm_map_debug_fourk) {
3734 		printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n",
3735 		    map,
3736 		    (uint64_t) entry->vme_start,
3737 		    (uint64_t) entry->vme_end,
3738 		    fourk_mem_obj);
3739 	}
3740 #endif /* VM_MAP_DEBUG_FOURK */
3741 
3742 	new_mapping_established = TRUE;
3743 
3744 map_in_fourk_pager:
3745 	/* "map" the original "object" where it belongs in the "4K" pager */
3746 	fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK);
3747 	fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE);
3748 	if (fourk_size > SIXTEENK_PAGE_SIZE) {
3749 		fourk_pager_index_num = 4;
3750 	} else {
3751 		fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE);
3752 	}
3753 	if (fourk_pager_index_start + fourk_pager_index_num > 4) {
3754 		fourk_pager_index_num = 4 - fourk_pager_index_start;
3755 	}
3756 	for (cur_idx = 0;
3757 	    cur_idx < fourk_pager_index_num;
3758 	    cur_idx++) {
3759 		vm_object_t             old_object;
3760 		vm_object_offset_t      old_offset;
3761 
3762 		kr = fourk_pager_populate(fourk_mem_obj,
3763 		    TRUE,                       /* overwrite */
3764 		    fourk_pager_index_start + cur_idx,
3765 		    object,
3766 		    (object
3767 		    ? (offset +
3768 		    (cur_idx * FOURK_PAGE_SIZE))
3769 		    : 0),
3770 		    &old_object,
3771 		    &old_offset);
3772 #if VM_MAP_DEBUG_FOURK
3773 		if (vm_map_debug_fourk) {
3774 			if (old_object == (vm_object_t) -1 &&
3775 			    old_offset == (vm_object_offset_t) -1) {
3776 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3777 				    "pager [%p:0x%llx] "
3778 				    "populate[%d] "
3779 				    "[object:%p,offset:0x%llx]\n",
3780 				    map,
3781 				    (uint64_t) entry->vme_start,
3782 				    (uint64_t) entry->vme_end,
3783 				    fourk_mem_obj,
3784 				    VME_OFFSET(entry),
3785 				    fourk_pager_index_start + cur_idx,
3786 				    object,
3787 				    (object
3788 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3789 				    : 0));
3790 			} else {
3791 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3792 				    "pager [%p:0x%llx] "
3793 				    "populate[%d] [object:%p,offset:0x%llx] "
3794 				    "old [%p:0x%llx]\n",
3795 				    map,
3796 				    (uint64_t) entry->vme_start,
3797 				    (uint64_t) entry->vme_end,
3798 				    fourk_mem_obj,
3799 				    VME_OFFSET(entry),
3800 				    fourk_pager_index_start + cur_idx,
3801 				    object,
3802 				    (object
3803 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3804 				    : 0),
3805 				    old_object,
3806 				    old_offset);
3807 			}
3808 		}
3809 #endif /* VM_MAP_DEBUG_FOURK */
3810 
3811 		assert(kr == KERN_SUCCESS);
3812 		if (object != old_object &&
3813 		    object != VM_OBJECT_NULL &&
3814 		    object != (vm_object_t) -1) {
3815 			vm_object_reference(object);
3816 		}
3817 		if (object != old_object &&
3818 		    old_object != VM_OBJECT_NULL &&
3819 		    old_object != (vm_object_t) -1) {
3820 			vm_object_deallocate(old_object);
3821 		}
3822 	}
3823 
3824 BailOut:
3825 	assert(map_locked == TRUE);
3826 
3827 	if (result == KERN_SUCCESS) {
3828 		vm_prot_t pager_prot;
3829 		memory_object_t pager;
3830 
3831 #if DEBUG
3832 		if (pmap_empty &&
3833 		    !(vmk_flags.vmkf_no_pmap_check)) {
3834 			assert(pmap_is_empty(map->pmap,
3835 			    *address,
3836 			    *address + size));
3837 		}
3838 #endif /* DEBUG */
3839 
3840 		/*
3841 		 * For "named" VM objects, let the pager know that the
3842 		 * memory object is being mapped.  Some pagers need to keep
3843 		 * track of this, to know when they can reclaim the memory
3844 		 * object, for example.
3845 		 * VM calls memory_object_map() for each mapping (specifying
3846 		 * the protection of each mapping) and calls
3847 		 * memory_object_last_unmap() when all the mappings are gone.
3848 		 */
3849 		pager_prot = max_protection;
3850 		if (needs_copy) {
3851 			/*
3852 			 * Copy-On-Write mapping: won't modify
3853 			 * the memory object.
3854 			 */
3855 			pager_prot &= ~VM_PROT_WRITE;
3856 		}
3857 		if (!is_submap &&
3858 		    object != VM_OBJECT_NULL &&
3859 		    object->named &&
3860 		    object->pager != MEMORY_OBJECT_NULL) {
3861 			vm_object_lock(object);
3862 			pager = object->pager;
3863 			if (object->named &&
3864 			    pager != MEMORY_OBJECT_NULL) {
3865 				assert(object->pager_ready);
3866 				vm_object_mapping_wait(object, THREAD_UNINT);
3867 				vm_object_mapping_begin(object);
3868 				vm_object_unlock(object);
3869 
3870 				kr = memory_object_map(pager, pager_prot);
3871 				assert(kr == KERN_SUCCESS);
3872 
3873 				vm_object_lock(object);
3874 				vm_object_mapping_end(object);
3875 			}
3876 			vm_object_unlock(object);
3877 		}
3878 		if (!is_submap &&
3879 		    fourk_object != VM_OBJECT_NULL &&
3880 		    fourk_object->named &&
3881 		    fourk_object->pager != MEMORY_OBJECT_NULL) {
3882 			vm_object_lock(fourk_object);
3883 			pager = fourk_object->pager;
3884 			if (fourk_object->named &&
3885 			    pager != MEMORY_OBJECT_NULL) {
3886 				assert(fourk_object->pager_ready);
3887 				vm_object_mapping_wait(fourk_object,
3888 				    THREAD_UNINT);
3889 				vm_object_mapping_begin(fourk_object);
3890 				vm_object_unlock(fourk_object);
3891 
3892 				kr = memory_object_map(pager, VM_PROT_READ);
3893 				assert(kr == KERN_SUCCESS);
3894 
3895 				vm_object_lock(fourk_object);
3896 				vm_object_mapping_end(fourk_object);
3897 			}
3898 			vm_object_unlock(fourk_object);
3899 		}
3900 	}
3901 
3902 	if (fourk_object != VM_OBJECT_NULL) {
3903 		vm_object_deallocate(fourk_object);
3904 		fourk_object = VM_OBJECT_NULL;
3905 		memory_object_deallocate(fourk_mem_obj);
3906 		fourk_mem_obj = MEMORY_OBJECT_NULL;
3907 	}
3908 
3909 	assert(map_locked == TRUE);
3910 
3911 	if (!keep_map_locked) {
3912 		vm_map_unlock(map);
3913 		map_locked = FALSE;
3914 	}
3915 
3916 	/*
3917 	 * We can't hold the map lock if we enter this block.
3918 	 */
3919 
3920 	if (result == KERN_SUCCESS) {
3921 		/*	Wire down the new entry if the user
3922 		 *	requested all new map entries be wired.
3923 		 */
3924 		if ((map->wiring_required) || (superpage_size)) {
3925 			assert(!keep_map_locked);
3926 			pmap_empty = FALSE; /* pmap won't be empty */
3927 			kr = vm_map_wire_kernel(map, start, end,
3928 			    new_entry->protection, VM_KERN_MEMORY_MLOCK,
3929 			    TRUE);
3930 			result = kr;
3931 		}
3932 
3933 	}
3934 
3935 	if (result != KERN_SUCCESS) {
3936 		if (new_mapping_established) {
3937 			/*
3938 			 * We have to get rid of the new mappings since we
3939 			 * won't make them available to the user.
3940 			 * Try and do that atomically, to minimize the risk
3941 			 * that someone else create new mappings that range.
3942 			 */
3943 
3944 			if (!map_locked) {
3945 				vm_map_lock(map);
3946 				map_locked = TRUE;
3947 			}
3948 			(void)vm_map_delete(map, *address, *address + size,
3949 			    VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
3950 			    KMEM_GUARD_NONE, &zap_list);
3951 		}
3952 	}
3953 
3954 	/*
3955 	 * The caller is responsible for releasing the lock if it requested to
3956 	 * keep the map locked.
3957 	 */
3958 	if (map_locked && !keep_map_locked) {
3959 		vm_map_unlock(map);
3960 	}
3961 
3962 	vm_map_zap_dispose(&zap_list);
3963 
3964 	return result;
3965 
3966 #undef  RETURN
3967 }
3968 #endif /* __arm64__ */
3969 
3970 /*
3971  * Counters for the prefault optimization.
3972  */
3973 int64_t vm_prefault_nb_pages = 0;
3974 int64_t vm_prefault_nb_bailout = 0;
3975 
3976 static kern_return_t
vm_map_enter_mem_object_helper(
3978 	vm_map_t                target_map,
3979 	vm_map_offset_t         *address,
3980 	vm_map_size_t           initial_size,
3981 	vm_map_offset_t         mask,
3982 	int                     flags,
3983 	vm_map_kernel_flags_t   vmk_flags,
3984 	vm_tag_t                tag,
3985 	ipc_port_t              port,
3986 	vm_object_offset_t      offset,
3987 	boolean_t               copy,
3988 	vm_prot_t               cur_protection,
3989 	vm_prot_t               max_protection,
3990 	vm_inherit_t            inheritance,
3991 	upl_page_list_ptr_t     page_list,
3992 	unsigned int            page_list_count)
3993 {
3994 	vm_map_address_t        map_addr;
3995 	vm_map_size_t           map_size;
3996 	vm_object_t             object;
3997 	vm_object_size_t        size;
3998 	kern_return_t           result;
3999 	boolean_t               mask_cur_protection, mask_max_protection;
4000 	boolean_t               kernel_prefault, try_prefault = (page_list_count != 0);
4001 	vm_map_offset_t         offset_in_mapping = 0;
4002 #if __arm64__
4003 	boolean_t               fourk = vmk_flags.vmkf_fourk;
4004 #endif /* __arm64__ */
4005 
4006 	if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4007 		/* XXX TODO4K prefaulting depends on page size... */
4008 		try_prefault = FALSE;
4009 	}
4010 
4011 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
4012 
4013 	mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4014 	mask_max_protection = max_protection & VM_PROT_IS_MASK;
4015 	cur_protection &= ~VM_PROT_IS_MASK;
4016 	max_protection &= ~VM_PROT_IS_MASK;
4017 
4018 	/*
4019 	 * Check arguments for validity
4020 	 */
4021 	if ((target_map == VM_MAP_NULL) ||
4022 	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4023 	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4024 	    (inheritance > VM_INHERIT_LAST_VALID) ||
4025 	    (try_prefault && (copy || !page_list)) ||
4026 	    initial_size == 0) {
4027 		return KERN_INVALID_ARGUMENT;
4028 	}
4029 
4030 #if __arm64__
4031 	if (cur_protection & VM_PROT_EXECUTE) {
4032 		cur_protection |= VM_PROT_READ;
4033 	}
4034 
4035 	if (fourk && VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4036 		/* no "fourk" if map is using a sub-page page size */
4037 		fourk = FALSE;
4038 	}
4039 	if (fourk) {
4040 		map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK);
4041 		map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK);
4042 	} else
4043 #endif /* __arm64__ */
4044 	{
4045 		map_addr = vm_map_trunc_page(*address,
4046 		    VM_MAP_PAGE_MASK(target_map));
4047 		map_size = vm_map_round_page(initial_size,
4048 		    VM_MAP_PAGE_MASK(target_map));
4049 	}
4050 	size = vm_object_round_page(initial_size);
4051 
4052 	/*
4053 	 * Find the vm object (if any) corresponding to this port.
4054 	 */
4055 	if (!IP_VALID(port)) {
4056 		object = VM_OBJECT_NULL;
4057 		offset = 0;
4058 		copy = FALSE;
4059 	} else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4060 		vm_named_entry_t        named_entry;
4061 		vm_object_offset_t      data_offset;
4062 
4063 		named_entry = mach_memory_entry_from_port(port);
4064 
4065 		if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4066 		    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4067 			data_offset = named_entry->data_offset;
4068 			offset += named_entry->data_offset;
4069 		} else {
4070 			data_offset = 0;
4071 		}
4072 
4073 		/* a few checks to make sure user is obeying rules */
4074 		if (size == 0) {
4075 			if (offset >= named_entry->size) {
4076 				return KERN_INVALID_RIGHT;
4077 			}
4078 			size = named_entry->size - offset;
4079 		}
4080 		if (mask_max_protection) {
4081 			max_protection &= named_entry->protection;
4082 		}
4083 		if (mask_cur_protection) {
4084 			cur_protection &= named_entry->protection;
4085 		}
4086 		if ((named_entry->protection & max_protection) !=
4087 		    max_protection) {
4088 			return KERN_INVALID_RIGHT;
4089 		}
4090 		if ((named_entry->protection & cur_protection) !=
4091 		    cur_protection) {
4092 			return KERN_INVALID_RIGHT;
4093 		}
4094 		if (offset + size < offset) {
4095 			/* overflow */
4096 			return KERN_INVALID_ARGUMENT;
4097 		}
4098 		if (named_entry->size < (offset + initial_size)) {
4099 			return KERN_INVALID_ARGUMENT;
4100 		}
4101 
4102 		if (named_entry->is_copy) {
4103 			/* for a vm_map_copy, we can only map it whole */
4104 			if ((size != named_entry->size) &&
4105 			    (vm_map_round_page(size,
4106 			    VM_MAP_PAGE_MASK(target_map)) ==
4107 			    named_entry->size)) {
4108 				/* XXX FBDP use the rounded size... */
4109 				size = vm_map_round_page(
4110 					size,
4111 					VM_MAP_PAGE_MASK(target_map));
4112 			}
4113 		}
4114 
4115 		/* the callers parameter offset is defined to be the */
4116 		/* offset from beginning of named entry offset in object */
4117 		offset = offset + named_entry->offset;
4118 
4119 		if (!VM_MAP_PAGE_ALIGNED(size,
4120 		    VM_MAP_PAGE_MASK(target_map))) {
4121 			/*
4122 			 * Let's not map more than requested;
4123 			 * vm_map_enter() will handle this "not map-aligned"
4124 			 * case.
4125 			 */
4126 			map_size = size;
4127 		}
4128 
4129 		named_entry_lock(named_entry);
4130 		if (named_entry->is_sub_map) {
4131 			vm_map_t                submap;
4132 
4133 			if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4134 			    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4135 				panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4136 			}
4137 
4138 			submap = named_entry->backing.map;
4139 			vm_map_reference(submap);
4140 			named_entry_unlock(named_entry);
4141 
4142 			vmk_flags.vmkf_submap = TRUE;
4143 
4144 			result = vm_map_enter(target_map,
4145 			    &map_addr,
4146 			    map_size,
4147 			    mask,
4148 			    flags,
4149 			    vmk_flags,
4150 			    tag,
4151 			    (vm_object_t)(uintptr_t) submap,
4152 			    offset,
4153 			    copy,
4154 			    cur_protection,
4155 			    max_protection,
4156 			    inheritance);
4157 			if (result != KERN_SUCCESS) {
4158 				vm_map_deallocate(submap);
4159 			} else {
4160 				/*
4161 				 * No need to lock "submap" just to check its
4162 				 * "mapped" flag: that flag is never reset
4163 				 * once it's been set and if we race, we'll
4164 				 * just end up setting it twice, which is OK.
4165 				 */
4166 				if (submap->mapped_in_other_pmaps == FALSE &&
4167 				    vm_map_pmap(submap) != PMAP_NULL &&
4168 				    vm_map_pmap(submap) !=
4169 				    vm_map_pmap(target_map)) {
4170 					/*
4171 					 * This submap is being mapped in a map
4172 					 * that uses a different pmap.
4173 					 * Set its "mapped_in_other_pmaps" flag
4174 					 * to indicate that we now need to
4175 					 * remove mappings from all pmaps rather
4176 					 * than just the submap's pmap.
4177 					 */
4178 					vm_map_lock(submap);
4179 					submap->mapped_in_other_pmaps = TRUE;
4180 					vm_map_unlock(submap);
4181 				}
4182 				*address = map_addr;
4183 			}
4184 			return result;
4185 		} else if (named_entry->is_copy) {
4186 			kern_return_t   kr;
4187 			vm_map_copy_t   copy_map;
4188 			vm_map_entry_t  copy_entry;
4189 			vm_map_offset_t copy_addr;
4190 			vm_map_copy_t   target_copy_map;
4191 			vm_map_offset_t overmap_start, overmap_end;
4192 			vm_map_offset_t trimmed_start;
4193 			vm_map_size_t   target_size;
4194 
4195 			if (flags & ~(VM_FLAGS_FIXED |
4196 			    VM_FLAGS_ANYWHERE |
4197 			    VM_FLAGS_OVERWRITE |
4198 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4199 			    VM_FLAGS_RETURN_DATA_ADDR |
4200 			    VM_FLAGS_ALIAS_MASK)) {
4201 				named_entry_unlock(named_entry);
4202 				return KERN_INVALID_ARGUMENT;
4203 			}
4204 
4205 			copy_map = named_entry->backing.copy;
4206 			assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4207 			if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4208 				/* unsupported type; should not happen */
4209 				printf("vm_map_enter_mem_object: "
4210 				    "memory_entry->backing.copy "
4211 				    "unsupported type 0x%x\n",
4212 				    copy_map->type);
4213 				named_entry_unlock(named_entry);
4214 				return KERN_INVALID_ARGUMENT;
4215 			}
4216 
4217 			if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4218 				DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4219 			}
4220 
4221 			if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4222 			    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4223 				offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map);
4224 				if (flags & VM_FLAGS_RETURN_4K_DATA_ADDR) {
4225 					offset_in_mapping &= ~((signed)(0xFFF));
4226 				}
4227 			}
4228 
4229 			target_copy_map = VM_MAP_COPY_NULL;
4230 			target_size = copy_map->size;
4231 			overmap_start = 0;
4232 			overmap_end = 0;
4233 			trimmed_start = 0;
4234 			if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4235 				DEBUG4K_ADJUST("adjusting...\n");
4236 				kr = vm_map_copy_adjust_to_target(
4237 					copy_map,
4238 					offset /* includes data_offset */,
4239 					initial_size,
4240 					target_map,
4241 					copy,
4242 					&target_copy_map,
4243 					&overmap_start,
4244 					&overmap_end,
4245 					&trimmed_start);
4246 				if (kr != KERN_SUCCESS) {
4247 					named_entry_unlock(named_entry);
4248 					return kr;
4249 				}
4250 				target_size = target_copy_map->size;
4251 				if (trimmed_start >= data_offset) {
4252 					data_offset = offset & VM_MAP_PAGE_MASK(target_map);
4253 				} else {
4254 					data_offset -= trimmed_start;
4255 				}
4256 			} else {
4257 				/*
4258 				 * Assert that the vm_map_copy is coming from the right
4259 				 * zone and hasn't been forged
4260 				 */
4261 				vm_map_copy_require(copy_map);
4262 				target_copy_map = copy_map;
4263 			}
4264 
4265 			/* reserve a contiguous range */
4266 			kr = vm_map_enter(target_map,
4267 			    &map_addr,
4268 			    vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)),
4269 			    mask,
4270 			    flags & (VM_FLAGS_ANYWHERE |
4271 			    VM_FLAGS_OVERWRITE |
4272 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4273 			    VM_FLAGS_RETURN_DATA_ADDR),
4274 			    vmk_flags,
4275 			    tag,
4276 			    VM_OBJECT_NULL,
4277 			    0,
4278 			    FALSE,               /* copy */
4279 			    cur_protection,
4280 			    max_protection,
4281 			    inheritance);
4282 			if (kr != KERN_SUCCESS) {
4283 				DEBUG4K_ERROR("kr 0x%x\n", kr);
4284 				if (target_copy_map != copy_map) {
4285 					vm_map_copy_discard(target_copy_map);
4286 					target_copy_map = VM_MAP_COPY_NULL;
4287 				}
4288 				named_entry_unlock(named_entry);
4289 				return kr;
4290 			}
4291 
4292 			copy_addr = map_addr;
4293 
4294 			for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4295 			    copy_entry != vm_map_copy_to_entry(target_copy_map);
4296 			    copy_entry = copy_entry->vme_next) {
4297 				int                     remap_flags;
4298 				vm_map_kernel_flags_t   vmk_remap_flags;
4299 				vm_map_t                copy_submap = VM_MAP_NULL;
4300 				vm_object_t             copy_object = VM_OBJECT_NULL;
4301 				vm_map_size_t           copy_size;
4302 				vm_object_offset_t      copy_offset;
4303 				int                     copy_vm_alias;
4304 				boolean_t               do_copy;
4305 
4306 				do_copy = FALSE;
4307 				remap_flags = 0;
4308 				vmk_remap_flags = VM_MAP_KERNEL_FLAGS_NONE;
4309 
4310 				if (copy_entry->is_sub_map) {
4311 					copy_submap = VME_SUBMAP(copy_entry);
4312 					copy_object = (vm_object_t)copy_submap;
4313 				} else {
4314 					copy_object = VME_OBJECT(copy_entry);
4315 				}
4316 				copy_offset = VME_OFFSET(copy_entry);
4317 				copy_size = (copy_entry->vme_end -
4318 				    copy_entry->vme_start);
4319 				VM_GET_FLAGS_ALIAS(flags, copy_vm_alias);
4320 				if (copy_vm_alias == 0) {
4321 					/*
4322 					 * Caller does not want a specific
4323 					 * alias for this new mapping:  use
4324 					 * the alias of the original mapping.
4325 					 */
4326 					copy_vm_alias = VME_ALIAS(copy_entry);
4327 				}
4328 
4329 				/* sanity check */
4330 				if ((copy_addr + copy_size) >
4331 				    (map_addr +
4332 				    overmap_start + overmap_end +
4333 				    named_entry->size /* XXX full size */)) {
4334 					/* over-mapping too much !? */
4335 					kr = KERN_INVALID_ARGUMENT;
4336 					DEBUG4K_ERROR("kr 0x%x\n", kr);
4337 					/* abort */
4338 					break;
4339 				}
4340 
4341 				/* take a reference on the object */
4342 				if (copy_entry->is_sub_map) {
4343 					vmk_remap_flags.vmkf_submap = TRUE;
4344 					vm_map_reference(copy_submap);
4345 				} else {
4346 					if (!copy &&
4347 					    copy_object != VM_OBJECT_NULL &&
4348 					    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4349 						/*
4350 						 * We need to resolve our side of this
4351 						 * "symmetric" copy-on-write now; we
4352 						 * need a new object to map and share,
4353 						 * instead of the current one which
4354 						 * might still be shared with the
4355 						 * original mapping.
4356 						 *
4357 						 * Note: A "vm_map_copy_t" does not
4358 						 * have a lock but we're protected by
4359 						 * the named entry's lock here.
4360 						 */
4361 						// assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4362 						VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4363 						assert(copy_object != VME_OBJECT(copy_entry));
4364 						if (!copy_entry->needs_copy &&
4365 						    copy_entry->protection & VM_PROT_WRITE) {
4366 							vm_prot_t prot;
4367 
4368 							prot = copy_entry->protection & ~VM_PROT_WRITE;
4369 							vm_object_pmap_protect(copy_object,
4370 							    copy_offset,
4371 							    copy_size,
4372 							    PMAP_NULL,
4373 							    PAGE_SIZE,
4374 							    0,
4375 							    prot);
4376 						}
4377 						copy_entry->needs_copy = FALSE;
4378 						copy_entry->is_shared = TRUE;
4379 						copy_object = VME_OBJECT(copy_entry);
4380 						copy_offset = VME_OFFSET(copy_entry);
4381 						vm_object_lock(copy_object);
4382 						/* we're about to make a shared mapping of this object */
4383 						copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4384 						copy_object->true_share = TRUE;
4385 						vm_object_unlock(copy_object);
4386 					}
4387 
4388 					if (copy_object != VM_OBJECT_NULL &&
4389 					    copy_object->named &&
4390 					    copy_object->pager != MEMORY_OBJECT_NULL &&
4391 					    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4392 						memory_object_t pager;
4393 						vm_prot_t       pager_prot;
4394 
4395 						/*
4396 						 * For "named" VM objects, let the pager know that the
4397 						 * memory object is being mapped.  Some pagers need to keep
4398 						 * track of this, to know when they can reclaim the memory
4399 						 * object, for example.
4400 						 * VM calls memory_object_map() for each mapping (specifying
4401 						 * the protection of each mapping) and calls
4402 						 * memory_object_last_unmap() when all the mappings are gone.
4403 						 */
4404 						pager_prot = max_protection;
4405 						if (copy) {
4406 							/*
4407 							 * Copy-On-Write mapping: won't modify the
4408 							 * memory object.
4409 							 */
4410 							pager_prot &= ~VM_PROT_WRITE;
4411 						}
4412 						vm_object_lock(copy_object);
4413 						pager = copy_object->pager;
4414 						if (copy_object->named &&
4415 						    pager != MEMORY_OBJECT_NULL &&
4416 						    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4417 							assert(copy_object->pager_ready);
4418 							vm_object_mapping_wait(copy_object, THREAD_UNINT);
4419 							vm_object_mapping_begin(copy_object);
4420 							vm_object_unlock(copy_object);
4421 
4422 							kr = memory_object_map(pager, pager_prot);
4423 							assert(kr == KERN_SUCCESS);
4424 
4425 							vm_object_lock(copy_object);
4426 							vm_object_mapping_end(copy_object);
4427 						}
4428 						vm_object_unlock(copy_object);
4429 					}
4430 
4431 					/*
4432 					 *	Perform the copy if requested
4433 					 */
4434 
4435 					if (copy && copy_object != VM_OBJECT_NULL) {
4436 						vm_object_t             new_object;
4437 						vm_object_offset_t      new_offset;
4438 
4439 						result = vm_object_copy_strategically(copy_object, copy_offset,
4440 						    copy_size,
4441 						    &new_object, &new_offset,
4442 						    &do_copy);
4443 
4444 
4445 						if (result == KERN_MEMORY_RESTART_COPY) {
4446 							boolean_t success;
4447 							boolean_t src_needs_copy;
4448 
4449 							/*
4450 							 * XXX
4451 							 * We currently ignore src_needs_copy.
4452 							 * This really is the issue of how to make
4453 							 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4454 							 * non-kernel users to use. Solution forthcoming.
4455 							 * In the meantime, since we don't allow non-kernel
4456 							 * memory managers to specify symmetric copy,
4457 							 * we won't run into problems here.
4458 							 */
4459 							new_object = copy_object;
4460 							new_offset = copy_offset;
4461 							success = vm_object_copy_quickly(new_object,
4462 							    new_offset,
4463 							    copy_size,
4464 							    &src_needs_copy,
4465 							    &do_copy);
4466 							assert(success);
4467 							result = KERN_SUCCESS;
4468 						}
4469 						if (result != KERN_SUCCESS) {
4470 							kr = result;
4471 							break;
4472 						}
4473 
4474 						copy_object = new_object;
4475 						copy_offset = new_offset;
4476 						/*
4477 						 * No extra object reference for the mapping:
4478 						 * the mapping should be the only thing keeping
4479 						 * this new object alive.
4480 						 */
4481 					} else {
4482 						/*
4483 						 * We already have the right object
4484 						 * to map.
4485 						 */
4486 						copy_object = VME_OBJECT(copy_entry);
4487 						/* take an extra ref for the mapping below */
4488 						vm_object_reference(copy_object);
4489 					}
4490 				}
4491 
4492 				/* over-map the object into destination */
4493 				remap_flags |= flags;
4494 				remap_flags |= VM_FLAGS_FIXED;
4495 				remap_flags |= VM_FLAGS_OVERWRITE;
4496 				remap_flags &= ~VM_FLAGS_ANYWHERE;
4497 				if (!copy && !copy_entry->is_sub_map) {
4498 					/*
4499 					 * copy-on-write should have been
4500 					 * resolved at this point, or we would
4501 					 * end up sharing instead of copying.
4502 					 */
4503 					assert(!copy_entry->needs_copy);
4504 				}
4505 #if XNU_TARGET_OS_OSX
4506 				if (copy_entry->used_for_jit) {
4507 					vmk_remap_flags.vmkf_map_jit = TRUE;
4508 				}
4509 #endif /* XNU_TARGET_OS_OSX */
4510 
4511 				assertf((copy_vm_alias & VME_ALIAS_MASK) == copy_vm_alias,
4512 				    "VM Tag truncated from 0x%x to 0x%x\n", copy_vm_alias, (copy_vm_alias & VME_ALIAS_MASK));
4513 				kr = vm_map_enter(target_map,
4514 				    &copy_addr,
4515 				    copy_size,
4516 				    (vm_map_offset_t) 0,
4517 				    remap_flags,
4518 				    vmk_remap_flags,
4519 				    (vm_tag_t) copy_vm_alias, /* see comment at end of vm_fault_unwire re. cast*/
4520 				    copy_object,
4521 				    copy_offset,
4522 				    ((copy_object == NULL)
4523 				    ? FALSE
4524 				    : (copy || copy_entry->needs_copy)),
4525 				    cur_protection,
4526 				    max_protection,
4527 				    inheritance);
4528 				if (kr != KERN_SUCCESS) {
4529 					DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4530 					if (copy_entry->is_sub_map) {
4531 						vm_map_deallocate(copy_submap);
4532 					} else {
4533 						vm_object_deallocate(copy_object);
4534 					}
4535 					/* abort */
4536 					break;
4537 				}
4538 
4539 				/* next mapping */
4540 				copy_addr += copy_size;
4541 			}
4542 
4543 			if (kr == KERN_SUCCESS) {
4544 				if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4545 				    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4546 					*address = map_addr + offset_in_mapping;
4547 				} else {
4548 					*address = map_addr;
4549 				}
4550 				if (overmap_start) {
4551 					*address += overmap_start;
4552 					DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address);
4553 				}
4554 			}
4555 			named_entry_unlock(named_entry);
4556 			if (target_copy_map != copy_map) {
4557 				vm_map_copy_discard(target_copy_map);
4558 				target_copy_map = VM_MAP_COPY_NULL;
4559 			}
4560 
4561 			if (kr != KERN_SUCCESS) {
4562 				if (!(flags & VM_FLAGS_OVERWRITE)) {
4563 					/* deallocate the contiguous range */
4564 					(void) vm_deallocate(target_map,
4565 					    map_addr,
4566 					    map_size);
4567 				}
4568 			}
4569 
4570 			return kr;
4571 		}
4572 
4573 		if (named_entry->is_object) {
4574 			unsigned int    access;
4575 			vm_prot_t       protections;
4576 			unsigned int    wimg_mode;
4577 
4578 			/* we are mapping a VM object */
4579 
4580 			protections = named_entry->protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
4581 			access = GET_MAP_MEM(named_entry->protection);
4582 
4583 			if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4584 			    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4585 				offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4586 				if (flags & VM_FLAGS_RETURN_4K_DATA_ADDR) {
4587 					offset_in_mapping &= ~((signed)(0xFFF));
4588 				}
4589 				offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4590 				map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map));
4591 			}
4592 
4593 			object = vm_named_entry_to_vm_object(named_entry);
4594 			assert(object != VM_OBJECT_NULL);
4595 			vm_object_lock(object);
4596 			named_entry_unlock(named_entry);
4597 
4598 			vm_object_reference_locked(object);
4599 
4600 			wimg_mode = object->wimg_bits;
4601 			vm_prot_to_wimg(access, &wimg_mode);
4602 			if (object->wimg_bits != wimg_mode) {
4603 				vm_object_change_wimg_mode(object, wimg_mode);
4604 			}
4605 
4606 			vm_object_unlock(object);
4607 		} else {
4608 			panic("invalid VM named entry %p", named_entry);
4609 		}
4610 	} else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4611 		/*
4612 		 * JMM - This is temporary until we unify named entries
4613 		 * and raw memory objects.
4614 		 *
4615 		 * Detected fake ip_kotype for a memory object.  In
4616 		 * this case, the port isn't really a port at all, but
4617 		 * instead is just a raw memory object.
4618 		 */
4619 		if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4620 		    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4621 			panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4622 		}
4623 
4624 		object = memory_object_to_vm_object((memory_object_t)port);
4625 		if (object == VM_OBJECT_NULL) {
4626 			return KERN_INVALID_OBJECT;
4627 		}
4628 		vm_object_reference(object);
4629 
4630 		/* wait for object (if any) to be ready */
4631 		if (object != VM_OBJECT_NULL) {
4632 			if (object == kernel_object) {
4633 				printf("Warning: Attempt to map kernel object"
4634 				    " by a non-private kernel entity\n");
4635 				return KERN_INVALID_OBJECT;
4636 			}
4637 			if (!object->pager_ready) {
4638 				vm_object_lock(object);
4639 
4640 				while (!object->pager_ready) {
4641 					vm_object_wait(object,
4642 					    VM_OBJECT_EVENT_PAGER_READY,
4643 					    THREAD_UNINT);
4644 					vm_object_lock(object);
4645 				}
4646 				vm_object_unlock(object);
4647 			}
4648 		}
4649 	} else {
4650 		return KERN_INVALID_OBJECT;
4651 	}
4652 
4653 	if (object != VM_OBJECT_NULL &&
4654 	    object->named &&
4655 	    object->pager != MEMORY_OBJECT_NULL &&
4656 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4657 		memory_object_t pager;
4658 		vm_prot_t       pager_prot;
4659 		kern_return_t   kr;
4660 
4661 		/*
4662 		 * For "named" VM objects, let the pager know that the
4663 		 * memory object is being mapped.  Some pagers need to keep
4664 		 * track of this, to know when they can reclaim the memory
4665 		 * object, for example.
4666 		 * VM calls memory_object_map() for each mapping (specifying
4667 		 * the protection of each mapping) and calls
4668 		 * memory_object_last_unmap() when all the mappings are gone.
4669 		 */
4670 		pager_prot = max_protection;
4671 		if (copy) {
4672 			/*
4673 			 * Copy-On-Write mapping: won't modify the
4674 			 * memory object.
4675 			 */
4676 			pager_prot &= ~VM_PROT_WRITE;
4677 		}
4678 		vm_object_lock(object);
4679 		pager = object->pager;
4680 		if (object->named &&
4681 		    pager != MEMORY_OBJECT_NULL &&
4682 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4683 			assert(object->pager_ready);
4684 			vm_object_mapping_wait(object, THREAD_UNINT);
4685 			vm_object_mapping_begin(object);
4686 			vm_object_unlock(object);
4687 
4688 			kr = memory_object_map(pager, pager_prot);
4689 			assert(kr == KERN_SUCCESS);
4690 
4691 			vm_object_lock(object);
4692 			vm_object_mapping_end(object);
4693 		}
4694 		vm_object_unlock(object);
4695 	}
4696 
4697 	/*
4698 	 *	Perform the copy if requested
4699 	 */
4700 
4701 	if (copy) {
4702 		vm_object_t             new_object;
4703 		vm_object_offset_t      new_offset;
4704 
4705 		result = vm_object_copy_strategically(object, offset,
4706 		    map_size,
4707 		    &new_object, &new_offset,
4708 		    &copy);
4709 
4710 
4711 		if (result == KERN_MEMORY_RESTART_COPY) {
4712 			boolean_t success;
4713 			boolean_t src_needs_copy;
4714 
4715 			/*
4716 			 * XXX
4717 			 * We currently ignore src_needs_copy.
4718 			 * This really is the issue of how to make
4719 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4720 			 * non-kernel users to use. Solution forthcoming.
4721 			 * In the meantime, since we don't allow non-kernel
4722 			 * memory managers to specify symmetric copy,
4723 			 * we won't run into problems here.
4724 			 */
4725 			new_object = object;
4726 			new_offset = offset;
4727 			success = vm_object_copy_quickly(new_object,
4728 			    new_offset,
4729 			    map_size,
4730 			    &src_needs_copy,
4731 			    &copy);
4732 			assert(success);
4733 			result = KERN_SUCCESS;
4734 		}
4735 		/*
4736 		 *	Throw away the reference to the
4737 		 *	original object, as it won't be mapped.
4738 		 */
4739 
4740 		vm_object_deallocate(object);
4741 
4742 		if (result != KERN_SUCCESS) {
4743 			return result;
4744 		}
4745 
4746 		object = new_object;
4747 		offset = new_offset;
4748 	}
4749 
4750 	/*
4751 	 * If non-kernel users want to try to prefault pages, the mapping and prefault
4752 	 * needs to be atomic.
4753 	 */
4754 	kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4755 	vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4756 
4757 #if __arm64__
4758 	if (fourk) {
4759 		/* map this object in a "4K" pager */
4760 		result = vm_map_enter_fourk(target_map,
4761 		    &map_addr,
4762 		    map_size,
4763 		    (vm_map_offset_t) mask,
4764 		    flags,
4765 		    vmk_flags,
4766 		    tag,
4767 		    object,
4768 		    offset,
4769 		    copy,
4770 		    cur_protection,
4771 		    max_protection,
4772 		    inheritance);
4773 	} else
4774 #endif /* __arm64__ */
4775 	{
4776 		result = vm_map_enter(target_map,
4777 		    &map_addr, map_size,
4778 		    (vm_map_offset_t)mask,
4779 		    flags,
4780 		    vmk_flags,
4781 		    tag,
4782 		    object, offset,
4783 		    copy,
4784 		    cur_protection, max_protection,
4785 		    inheritance);
4786 	}
4787 	if (result != KERN_SUCCESS) {
4788 		vm_object_deallocate(object);
4789 	}
4790 
4791 	/*
4792 	 * Try to prefault, and do not forget to release the vm map lock.
4793 	 */
4794 	if (result == KERN_SUCCESS && try_prefault) {
4795 		mach_vm_address_t va = map_addr;
4796 		kern_return_t kr = KERN_SUCCESS;
4797 		unsigned int i = 0;
4798 		int pmap_options;
4799 
4800 		pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4801 		if (object->internal) {
4802 			pmap_options |= PMAP_OPTIONS_INTERNAL;
4803 		}
4804 
4805 		for (i = 0; i < page_list_count; ++i) {
4806 			if (!UPL_VALID_PAGE(page_list, i)) {
4807 				if (kernel_prefault) {
4808 					assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4809 					result = KERN_MEMORY_ERROR;
4810 					break;
4811 				}
4812 			} else {
4813 				/*
4814 				 * If this function call failed, we should stop
4815 				 * trying to optimize, other calls are likely
4816 				 * going to fail too.
4817 				 *
4818 				 * We are not gonna report an error for such
4819 				 * failure though. That's an optimization, not
4820 				 * something critical.
4821 				 */
4822 				kr = pmap_enter_options(target_map->pmap,
4823 				    va, UPL_PHYS_PAGE(page_list, i),
4824 				    cur_protection, VM_PROT_NONE,
4825 				    0, TRUE, pmap_options, NULL);
4826 				if (kr != KERN_SUCCESS) {
4827 					OSIncrementAtomic64(&vm_prefault_nb_bailout);
4828 					if (kernel_prefault) {
4829 						result = kr;
4830 					}
4831 					break;
4832 				}
4833 				OSIncrementAtomic64(&vm_prefault_nb_pages);
4834 			}
4835 
4836 			/* Next virtual address */
4837 			va += PAGE_SIZE;
4838 		}
4839 		if (vmk_flags.vmkf_keep_map_locked) {
4840 			vm_map_unlock(target_map);
4841 		}
4842 	}
4843 
4844 	if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4845 	    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4846 		*address = map_addr + offset_in_mapping;
4847 	} else {
4848 		*address = map_addr;
4849 	}
4850 	return result;
4851 }
4852 
4853 kern_return_t
vm_map_enter_mem_object(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t tag,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)4854 vm_map_enter_mem_object(
4855 	vm_map_t                target_map,
4856 	vm_map_offset_t         *address,
4857 	vm_map_size_t           initial_size,
4858 	vm_map_offset_t         mask,
4859 	int                     flags,
4860 	vm_map_kernel_flags_t   vmk_flags,
4861 	vm_tag_t                tag,
4862 	ipc_port_t              port,
4863 	vm_object_offset_t      offset,
4864 	boolean_t               copy,
4865 	vm_prot_t               cur_protection,
4866 	vm_prot_t               max_protection,
4867 	vm_inherit_t            inheritance)
4868 {
4869 	kern_return_t ret;
4870 
4871 	ret = vm_map_enter_mem_object_helper(target_map,
4872 	    address,
4873 	    initial_size,
4874 	    mask,
4875 	    flags,
4876 	    vmk_flags,
4877 	    tag,
4878 	    port,
4879 	    offset,
4880 	    copy,
4881 	    cur_protection,
4882 	    max_protection,
4883 	    inheritance,
4884 	    NULL,
4885 	    0);
4886 
4887 #if KASAN
4888 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4889 		kasan_notify_address(*address, initial_size);
4890 	}
4891 #endif
4892 
4893 	return ret;
4894 }
4895 
4896 kern_return_t
vm_map_enter_mem_object_prefault(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t tag,ipc_port_t port,vm_object_offset_t offset,vm_prot_t cur_protection,vm_prot_t max_protection,upl_page_list_ptr_t page_list,unsigned int page_list_count)4897 vm_map_enter_mem_object_prefault(
4898 	vm_map_t                target_map,
4899 	vm_map_offset_t         *address,
4900 	vm_map_size_t           initial_size,
4901 	vm_map_offset_t         mask,
4902 	int                     flags,
4903 	vm_map_kernel_flags_t   vmk_flags,
4904 	vm_tag_t                tag,
4905 	ipc_port_t              port,
4906 	vm_object_offset_t      offset,
4907 	vm_prot_t               cur_protection,
4908 	vm_prot_t               max_protection,
4909 	upl_page_list_ptr_t     page_list,
4910 	unsigned int            page_list_count)
4911 {
4912 	kern_return_t ret;
4913 
4914 	ret = vm_map_enter_mem_object_helper(target_map,
4915 	    address,
4916 	    initial_size,
4917 	    mask,
4918 	    flags,
4919 	    vmk_flags,
4920 	    tag,
4921 	    port,
4922 	    offset,
4923 	    FALSE,
4924 	    cur_protection,
4925 	    max_protection,
4926 	    VM_INHERIT_DEFAULT,
4927 	    page_list,
4928 	    page_list_count);
4929 
4930 #if KASAN
4931 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4932 		kasan_notify_address(*address, initial_size);
4933 	}
4934 #endif
4935 
4936 	return ret;
4937 }
4938 
4939 
/*
 * vm_map_enter_mem_object_control:
 *
 * Map the VM object associated with the memory object control port
 * "control" into "target_map", at or near "*address", for "initial_size"
 * bytes (rounded to the map's page size).  On success, "*address" is
 * updated with the chosen mapping address.  If "copy" is TRUE, a copy of
 * the object's contents is mapped instead of the object itself.
 *
 * Returns KERN_INVALID_ARGUMENT for bad parameters, KERN_INVALID_OBJECT
 * if "control" does not resolve to a mappable object, or the result of
 * the underlying vm_map_enter()/vm_object_copy_strategically() calls.
 */
kern_return_t
vm_map_enter_mem_object_control(
	vm_map_t                target_map,
	vm_map_offset_t         *address,
	vm_map_size_t           initial_size,
	vm_map_offset_t         mask,
	int                     flags,
	vm_map_kernel_flags_t   vmk_flags,
	vm_tag_t                tag,
	memory_object_control_t control,
	vm_object_offset_t      offset,
	boolean_t               copy,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance)
{
	vm_map_address_t        map_addr;
	vm_map_size_t           map_size;
	vm_object_t             object;
	vm_object_size_t        size;
	kern_return_t           result;
	memory_object_t         pager;
	vm_prot_t               pager_prot;
	kern_return_t           kr;
#if __arm64__
	/* caller requested the "4K" pager path (sub-native page granularity) */
	boolean_t               fourk = vmk_flags.vmkf_fourk;
#endif /* __arm64__ */

	/*
	 * Check arguments for validity
	 */
	if ((target_map == VM_MAP_NULL) ||
	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
	    (inheritance > VM_INHERIT_LAST_VALID) ||
	    initial_size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

#if __arm64__
	/* "4K" mappings only make sense if the map uses pages >= 4K */
	if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) {
		fourk = FALSE;
	}

	if (fourk) {
		/* align the requested range to 4K boundaries */
		map_addr = vm_map_trunc_page(*address,
		    FOURK_PAGE_MASK);
		map_size = vm_map_round_page(initial_size,
		    FOURK_PAGE_MASK);
	} else
#endif /* __arm64__ */
	{
		/* align the requested range to the map's page boundaries */
		map_addr = vm_map_trunc_page(*address,
		    VM_MAP_PAGE_MASK(target_map));
		map_size = vm_map_round_page(initial_size,
		    VM_MAP_PAGE_MASK(target_map));
	}
	size = vm_object_round_page(initial_size);

	/* resolve the control port to its backing VM object */
	object = memory_object_control_to_vm_object(control);

	if (object == VM_OBJECT_NULL) {
		return KERN_INVALID_OBJECT;
	}

	/* refuse to hand out mappings of the kernel's own object */
	if (object == kernel_object) {
		printf("Warning: Attempt to map kernel object"
		    " by a non-private kernel entity\n");
		return KERN_INVALID_OBJECT;
	}

	/* take a new reference on the object (object lock held) */
	vm_object_lock(object);
	object->ref_count++;

	/*
	 * For "named" VM objects, let the pager know that the
	 * memory object is being mapped.  Some pagers need to keep
	 * track of this, to know when they can reclaim the memory
	 * object, for example.
	 * VM calls memory_object_map() for each mapping (specifying
	 * the protection of each mapping) and calls
	 * memory_object_last_unmap() when all the mappings are gone.
	 */
	pager_prot = max_protection;
	if (copy) {
		/* a copy-on-write mapping will never write to the pager */
		pager_prot &= ~VM_PROT_WRITE;
	}
	pager = object->pager;
	if (object->named &&
	    pager != MEMORY_OBJECT_NULL &&
	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
		assert(object->pager_ready);
		/* serialize with other mappers, then drop the lock to call out */
		vm_object_mapping_wait(object, THREAD_UNINT);
		vm_object_mapping_begin(object);
		vm_object_unlock(object);

		kr = memory_object_map(pager, pager_prot);
		assert(kr == KERN_SUCCESS);

		vm_object_lock(object);
		vm_object_mapping_end(object);
	}
	vm_object_unlock(object);

	/*
	 *	Perform the copy if requested
	 */

	if (copy) {
		vm_object_t             new_object;
		vm_object_offset_t      new_offset;

		result = vm_object_copy_strategically(object, offset, size,
		    &new_object, &new_offset,
		    &copy);


		if (result == KERN_MEMORY_RESTART_COPY) {
			boolean_t success;
			boolean_t src_needs_copy;

			/*
			 * XXX
			 * We currently ignore src_needs_copy.
			 * This really is the issue of how to make
			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
			 * non-kernel users to use. Solution forthcoming.
			 * In the meantime, since we don't allow non-kernel
			 * memory managers to specify symmetric copy,
			 * we won't run into problems here.
			 */
			new_object = object;
			new_offset = offset;
			success = vm_object_copy_quickly(new_object,
			    new_offset, size,
			    &src_needs_copy,
			    &copy);
			assert(success);
			result = KERN_SUCCESS;
		}
		/*
		 *	Throw away the reference to the
		 *	original object, as it won't be mapped.
		 */

		vm_object_deallocate(object);

		if (result != KERN_SUCCESS) {
			return result;
		}

		/* from here on, map the copy instead of the original */
		object = new_object;
		offset = new_offset;
	}

#if __arm64__
	if (fourk) {
		/* enter the mapping through the "4K" pager */
		result = vm_map_enter_fourk(target_map,
		    &map_addr,
		    map_size,
		    (vm_map_offset_t)mask,
		    flags,
		    vmk_flags,
		    tag,
		    object, offset,
		    copy,
		    cur_protection, max_protection,
		    inheritance);
	} else
#endif /* __arm64__ */
	{
		result = vm_map_enter(target_map,
		    &map_addr, map_size,
		    (vm_map_offset_t)mask,
		    flags,
		    vmk_flags,
		    tag,
		    object, offset,
		    copy,
		    cur_protection, max_protection,
		    inheritance);
	}
	/* on failure, drop the reference we took above */
	if (result != KERN_SUCCESS) {
		vm_object_deallocate(object);
	}
	*address = map_addr;

	return result;
}
5129 
5130 
5131 #if     VM_CPM
5132 
5133 #ifdef MACH_ASSERT
5134 extern pmap_paddr_t     avail_start, avail_end;
5135 #endif
5136 
5137 /*
5138  *	Allocate memory in the specified map, with the caveat that
5139  *	the memory is physically contiguous.  This call may fail
5140  *	if the system can't find sufficient contiguous memory.
5141  *	This call may cause or lead to heart-stopping amounts of
5142  *	paging activity.
5143  *
5144  *	Memory obtained from this call should be freed in the
5145  *	normal way, viz., via vm_deallocate.
5146  */
5147 kern_return_t
vm_map_enter_cpm(vm_map_t map,vm_map_offset_t * addr,vm_map_size_t size,int flags,vm_map_kernel_flags_t vmk_flags)5148 vm_map_enter_cpm(
5149 	vm_map_t                map,
5150 	vm_map_offset_t        *addr,
5151 	vm_map_size_t           size,
5152 	int                     flags,
5153 	vm_map_kernel_flags_t   vmk_flags)
5154 {
5155 	vm_object_t             cpm_obj;
5156 	pmap_t                  pmap;
5157 	vm_page_t               m, pages;
5158 	kern_return_t           kr;
5159 	vm_map_offset_t         va, start, end, offset;
5160 #if     MACH_ASSERT
5161 	vm_map_offset_t         prev_addr = 0;
5162 #endif  /* MACH_ASSERT */
5163 
5164 	boolean_t               anywhere = ((VM_FLAGS_ANYWHERE & flags) != 0);
5165 	vm_tag_t tag;
5166 
5167 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
5168 		/* XXX TODO4K do we need to support this? */
5169 		*addr = 0;
5170 		return KERN_NOT_SUPPORTED;
5171 	}
5172 
5173 	VM_GET_FLAGS_ALIAS(flags, tag);
5174 
5175 	if (size == 0) {
5176 		*addr = 0;
5177 		return KERN_SUCCESS;
5178 	}
5179 	if (anywhere) {
5180 		*addr = vm_map_min(map);
5181 	} else {
5182 		*addr = vm_map_trunc_page(*addr,
5183 		    VM_MAP_PAGE_MASK(map));
5184 	}
5185 	size = vm_map_round_page(size,
5186 	    VM_MAP_PAGE_MASK(map));
5187 
5188 	/*
5189 	 * LP64todo - cpm_allocate should probably allow
5190 	 * allocations of >4GB, but not with the current
5191 	 * algorithm, so just cast down the size for now.
5192 	 */
5193 	if (size > VM_MAX_ADDRESS) {
5194 		return KERN_RESOURCE_SHORTAGE;
5195 	}
5196 	if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size),
5197 	    &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) {
5198 		return kr;
5199 	}
5200 
5201 	cpm_obj = vm_object_allocate((vm_object_size_t)size);
5202 	assert(cpm_obj != VM_OBJECT_NULL);
5203 	assert(cpm_obj->internal);
5204 	assert(cpm_obj->vo_size == (vm_object_size_t)size);
5205 	assert(cpm_obj->can_persist == FALSE);
5206 	assert(cpm_obj->pager_created == FALSE);
5207 	assert(cpm_obj->pageout == FALSE);
5208 	assert(cpm_obj->shadow == VM_OBJECT_NULL);
5209 
5210 	/*
5211 	 *	Insert pages into object.
5212 	 */
5213 
5214 	vm_object_lock(cpm_obj);
5215 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
5216 		m = pages;
5217 		pages = NEXT_PAGE(m);
5218 		*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
5219 
5220 		assert(!m->vmp_gobbled);
5221 		assert(!m->vmp_wanted);
5222 		assert(!m->vmp_pageout);
5223 		assert(!m->vmp_tabled);
5224 		assert(VM_PAGE_WIRED(m));
5225 		assert(m->vmp_busy);
5226 		assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT));
5227 
5228 		m->vmp_busy = FALSE;
5229 		vm_page_insert(m, cpm_obj, offset);
5230 	}
5231 	assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
5232 	vm_object_unlock(cpm_obj);
5233 
5234 	/*
5235 	 *	Hang onto a reference on the object in case a
5236 	 *	multi-threaded application for some reason decides
5237 	 *	to deallocate the portion of the address space into
5238 	 *	which we will insert this object.
5239 	 *
5240 	 *	Unfortunately, we must insert the object now before
5241 	 *	we can talk to the pmap module about which addresses
5242 	 *	must be wired down.  Hence, the race with a multi-
5243 	 *	threaded app.
5244 	 */
5245 	vm_object_reference(cpm_obj);
5246 
5247 	/*
5248 	 *	Insert object into map.
5249 	 */
5250 
5251 	kr = vm_map_enter(
5252 		map,
5253 		addr,
5254 		size,
5255 		(vm_map_offset_t)0,
5256 		flags,
5257 		vmk_flags,
5258 		cpm_obj,
5259 		(vm_object_offset_t)0,
5260 		FALSE,
5261 		VM_PROT_ALL,
5262 		VM_PROT_ALL,
5263 		VM_INHERIT_DEFAULT);
5264 
5265 	if (kr != KERN_SUCCESS) {
5266 		/*
5267 		 *	A CPM object doesn't have can_persist set,
5268 		 *	so all we have to do is deallocate it to
5269 		 *	free up these pages.
5270 		 */
5271 		assert(cpm_obj->pager_created == FALSE);
5272 		assert(cpm_obj->can_persist == FALSE);
5273 		assert(cpm_obj->pageout == FALSE);
5274 		assert(cpm_obj->shadow == VM_OBJECT_NULL);
5275 		vm_object_deallocate(cpm_obj); /* kill acquired ref */
5276 		vm_object_deallocate(cpm_obj); /* kill creation ref */
5277 	}
5278 
5279 	/*
5280 	 *	Inform the physical mapping system that the
5281 	 *	range of addresses may not fault, so that
5282 	 *	page tables and such can be locked down as well.
5283 	 */
5284 	start = *addr;
5285 	end = start + size;
5286 	pmap = vm_map_pmap(map);
5287 	pmap_pageable(pmap, start, end, FALSE);
5288 
5289 	/*
5290 	 *	Enter each page into the pmap, to avoid faults.
5291 	 *	Note that this loop could be coded more efficiently,
5292 	 *	if the need arose, rather than looking up each page
5293 	 *	again.
5294 	 */
5295 	for (offset = 0, va = start; offset < size;
5296 	    va += PAGE_SIZE, offset += PAGE_SIZE) {
5297 		int type_of_fault;
5298 
5299 		vm_object_lock(cpm_obj);
5300 		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5301 		assert(m != VM_PAGE_NULL);
5302 
5303 		vm_page_zero_fill(m);
5304 
5305 		type_of_fault = DBG_ZERO_FILL_FAULT;
5306 
5307 		vm_fault_enter(m, pmap, va,
5308 		    PAGE_SIZE, 0,
5309 		    VM_PROT_ALL, VM_PROT_WRITE,
5310 		    VM_PAGE_WIRED(m),
5311 		    FALSE,                             /* change_wiring */
5312 		    VM_KERN_MEMORY_NONE,                             /* tag - not wiring */
5313 		    FALSE,                             /* no_cache */
5314 		    FALSE,                             /* cs_bypass */
5315 		    0,                                 /* user_tag */
5316 		    0,                             /* pmap_options */
5317 		    NULL,                              /* need_retry */
5318 		    &type_of_fault);
5319 
5320 		vm_object_unlock(cpm_obj);
5321 	}
5322 
5323 #if     MACH_ASSERT
5324 	/*
5325 	 *	Verify ordering in address space.
5326 	 */
5327 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
5328 		vm_object_lock(cpm_obj);
5329 		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5330 		vm_object_unlock(cpm_obj);
5331 		if (m == VM_PAGE_NULL) {
5332 			panic("vm_allocate_cpm:  obj %p off 0x%llx no page",
5333 			    cpm_obj, (uint64_t)offset);
5334 		}
5335 		assert(m->vmp_tabled);
5336 		assert(!m->vmp_busy);
5337 		assert(!m->vmp_wanted);
5338 		assert(!m->vmp_fictitious);
5339 		assert(!m->vmp_private);
5340 		assert(!m->vmp_absent);
5341 		assert(!m->vmp_cleaning);
5342 		assert(!m->vmp_laundry);
5343 		assert(!m->vmp_precious);
5344 		assert(!m->vmp_clustered);
5345 		if (offset != 0) {
5346 			if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
5347 				printf("start 0x%llx end 0x%llx va 0x%llx\n",
5348 				    (uint64_t)start, (uint64_t)end, (uint64_t)va);
5349 				printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset);
5350 				printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr);
5351 				panic("vm_allocate_cpm:  pages not contig!");
5352 			}
5353 		}
5354 		prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
5355 	}
5356 #endif  /* MACH_ASSERT */
5357 
5358 	vm_object_deallocate(cpm_obj); /* kill extra ref */
5359 
5360 	return kr;
5361 }
5362 
5363 
5364 #else   /* VM_CPM */
5365 
5366 /*
5367  *	Interface is defined in all cases, but unless the kernel
5368  *	is built explicitly for this option, the interface does
5369  *	nothing.
5370  */
5371 
kern_return_t
vm_map_enter_cpm(
	__unused vm_map_t                map,
	__unused vm_map_offset_t        *addr,
	__unused vm_map_size_t           size,
	__unused int                     flags,
	__unused vm_map_kernel_flags_t   vmk_flags)
{
	/* CPM support is not compiled into this kernel: always fail. */
	return KERN_FAILURE;
}
5382 #endif /* VM_CPM */
5383 
5384 /* Not used without nested pmaps */
5385 #ifndef NO_NESTED_PMAP
5386 /*
5387  * Clip and unnest a portion of a nested submap mapping.
5388  */
5389 
5390 
/*
 * Unnest the portion of "entry" (a nested submap mapping that shares the
 * submap's page tables via use_pmap) covering [start_unnest, end_unnest):
 * clip the entry to that range, tear down the shared pmap nesting, and
 * clean up any other pmaps the map may be mapped into.  On return the
 * (clipped) entry no longer uses the submap's pmap (use_pmap == FALSE).
 * Caller must hold the map lock for writing.
 */
static void
vm_map_clip_unnest(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t start_unnest,
	vm_map_offset_t end_unnest)
{
	/* remember the caller's original range for diagnostics below */
	vm_map_offset_t old_start_unnest = start_unnest;
	vm_map_offset_t old_end_unnest = end_unnest;

	assert(entry->is_sub_map);
	assert(VME_SUBMAP(entry) != NULL);
	assert(entry->use_pmap);

	/*
	 * Query the platform for the optimal unnest range.
	 * DRK: There's some duplication of effort here, since
	 * callers may have adjusted the range to some extent. This
	 * routine was introduced to support 1GiB subtree nesting
	 * for x86 platforms, which can also nest on 2MiB boundaries
	 * depending on size/alignment.
	 */
	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
		assert(VME_SUBMAP(entry)->is_nested_map);
		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
		/* the pmap widened the range: record the adjustment */
		log_unnest_badness(map,
		    old_start_unnest,
		    old_end_unnest,
		    VME_SUBMAP(entry)->is_nested_map,
		    (entry->vme_start +
		    VME_SUBMAP(entry)->lowest_unnestable_start -
		    VME_OFFSET(entry)));
	}

	/* the unnest range must lie entirely within the entry */
	if (entry->vme_start > start_unnest ||
	    entry->vme_end < end_unnest) {
		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
		    "bad nested entry: start=0x%llx end=0x%llx\n",
		    (long long)start_unnest, (long long)end_unnest,
		    (long long)entry->vme_start, (long long)entry->vme_end);
	}

	/* clip off any leading portion that stays nested */
	if (start_unnest > entry->vme_start) {
		_vm_map_clip_start(&map->hdr,
		    entry,
		    start_unnest);
		/* clipping changed the entry list: refresh free-space hints */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
	/* clip off any trailing portion that stays nested */
	if (entry->vme_end > end_unnest) {
		_vm_map_clip_end(&map->hdr,
		    entry,
		    end_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}

	/* detach the shared page tables for the clipped entry's range */
	pmap_unnest(map->pmap,
	    entry->vme_start,
	    entry->vme_end - entry->vme_start);
	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
		/* clean up parent map/maps */
		vm_map_submap_pmap_clean(
			map, entry->vme_start,
			entry->vme_end,
			VME_SUBMAP(entry),
			VME_OFFSET(entry));
	}
	entry->use_pmap = FALSE;
	/* reflect the unnesting in the entry's alias tag (user maps only) */
	if ((map->pmap != kernel_pmap) &&
	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
	}
}
5471 #endif  /* NO_NESTED_PMAP */
5472 
/*
 * Panic helper for the clip routines: called when a clip would split a
 * vme_atomic map entry (one that must never be subdivided).  Marked
 * __abortlike: this function does not return.
 */
__abortlike
static void
__vm_map_clip_atomic_entry_panic(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t where)
{
	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end,
	    (uint64_t)where);
}
5486 
5487 /*
5488  *	vm_map_clip_start:	[ internal use only ]
5489  *
5490  *	Asserts that the given entry begins at or after
5491  *	the specified address; if necessary,
5492  *	it splits the entry into two.
5493  */
void
vm_map_clip_start(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t startaddr)
{
#ifndef NO_NESTED_PMAP
	/* a nested submap entry must be unnested before it can be clipped */
	if (entry->is_sub_map &&
	    entry->use_pmap &&
	    startaddr >= entry->vme_start) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure "startaddr" is no longer in a nested range
		 * before we clip.  Unnest only the minimum range the platform
		 * can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (startaddr > entry->vme_start) {
		/*
		 * Physically contiguous objects can't be split across
		 * entries, so drop all of this entry's pmap mappings;
		 * they will be re-established on fault.
		 */
		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    VME_OBJECT(entry)->phys_contiguous) {
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		/* atomic entries must never be subdivided */
		if (entry->vme_atomic) {
			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
		}

		DTRACE_VM5(
			vm_map_clip_start,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, startaddr,
			int, VME_ALIAS(entry));

		/* do the split, then refresh the map's free-space hints */
		_vm_map_clip_start(&map->hdr, entry, startaddr);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5546 
5547 
/*
 * vm_map_copy_clip_start: like vm_map_clip_start(), but for an entry in
 * a vm_map_copy's header.  No unnesting or free-space hint updates are
 * needed for copy entries, so this clips unconditionally when needed.
 */
#define vm_map_copy_clip_start(copy, entry, startaddr) \
	MACRO_BEGIN \
	if ((startaddr) > (entry)->vme_start) \
	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
	MACRO_END
5553 
5554 /*
5555  *	This routine is called only when it is known that
5556  *	the entry must be split.
5557  */
static void
_vm_map_clip_start(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         start)
{
	vm_map_entry_t  new_entry;

	/*
	 *	Split off the front portion --
	 *	note that we must insert the new
	 *	entry BEFORE this one, so that
	 *	this entry has the specified starting
	 *	address.
	 */

	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(start,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* clone the entry; the clone becomes the front piece */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	new_entry->vme_end = start;
	assert(new_entry->vme_start < new_entry->vme_end);
	/* advance this entry's object/submap offset to match its new start */
	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
	assert(start < entry->vme_end);
	entry->vme_start = start;

	/* link the front piece in just before this entry */
	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);

	/* both pieces now reference the same submap/object: take an extra ref */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5596 
5597 
5598 /*
5599  *	vm_map_clip_end:	[ internal use only ]
5600  *
5601  *	Asserts that the given entry ends at or before
5602  *	the specified address; if necessary,
5603  *	it splits the entry into two.
5604  */
void
vm_map_clip_end(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t endaddr)
{
	if (endaddr > entry->vme_end) {
		/*
		 * Within the scope of this clipping, limit "endaddr" to
		 * the end of this map entry...
		 */
		endaddr = entry->vme_end;
	}
#ifndef NO_NESTED_PMAP
	/* a nested submap entry must be unnested before it can be clipped */
	if (entry->is_sub_map && entry->use_pmap) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure the range between the start of this entry and
		 * the new "endaddr" is no longer nested before we clip.
		 * Unnest only the minimum range the platform can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = entry->vme_start;
		end_unnest =
		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
		    ~(pmap_shared_region_size_min(map->pmap) - 1);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (endaddr < entry->vme_end) {
		/*
		 * Physically contiguous objects can't be split across
		 * entries, so drop all of this entry's pmap mappings;
		 * they will be re-established on fault.
		 */
		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    VME_OBJECT(entry)->phys_contiguous) {
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		/* atomic entries must never be subdivided */
		if (entry->vme_atomic) {
			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
		}
		DTRACE_VM5(
			vm_map_clip_end,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, endaddr,
			int, VME_ALIAS(entry));

		/* do the split, then refresh the map's free-space hints */
		_vm_map_clip_end(&map->hdr, entry, endaddr);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5663 
5664 
/*
 * vm_map_copy_clip_end: like vm_map_clip_end(), but for an entry in a
 * vm_map_copy's header.  No unnesting or free-space hint updates are
 * needed for copy entries, so this clips unconditionally when needed.
 */
#define vm_map_copy_clip_end(copy, entry, endaddr) \
	MACRO_BEGIN \
	if ((endaddr) < (entry)->vme_end) \
	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
	MACRO_END
5670 
/*
 *	_vm_map_clip_end:
 *
 *	Split "entry" at address "end": the existing entry is truncated to
 *	[vme_start, end) and a newly created entry covering [end, vme_end)
 *	is linked in immediately after it.
 *
 *	This routine is called only when it is known that
 *	the entry must be split (i.e. vme_start < end < vme_end).
 */
static void
_vm_map_clip_end(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         end)
{
	vm_map_entry_t  new_entry;

	/*
	 *	Create a new entry and insert it
	 *	AFTER the specified entry
	 */

	/* map-aligned entries may only be clipped on map page boundaries */
	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(end,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	/* "entry" keeps [vme_start, end); "new_entry" takes [end, old vme_end) */
	assert(entry->vme_start < end);
	new_entry->vme_start = entry->vme_end = end;
	/*
	 * Advance the new entry's offset into its backing object/submap by
	 * the size of the first (retained) half; vme_start is still the
	 * original start here since only vme_end was reassigned above.
	 */
	VME_OFFSET_SET(new_entry,
	    VME_OFFSET(new_entry) + (end - entry->vme_start));
	assert(new_entry->vme_start < new_entry->vme_end);

	_vm_map_store_entry_link(map_header, entry, new_entry);

	/*
	 * Both halves now point at the same submap or VM object, so the
	 * new entry needs its own reference.
	 */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5710 
5711 
/*
 *	VM_MAP_RANGE_CHECK:	[ internal use only ]
 *
 *	Asserts that the starting and ending region
 *	addresses fall within the valid range of the map.
 *
 *	Note: this clamps "start" and "end" in place rather than
 *	failing; a request entirely outside the map degenerates to
 *	an empty range at the nearest valid boundary.
 */
#define VM_MAP_RANGE_CHECK(map, start, end)     \
	MACRO_BEGIN                             \
	if (start < vm_map_min(map))            \
	        start = vm_map_min(map);        \
	if (end > vm_map_max(map))              \
	        end = vm_map_max(map);          \
	if (start > end)                        \
	        start = end;                    \
	MACRO_END
5727 
5728 /*
5729  *	vm_map_range_check:	[ internal use only ]
5730  *
5731  *	Check that the region defined by the specified start and
5732  *	end addresses are wholly contained within a single map
5733  *	entry or set of adjacent map entries of the spacified map,
5734  *	i.e. the specified region contains no unmapped space.
5735  *	If any or all of the region is unmapped, FALSE is returned.
5736  *	Otherwise, TRUE is returned and if the output argument 'entry'
5737  *	is not NULL it points to the map entry containing the start
5738  *	of the region.
5739  *
5740  *	The map is locked for reading on entry and is left locked.
5741  */
5742 static boolean_t
vm_map_range_check(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t * entry)5743 vm_map_range_check(
5744 	vm_map_t                map,
5745 	vm_map_offset_t         start,
5746 	vm_map_offset_t         end,
5747 	vm_map_entry_t          *entry)
5748 {
5749 	vm_map_entry_t          cur;
5750 	vm_map_offset_t         prev;
5751 
5752 	/*
5753 	 *      Basic sanity checks first
5754 	 */
5755 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5756 		return FALSE;
5757 	}
5758 
5759 	/*
5760 	 *      Check first if the region starts within a valid
5761 	 *	mapping for the map.
5762 	 */
5763 	if (!vm_map_lookup_entry(map, start, &cur)) {
5764 		return FALSE;
5765 	}
5766 
5767 	/*
5768 	 *	Optimize for the case that the region is contained
5769 	 *	in a single map entry.
5770 	 */
5771 	if (entry != (vm_map_entry_t *) NULL) {
5772 		*entry = cur;
5773 	}
5774 	if (end <= cur->vme_end) {
5775 		return TRUE;
5776 	}
5777 
5778 	/*
5779 	 *      If the region is not wholly contained within a
5780 	 *      single entry, walk the entries looking for holes.
5781 	 */
5782 	prev = cur->vme_end;
5783 	cur = cur->vme_next;
5784 	while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5785 		if (end <= cur->vme_end) {
5786 			return TRUE;
5787 		}
5788 		prev = cur->vme_end;
5789 		cur = cur->vme_next;
5790 	}
5791 	return FALSE;
5792 }
5793 
/*
 *	vm_map_protect:
 *
 *	Sets the protection of the specified address
 *	region in the target map.  If "set_max" is
 *	specified, the maximum protection is to be set;
 *	otherwise, only the current protection is affected.
 *
 *	Returns:
 *	  KERN_INVALID_ADDRESS    range not fully mapped (or beyond
 *	                          the map's max_offset)
 *	  KERN_PROTECTION_FAILURE request exceeds an entry's max
 *	                          protection, or violates W^X /
 *	                          exec-lockdown / prot-policy rules
 *	  KERN_SUCCESS            otherwise
 */
kern_return_t
vm_map_protect(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_prot_t       new_prot,
	boolean_t       set_max)
{
	vm_map_entry_t                  current;
	vm_map_offset_t                 prev;
	vm_map_entry_t                  entry;
	vm_prot_t                       new_max;
	int                             pmap_options = 0;
	kern_return_t                   kr;

	if (new_prot & VM_PROT_COPY) {
		vm_map_offset_t         new_start;
		vm_prot_t               cur_prot, max_prot;
		vm_map_kernel_flags_t   kflags;

		/* LP64todo - see below */
		if (start >= map->max_offset) {
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * Refuse a writable+executable COPY request up front when
		 * the map enforces W^X and policy says to fail (rather
		 * than silently stripping the exec bits).
		 */
		if ((new_prot & VM_PROT_ALLEXEC) &&
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if XNU_TARGET_OS_OSX && __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* XNU_TARGET_OS_OSX && __arm64__ */
		    ) &&
		    VM_MAP_POLICY_WX_FAIL(map)) {
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) start,
			    uint64_t, (uint64_t) end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * Let vm_map_remap_extract() know that it will need to:
		 * + make a copy of the mapping
		 * + add VM_PROT_WRITE to the max protections
		 * + remove any protections that are no longer allowed from the
		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
		 *   example).
		 * Note that "max_prot" is an IN/OUT parameter only for this
		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
		 * only.
		 */
		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
		cur_prot = VM_PROT_NONE;
		kflags = VM_MAP_KERNEL_FLAGS_NONE;
		kflags.vmkf_remap_prot_copy = TRUE;
		new_start = start;
		kr = vm_map_remap(map,
		    &new_start,
		    end - start,
		    0, /* mask */
		    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
		    kflags,
		    0,
		    map,
		    start,
		    TRUE, /* copy-on-write remapping! */
		    &cur_prot, /* IN/OUT */
		    &max_prot, /* IN/OUT */
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		/* COPY has been handled; fall through to set the protections */
		new_prot &= ~VM_PROT_COPY;
	}

	vm_map_lock(map);

	/* LP64todo - remove this check when vm_map_commpage64()
	 * no longer has to stuff in a map_entry for the commpage
	 * above the map's max_offset.
	 */
	if (start >= map->max_offset) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	while (1) {
		/*
		 *      Lookup the entry.  If it doesn't start in a valid
		 *	entry, return an error.
		 */
		if (!vm_map_lookup_entry(map, start, &entry)) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
			start = SUPERPAGE_ROUND_DOWN(start);
			continue;
		}
		break;
	}
	if (entry->superpage_size) {
		end = SUPERPAGE_ROUND_UP(end);
	}

	/*
	 *	Make a first pass to check for protection and address
	 *	violations.
	 */

	current = entry;
	prev = current->vme_start;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		/*
		 * If there is a hole, return an error.
		 */
		if (current->vme_start != prev) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		new_max = current->max_protection;

#if defined(__x86_64__)
		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
		}
#endif
		/* requested protections must be a subset of the entry's max */
		if ((new_prot & new_max) != new_prot) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * JIT regions whose current protection is governed by a
		 * pmap protection policy cannot be changed here.
		 */
		if (current->used_for_jit &&
		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

#if __arm64e__
		/* Disallow remapping hw assisted TPRO mappings */
		if (current->used_for_tpro) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}
#endif /* __arm64e__ */


		/*
		 * W^X enforcement: strip the exec bits from a
		 * write+execute request (and fail outright if policy
		 * demands it); JIT regions are exempt.
		 */
		if ((new_prot & VM_PROT_WRITE) &&
		    (new_prot & VM_PROT_ALLEXEC) &&
#if XNU_TARGET_OS_OSX
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
		    ) &&
#endif /* XNU_TARGET_OS_OSX */
		    !(current->used_for_jit)) {
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) current->vme_start,
			    uint64_t, (uint64_t) current->vme_end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__);
			new_prot &= ~VM_PROT_ALLEXEC;
			if (VM_MAP_POLICY_WX_FAIL(map)) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		/*
		 * If the task has requested executable lockdown,
		 * deny both:
		 * - adding executable protections OR
		 * - adding write protections to an existing executable mapping.
		 */
		if (map->map_disallow_new_exec == TRUE) {
			if ((new_prot & VM_PROT_ALLEXEC) ||
			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		prev = current->vme_end;
		current = current->vme_next;
	}

#if __arm64__
	if (end > prev &&
	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
		vm_map_entry_t prev_entry;

		prev_entry = current->vme_prev;
		if (prev_entry != vm_map_to_entry(map) &&
		    !prev_entry->map_aligned &&
		    (vm_map_round_page(prev_entry->vme_end,
		    VM_MAP_PAGE_MASK(map))
		    == end)) {
			/*
			 * The last entry in our range is not "map-aligned"
			 * but it would have reached all the way to "end"
			 * if it had been map-aligned, so this is not really
			 * a hole in the range and we can proceed.
			 */
			prev = end;
		}
	}
#endif /* __arm64__ */

	/* trailing hole: the mapped run stopped short of "end" */
	if (end > prev) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 *	Go back and fix up protections.
	 *	Clip to start here if the range starts within
	 *	the entry.
	 */

	current = entry;
	if (current != vm_map_to_entry(map)) {
		/* clip and unnest if necessary */
		vm_map_clip_start(map, current, start);
	}

	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		vm_prot_t       old_prot;

		vm_map_clip_end(map, current, end);

		if (current->is_sub_map) {
			/* clipping did unnest if needed */
			assert(!current->use_pmap);
		}

		old_prot = current->protection;

		if (set_max) {
			current->max_protection = new_prot;
			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
			current->protection = (new_prot & old_prot);
		} else {
			current->protection = new_prot;
		}

		/*
		 *	Update physical map if necessary.
		 *	If the request is to turn off write protection,
		 *	we won't do it for real (in pmap). This is because
		 *	it would cause copy-on-write to fail.  We've already
		 *	set, the new protection in the map, so if a
		 *	write-protect fault occurred, it will be fixed up
		 *	properly, COW or not.
		 */
		if (current->protection != old_prot) {
			/* Look one level in we support nested pmaps */
			/* from mapped submaps which are direct entries */
			/* in our map */

			vm_prot_t prot;

			prot = current->protection;
			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
				/* defer write enabling to the fault path (COW) */
				prot &= ~VM_PROT_WRITE;
			} else {
				assert(!VME_OBJECT(current)->code_signed);
				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
				if (prot & VM_PROT_WRITE) {
					/*
					 * For write requests on the
					 * compressor, we wil ask the
					 * pmap layer to prevent us from
					 * taking a write fault when we
					 * attempt to access the mapping
					 * next.
					 */
					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
				}
			}

			if (override_nx(map, VME_ALIAS(current)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}

#if DEVELOPMENT || DEBUG
			if (!(old_prot & VM_PROT_EXECUTE) &&
			    (prot & VM_PROT_EXECUTE) &&
			    panic_on_unsigned_execute &&
			    (proc_selfcsflags() & CS_KILL)) {
				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
			}
#endif /* DEVELOPMENT || DEBUG */

			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
				if (current->wired_count) {
					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
				}

				/* If the pmap layer cares about this
				 * protection type, force a fault for
				 * each page so that vm_fault will
				 * repopulate the page with the full
				 * set of protections.
				 */
				/*
				 * TODO: We don't seem to need this,
				 * but this is due to an internal
				 * implementation detail of
				 * pmap_protect.  Do we want to rely
				 * on this?
				 */
				prot = VM_PROT_NONE;
			}

			if (current->is_sub_map && current->use_pmap) {
				pmap_protect(VME_SUBMAP(current)->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot);
			} else {
				pmap_protect_options(map->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot,
				    pmap_options,
				    NULL);
			}
		}
		current = current->vme_next;
	}

	/* coalesce any entries that the clipping above left fragmented */
	current = entry;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start <= end)) {
		vm_map_simplify_entry(map, current);
		current = current->vme_next;
	}

	vm_map_unlock(map);
	return KERN_SUCCESS;
}
6162 
/*
 *	vm_map_inherit:
 *
 *	Sets the inheritance of the specified address
 *	range in the target map.  Inheritance
 *	affects how the map will be shared with
 *	child maps at the time of vm_map_fork.
 *
 *	Returns KERN_INVALID_ARGUMENT if the range contains a submap
 *	and VM_INHERIT_COPY was requested, KERN_SUCCESS otherwise.
 */
kern_return_t
vm_map_inherit(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_inherit_t    new_inheritance)
{
	vm_map_entry_t  entry;
	vm_map_entry_t  temp_entry;

	vm_map_lock(map);

	/* clamp the range to the map's valid bounds */
	VM_MAP_RANGE_CHECK(map, start, end);

	if (vm_map_lookup_entry(map, start, &temp_entry)) {
		entry = temp_entry;
	} else {
		/*
		 * NOTE(review): on failure, vm_map_lookup_entry()
		 * presumably leaves temp_entry pointing at the entry
		 * preceding "start", so vme_next is the first entry at
		 * or beyond the range — confirm against its contract.
		 */
		temp_entry = temp_entry->vme_next;
		entry = temp_entry;
	}

	/* first check entire range for submaps which can't support the */
	/* given inheritance. */
	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
		if (entry->is_sub_map) {
			if (new_inheritance == VM_INHERIT_COPY) {
				vm_map_unlock(map);
				return KERN_INVALID_ARGUMENT;
			}
		}

		entry = entry->vme_next;
	}

	/* second pass: clip the range's edges and apply the inheritance */
	entry = temp_entry;
	if (entry != vm_map_to_entry(map)) {
		/* clip and unnest if necessary */
		vm_map_clip_start(map, entry, start);
	}

	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
		vm_map_clip_end(map, entry, end);
		if (entry->is_sub_map) {
			/* clip did unnest if needed */
			assert(!entry->use_pmap);
		}

		entry->inheritance = new_inheritance;

		entry = entry->vme_next;
	}

	vm_map_unlock(map);
	return KERN_SUCCESS;
}
6226 
/*
 * Update the accounting for the amount of wired memory in this map.  If the user has
 * exceeded the defined limits, then we fail.  Wiring on behalf of the kernel never fails.
 *
 * map:       map whose user_wire_size accounting is charged
 * entry:     entry being wired; its wired_count/user_wired_count are bumped
 * user_wire: TRUE for a user (mlock-style) request, FALSE for kernel wiring
 *
 * Returns KERN_SUCCESS, KERN_RESOURCE_SHORTAGE (over a user wire limit),
 * or KERN_FAILURE (a wire count would overflow MAX_WIRE_COUNT).
 */

static kern_return_t
add_wire_counts(
	vm_map_t        map,
	vm_map_entry_t  entry,
	boolean_t       user_wire)
{
	vm_map_size_t   size;

	if (user_wire) {
		unsigned int total_wire_count =  vm_page_wire_count + vm_lopage_free_count;

		/*
		 * We're wiring memory at the request of the user.  Check if this is the first time the user is wiring
		 * this map entry.
		 */

		if (entry->user_wired_count == 0) {
			size = entry->vme_end - entry->vme_start;

			/*
			 * Since this is the first time the user is wiring this map entry, check to see if we're
			 * exceeding the user wire limits.  There is a per map limit which is the smaller of either
			 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value.  There is also
			 * a system-wide limit on the amount of memory all users can wire.  If the user is over either
			 * limit, then we fail.
			 */

			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
				/* distinguish which limit was hit, for the failure counters */
				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
#if DEVELOPMENT || DEBUG
					if (panic_on_mlock_failure) {
						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
					}
#endif /* DEVELOPMENT || DEBUG */
					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
				} else {
					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
#if DEVELOPMENT || DEBUG
					if (panic_on_mlock_failure) {
						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
					}
#endif /* DEVELOPMENT || DEBUG */
				}
				return KERN_RESOURCE_SHORTAGE;
			}

			/*
			 * The first time the user wires an entry, we also increment the wired_count and add this to
			 * the total that has been wired in the map.
			 */

			if (entry->wired_count >= MAX_WIRE_COUNT) {
				return KERN_FAILURE;
			}

			entry->wired_count++;
			map->user_wire_size += size;
		}

		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
			return KERN_FAILURE;
		}

		entry->user_wired_count++;
	} else {
		/*
		 * The kernel's wiring the memory.  Just bump the count and continue.
		 */

		if (entry->wired_count >= MAX_WIRE_COUNT) {
			panic("vm_map_wire: too many wirings");
		}

		entry->wired_count++;
	}

	return KERN_SUCCESS;
}
6311 
/*
 * Update the memory wiring accounting now that the given map entry is being unwired.
 *
 * map:       map whose user_wire_size accounting is credited
 * entry:     entry being unwired
 * user_wire: TRUE if this undoes a user (mlock-style) wiring,
 *            FALSE for a kernel wiring
 *
 * Mirror of add_wire_counts(): cannot fail, only asserts that the
 * counts being decremented are non-zero.
 */

static void
subtract_wire_counts(
	vm_map_t        map,
	vm_map_entry_t  entry,
	boolean_t       user_wire)
{
	if (user_wire) {
		/*
		 * We're unwiring memory at the request of the user.  See if we're removing the last user wire reference.
		 */

		if (entry->user_wired_count == 1) {
			/*
			 * We're removing the last user wire reference.  Decrement the wired_count and the total
			 * user wired memory for this map.
			 */

			assert(entry->wired_count >= 1);
			entry->wired_count--;
			map->user_wire_size -= entry->vme_end - entry->vme_start;
		}

		assert(entry->user_wired_count >= 1);
		entry->user_wired_count--;
	} else {
		/*
		 * The kernel is unwiring the memory.   Just update the count.
		 */

		assert(entry->wired_count >= 1);
		entry->wired_count--;
	}
}
6349 
6350 int cs_executable_wire = 0;
6351 
6352 /*
6353  *	vm_map_wire:
6354  *
6355  *	Sets the pageability of the specified address range in the
6356  *	target map as wired.  Regions specified as not pageable require
6357  *	locked-down physical memory and physical page maps.  The
6358  *	access_type variable indicates types of accesses that must not
6359  *	generate page faults.  This is checked against protection of
6360  *	memory being locked-down.
6361  *
6362  *	The map must not be locked, but a reference must remain to the
6363  *	map throughout the call.
6364  */
6365 static kern_return_t
vm_map_wire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr,ppnum_t * physpage_p)6366 vm_map_wire_nested(
6367 	vm_map_t                map,
6368 	vm_map_offset_t         start,
6369 	vm_map_offset_t         end,
6370 	vm_prot_t               caller_prot,
6371 	vm_tag_t                tag,
6372 	boolean_t               user_wire,
6373 	pmap_t                  map_pmap,
6374 	vm_map_offset_t         pmap_addr,
6375 	ppnum_t                 *physpage_p)
6376 {
6377 	vm_map_entry_t          entry;
6378 	vm_prot_t               access_type;
6379 	struct vm_map_entry     *first_entry, tmp_entry;
6380 	vm_map_t                real_map;
6381 	vm_map_offset_t         s, e;
6382 	kern_return_t           rc;
6383 	boolean_t               need_wakeup;
6384 	boolean_t               main_map = FALSE;
6385 	wait_interrupt_t        interruptible_state;
6386 	thread_t                cur_thread;
6387 	unsigned int            last_timestamp;
6388 	vm_map_size_t           size;
6389 	boolean_t               wire_and_extract;
6390 	vm_prot_t               extra_prots;
6391 
6392 	extra_prots = VM_PROT_COPY;
6393 	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6394 #if XNU_TARGET_OS_OSX
6395 	if (map->pmap == kernel_pmap ||
6396 	    !vm_map_cs_enforcement(map)) {
6397 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6398 	}
6399 #endif /* XNU_TARGET_OS_OSX */
6400 
6401 	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6402 
6403 	wire_and_extract = FALSE;
6404 	if (physpage_p != NULL) {
6405 		/*
6406 		 * The caller wants the physical page number of the
6407 		 * wired page.  We return only one physical page number
6408 		 * so this works for only one page at a time.
6409 		 */
6410 		if ((end - start) != PAGE_SIZE) {
6411 			return KERN_INVALID_ARGUMENT;
6412 		}
6413 		wire_and_extract = TRUE;
6414 		*physpage_p = 0;
6415 	}
6416 
6417 	vm_map_lock(map);
6418 	if (map_pmap == NULL) {
6419 		main_map = TRUE;
6420 	}
6421 	last_timestamp = map->timestamp;
6422 
6423 	VM_MAP_RANGE_CHECK(map, start, end);
6424 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6425 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6426 
6427 	if (start == end) {
6428 		/* We wired what the caller asked for, zero pages */
6429 		vm_map_unlock(map);
6430 		return KERN_SUCCESS;
6431 	}
6432 
6433 	need_wakeup = FALSE;
6434 	cur_thread = current_thread();
6435 
6436 	s = start;
6437 	rc = KERN_SUCCESS;
6438 
6439 	if (vm_map_lookup_entry(map, s, &first_entry)) {
6440 		entry = first_entry;
6441 		/*
6442 		 * vm_map_clip_start will be done later.
6443 		 * We don't want to unnest any nested submaps here !
6444 		 */
6445 	} else {
6446 		/* Start address is not in map */
6447 		rc = KERN_INVALID_ADDRESS;
6448 		goto done;
6449 	}
6450 
6451 	while ((entry != vm_map_to_entry(map)) && (s < end)) {
6452 		/*
6453 		 * At this point, we have wired from "start" to "s".
6454 		 * We still need to wire from "s" to "end".
6455 		 *
6456 		 * "entry" hasn't been clipped, so it could start before "s"
6457 		 * and/or end after "end".
6458 		 */
6459 
6460 		/* "e" is how far we want to wire in this entry */
6461 		e = entry->vme_end;
6462 		if (e > end) {
6463 			e = end;
6464 		}
6465 
6466 		/*
6467 		 * If another thread is wiring/unwiring this entry then
6468 		 * block after informing other thread to wake us up.
6469 		 */
6470 		if (entry->in_transition) {
6471 			wait_result_t wait_result;
6472 
6473 			/*
6474 			 * We have not clipped the entry.  Make sure that
6475 			 * the start address is in range so that the lookup
6476 			 * below will succeed.
6477 			 * "s" is the current starting point: we've already
6478 			 * wired from "start" to "s" and we still have
6479 			 * to wire from "s" to "end".
6480 			 */
6481 
6482 			entry->needs_wakeup = TRUE;
6483 
6484 			/*
6485 			 * wake up anybody waiting on entries that we have
6486 			 * already wired.
6487 			 */
6488 			if (need_wakeup) {
6489 				vm_map_entry_wakeup(map);
6490 				need_wakeup = FALSE;
6491 			}
6492 			/*
6493 			 * User wiring is interruptible
6494 			 */
6495 			wait_result = vm_map_entry_wait(map,
6496 			    (user_wire) ? THREAD_ABORTSAFE :
6497 			    THREAD_UNINT);
6498 			if (user_wire && wait_result == THREAD_INTERRUPTED) {
6499 				/*
6500 				 * undo the wirings we have done so far
6501 				 * We do not clear the needs_wakeup flag,
6502 				 * because we cannot tell if we were the
6503 				 * only one waiting.
6504 				 */
6505 				rc = KERN_FAILURE;
6506 				goto done;
6507 			}
6508 
6509 			/*
6510 			 * Cannot avoid a lookup here. reset timestamp.
6511 			 */
6512 			last_timestamp = map->timestamp;
6513 
6514 			/*
6515 			 * The entry could have been clipped, look it up again.
6516 			 * Worse that can happen is, it may not exist anymore.
6517 			 */
6518 			if (!vm_map_lookup_entry(map, s, &first_entry)) {
6519 				/*
6520 				 * User: undo everything upto the previous
6521 				 * entry.  let vm_map_unwire worry about
6522 				 * checking the validity of the range.
6523 				 */
6524 				rc = KERN_FAILURE;
6525 				goto done;
6526 			}
6527 			entry = first_entry;
6528 			continue;
6529 		}
6530 
6531 		if (entry->is_sub_map) {
6532 			vm_map_offset_t sub_start;
6533 			vm_map_offset_t sub_end;
6534 			vm_map_offset_t local_start;
6535 			vm_map_offset_t local_end;
6536 			pmap_t          pmap;
6537 
6538 			if (wire_and_extract) {
6539 				/*
6540 				 * Wiring would result in copy-on-write
6541 				 * which would not be compatible with
6542 				 * the sharing we have with the original
6543 				 * provider of this memory.
6544 				 */
6545 				rc = KERN_INVALID_ARGUMENT;
6546 				goto done;
6547 			}
6548 
6549 			vm_map_clip_start(map, entry, s);
6550 			vm_map_clip_end(map, entry, end);
6551 
6552 			sub_start = VME_OFFSET(entry);
6553 			sub_end = entry->vme_end;
6554 			sub_end += VME_OFFSET(entry) - entry->vme_start;
6555 
6556 			local_end = entry->vme_end;
6557 			if (map_pmap == NULL) {
6558 				vm_object_t             object;
6559 				vm_object_offset_t      offset;
6560 				vm_prot_t               prot;
6561 				boolean_t               wired;
6562 				vm_map_entry_t          local_entry;
6563 				vm_map_version_t         version;
6564 				vm_map_t                lookup_map;
6565 
6566 				if (entry->use_pmap) {
6567 					pmap = VME_SUBMAP(entry)->pmap;
6568 					/* ppc implementation requires that */
6569 					/* submaps pmap address ranges line */
6570 					/* up with parent map */
6571 #ifdef notdef
6572 					pmap_addr = sub_start;
6573 #endif
6574 					pmap_addr = s;
6575 				} else {
6576 					pmap = map->pmap;
6577 					pmap_addr = s;
6578 				}
6579 
6580 				if (entry->wired_count) {
6581 					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6582 						goto done;
6583 					}
6584 
6585 					/*
6586 					 * The map was not unlocked:
6587 					 * no need to goto re-lookup.
6588 					 * Just go directly to next entry.
6589 					 */
6590 					entry = entry->vme_next;
6591 					s = entry->vme_start;
6592 					continue;
6593 				}
6594 
6595 				/* call vm_map_lookup_and_lock_object to */
6596 				/* cause any needs copy to be   */
6597 				/* evaluated */
6598 				local_start = entry->vme_start;
6599 				lookup_map = map;
6600 				vm_map_lock_write_to_read(map);
6601 				rc = vm_map_lookup_and_lock_object(
6602 					&lookup_map, local_start,
6603 					(access_type | extra_prots),
6604 					OBJECT_LOCK_EXCLUSIVE,
6605 					&version, &object,
6606 					&offset, &prot, &wired,
6607 					NULL,
6608 					&real_map, NULL);
6609 				if (rc != KERN_SUCCESS) {
6610 					vm_map_unlock_read(lookup_map);
6611 					assert(map_pmap == NULL);
6612 					vm_map_unwire(map, start,
6613 					    s, user_wire);
6614 					return rc;
6615 				}
6616 				vm_object_unlock(object);
6617 				if (real_map != lookup_map) {
6618 					vm_map_unlock(real_map);
6619 				}
6620 				vm_map_unlock_read(lookup_map);
6621 				vm_map_lock(map);
6622 
6623 				/* we unlocked, so must re-lookup */
6624 				if (!vm_map_lookup_entry(map,
6625 				    local_start,
6626 				    &local_entry)) {
6627 					rc = KERN_FAILURE;
6628 					goto done;
6629 				}
6630 
6631 				/*
6632 				 * entry could have been "simplified",
6633 				 * so re-clip
6634 				 */
6635 				entry = local_entry;
6636 				assert(s == local_start);
6637 				vm_map_clip_start(map, entry, s);
6638 				vm_map_clip_end(map, entry, end);
6639 				/* re-compute "e" */
6640 				e = entry->vme_end;
6641 				if (e > end) {
6642 					e = end;
6643 				}
6644 
6645 				/* did we have a change of type? */
6646 				if (!entry->is_sub_map) {
6647 					last_timestamp = map->timestamp;
6648 					continue;
6649 				}
6650 			} else {
6651 				local_start = entry->vme_start;
6652 				pmap = map_pmap;
6653 			}
6654 
6655 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6656 				goto done;
6657 			}
6658 
6659 			entry->in_transition = TRUE;
6660 
6661 			vm_map_unlock(map);
6662 			rc = vm_map_wire_nested(VME_SUBMAP(entry),
6663 			    sub_start, sub_end,
6664 			    caller_prot, tag,
6665 			    user_wire, pmap, pmap_addr,
6666 			    NULL);
6667 			vm_map_lock(map);
6668 
6669 			/*
6670 			 * Find the entry again.  It could have been clipped
6671 			 * after we unlocked the map.
6672 			 */
6673 			if (!vm_map_lookup_entry(map, local_start,
6674 			    &first_entry)) {
6675 				panic("vm_map_wire: re-lookup failed");
6676 			}
6677 			entry = first_entry;
6678 
6679 			assert(local_start == s);
6680 			/* re-compute "e" */
6681 			e = entry->vme_end;
6682 			if (e > end) {
6683 				e = end;
6684 			}
6685 
6686 			last_timestamp = map->timestamp;
6687 			while ((entry != vm_map_to_entry(map)) &&
6688 			    (entry->vme_start < e)) {
6689 				assert(entry->in_transition);
6690 				entry->in_transition = FALSE;
6691 				if (entry->needs_wakeup) {
6692 					entry->needs_wakeup = FALSE;
6693 					need_wakeup = TRUE;
6694 				}
6695 				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6696 					subtract_wire_counts(map, entry, user_wire);
6697 				}
6698 				entry = entry->vme_next;
6699 			}
6700 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6701 				goto done;
6702 			}
6703 
6704 			/* no need to relookup again */
6705 			s = entry->vme_start;
6706 			continue;
6707 		}
6708 
6709 		/*
6710 		 * If this entry is already wired then increment
6711 		 * the appropriate wire reference count.
6712 		 */
6713 		if (entry->wired_count) {
6714 			if ((entry->protection & access_type) != access_type) {
6715 				/* found a protection problem */
6716 
6717 				/*
6718 				 * XXX FBDP
6719 				 * We should always return an error
6720 				 * in this case but since we didn't
6721 				 * enforce it before, let's do
6722 				 * it only for the new "wire_and_extract"
6723 				 * code path for now...
6724 				 */
6725 				if (wire_and_extract) {
6726 					rc = KERN_PROTECTION_FAILURE;
6727 					goto done;
6728 				}
6729 			}
6730 
6731 			/*
6732 			 * entry is already wired down, get our reference
6733 			 * after clipping to our range.
6734 			 */
6735 			vm_map_clip_start(map, entry, s);
6736 			vm_map_clip_end(map, entry, end);
6737 
6738 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6739 				goto done;
6740 			}
6741 
6742 			if (wire_and_extract) {
6743 				vm_object_t             object;
6744 				vm_object_offset_t      offset;
6745 				vm_page_t               m;
6746 
6747 				/*
6748 				 * We don't have to "wire" the page again
6749 				 * bit we still have to "extract" its
6750 				 * physical page number, after some sanity
6751 				 * checks.
6752 				 */
6753 				assert((entry->vme_end - entry->vme_start)
6754 				    == PAGE_SIZE);
6755 				assert(!entry->needs_copy);
6756 				assert(!entry->is_sub_map);
6757 				assert(VME_OBJECT(entry));
6758 				if (((entry->vme_end - entry->vme_start)
6759 				    != PAGE_SIZE) ||
6760 				    entry->needs_copy ||
6761 				    entry->is_sub_map ||
6762 				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
6763 					rc = KERN_INVALID_ARGUMENT;
6764 					goto done;
6765 				}
6766 
6767 				object = VME_OBJECT(entry);
6768 				offset = VME_OFFSET(entry);
6769 				/* need exclusive lock to update m->dirty */
6770 				if (entry->protection & VM_PROT_WRITE) {
6771 					vm_object_lock(object);
6772 				} else {
6773 					vm_object_lock_shared(object);
6774 				}
6775 				m = vm_page_lookup(object, offset);
6776 				assert(m != VM_PAGE_NULL);
6777 				assert(VM_PAGE_WIRED(m));
6778 				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6779 					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6780 					if (entry->protection & VM_PROT_WRITE) {
6781 						vm_object_lock_assert_exclusive(
6782 							object);
6783 						m->vmp_dirty = TRUE;
6784 					}
6785 				} else {
6786 					/* not already wired !? */
6787 					*physpage_p = 0;
6788 				}
6789 				vm_object_unlock(object);
6790 			}
6791 
6792 			/* map was not unlocked: no need to relookup */
6793 			entry = entry->vme_next;
6794 			s = entry->vme_start;
6795 			continue;
6796 		}
6797 
6798 		/*
6799 		 * Unwired entry or wire request transmitted via submap
6800 		 */
6801 
6802 		/*
6803 		 * Wiring would copy the pages to the shadow object.
6804 		 * The shadow object would not be code-signed so
6805 		 * attempting to execute code from these copied pages
6806 		 * would trigger a code-signing violation.
6807 		 */
6808 
6809 		if ((entry->protection & VM_PROT_EXECUTE)
6810 #if XNU_TARGET_OS_OSX
6811 		    &&
6812 		    map->pmap != kernel_pmap &&
6813 		    (vm_map_cs_enforcement(map)
6814 #if __arm64__
6815 		    || !VM_MAP_IS_EXOTIC(map)
6816 #endif /* __arm64__ */
6817 		    )
6818 #endif /* XNU_TARGET_OS_OSX */
6819 		    ) {
6820 #if MACH_ASSERT
6821 			printf("pid %d[%s] wiring executable range from "
6822 			    "0x%llx to 0x%llx: rejected to preserve "
6823 			    "code-signing\n",
6824 			    proc_selfpid(),
6825 			    (get_bsdtask_info(current_task())
6826 			    ? proc_name_address(get_bsdtask_info(current_task()))
6827 			    : "?"),
6828 			    (uint64_t) entry->vme_start,
6829 			    (uint64_t) entry->vme_end);
6830 #endif /* MACH_ASSERT */
6831 			DTRACE_VM2(cs_executable_wire,
6832 			    uint64_t, (uint64_t)entry->vme_start,
6833 			    uint64_t, (uint64_t)entry->vme_end);
6834 			cs_executable_wire++;
6835 			rc = KERN_PROTECTION_FAILURE;
6836 			goto done;
6837 		}
6838 
6839 		/*
6840 		 * Perform actions of vm_map_lookup that need the write
6841 		 * lock on the map: create a shadow object for a
6842 		 * copy-on-write region, or an object for a zero-fill
6843 		 * region.
6844 		 */
6845 		size = entry->vme_end - entry->vme_start;
6846 		/*
6847 		 * If wiring a copy-on-write page, we need to copy it now
6848 		 * even if we're only (currently) requesting read access.
6849 		 * This is aggressive, but once it's wired we can't move it.
6850 		 */
6851 		if (entry->needs_copy) {
6852 			if (wire_and_extract) {
6853 				/*
6854 				 * We're supposed to share with the original
6855 				 * provider so should not be "needs_copy"
6856 				 */
6857 				rc = KERN_INVALID_ARGUMENT;
6858 				goto done;
6859 			}
6860 
6861 			VME_OBJECT_SHADOW(entry, size,
6862 			    vm_map_always_shadow(map));
6863 			entry->needs_copy = FALSE;
6864 		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6865 			if (wire_and_extract) {
6866 				/*
6867 				 * We're supposed to share with the original
6868 				 * provider so should already have an object.
6869 				 */
6870 				rc = KERN_INVALID_ARGUMENT;
6871 				goto done;
6872 			}
6873 			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
6874 			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6875 			assert(entry->use_pmap);
6876 		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6877 			if (wire_and_extract) {
6878 				/*
6879 				 * We're supposed to share with the original
6880 				 * provider so should not be COPY_SYMMETRIC.
6881 				 */
6882 				rc = KERN_INVALID_ARGUMENT;
6883 				goto done;
6884 			}
6885 			/*
6886 			 * Force an unrequested "copy-on-write" but only for
6887 			 * the range we're wiring.
6888 			 */
6889 //			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
6890 			vm_map_clip_start(map, entry, s);
6891 			vm_map_clip_end(map, entry, end);
6892 			/* recompute "size" */
6893 			size = entry->vme_end - entry->vme_start;
6894 			/* make a shadow object */
6895 			vm_object_t orig_object;
6896 			vm_object_offset_t orig_offset;
6897 			orig_object = VME_OBJECT(entry);
6898 			orig_offset = VME_OFFSET(entry);
6899 			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
6900 			if (VME_OBJECT(entry) != orig_object) {
6901 				/*
6902 				 * This mapping has not been shared (or it would be
6903 				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
6904 				 * not been copied-on-write (or it would be marked
6905 				 * as "needs_copy" and would have been handled above
6906 				 * and also already write-protected).
6907 				 * We still need to write-protect here to prevent
6908 				 * other threads from modifying these pages while
6909 				 * we're in the process of copying and wiring
6910 				 * the copied pages.
6911 				 * Since the mapping is neither shared nor COWed,
6912 				 * we only need to write-protect the PTEs for this
6913 				 * mapping.
6914 				 */
6915 				vm_object_pmap_protect(orig_object,
6916 				    orig_offset,
6917 				    size,
6918 				    map->pmap,
6919 				    VM_MAP_PAGE_SIZE(map),
6920 				    entry->vme_start,
6921 				    entry->protection & ~VM_PROT_WRITE);
6922 			}
6923 		}
6924 		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6925 			/*
6926 			 * Make the object COPY_DELAY to get a stable object
6927 			 * to wire.
6928 			 * That should avoid creating long shadow chains while
6929 			 * wiring/unwiring the same range repeatedly.
6930 			 * That also prevents part of the object from being
6931 			 * wired while another part is "needs_copy", which
6932 			 * could result in conflicting rules wrt copy-on-write.
6933 			 */
6934 			vm_object_t object;
6935 
6936 			object = VME_OBJECT(entry);
6937 			vm_object_lock(object);
6938 			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6939 				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
6940 				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
6941 				    object, (uint64_t)object->vo_size,
6942 				    entry,
6943 				    (uint64_t)entry->vme_start,
6944 				    (uint64_t)entry->vme_end,
6945 				    (uint64_t)VME_OFFSET(entry),
6946 				    (uint64_t)size);
6947 				assertf(object->ref_count == 1,
6948 				    "object %p ref_count %d\n",
6949 				    object, object->ref_count);
6950 				assertf(!entry->needs_copy,
6951 				    "entry %p\n", entry);
6952 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
6953 				object->true_share = TRUE;
6954 			}
6955 			vm_object_unlock(object);
6956 		}
6957 
6958 		vm_map_clip_start(map, entry, s);
6959 		vm_map_clip_end(map, entry, end);
6960 
6961 		/* re-compute "e" */
6962 		e = entry->vme_end;
6963 		if (e > end) {
6964 			e = end;
6965 		}
6966 
6967 		/*
6968 		 * Check for holes and protection mismatch.
6969 		 * Holes: Next entry should be contiguous unless this
6970 		 *	  is the end of the region.
6971 		 * Protection: Access requested must be allowed, unless
6972 		 *	wiring is by protection class
6973 		 */
6974 		if ((entry->vme_end < end) &&
6975 		    ((entry->vme_next == vm_map_to_entry(map)) ||
6976 		    (entry->vme_next->vme_start > entry->vme_end))) {
6977 			/* found a hole */
6978 			rc = KERN_INVALID_ADDRESS;
6979 			goto done;
6980 		}
6981 		if ((entry->protection & access_type) != access_type) {
6982 			/* found a protection problem */
6983 			rc = KERN_PROTECTION_FAILURE;
6984 			goto done;
6985 		}
6986 
6987 		assert(entry->wired_count == 0 && entry->user_wired_count == 0);
6988 
6989 		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6990 			goto done;
6991 		}
6992 
6993 		entry->in_transition = TRUE;
6994 
6995 		/*
6996 		 * This entry might get split once we unlock the map.
6997 		 * In vm_fault_wire(), we need the current range as
6998 		 * defined by this entry.  In order for this to work
6999 		 * along with a simultaneous clip operation, we make a
7000 		 * temporary copy of this entry and use that for the
7001 		 * wiring.  Note that the underlying objects do not
7002 		 * change during a clip.
7003 		 */
7004 		tmp_entry = *entry;
7005 
7006 		/*
7007 		 * The in_transition state guarentees that the entry
7008 		 * (or entries for this range, if split occured) will be
7009 		 * there when the map lock is acquired for the second time.
7010 		 */
7011 		vm_map_unlock(map);
7012 
7013 		if (!user_wire && cur_thread != THREAD_NULL) {
7014 			interruptible_state = thread_interrupt_level(THREAD_UNINT);
7015 		} else {
7016 			interruptible_state = THREAD_UNINT;
7017 		}
7018 
7019 		if (map_pmap) {
7020 			rc = vm_fault_wire(map,
7021 			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7022 			    physpage_p);
7023 		} else {
7024 			rc = vm_fault_wire(map,
7025 			    &tmp_entry, caller_prot, tag, map->pmap,
7026 			    tmp_entry.vme_start,
7027 			    physpage_p);
7028 		}
7029 
7030 		if (!user_wire && cur_thread != THREAD_NULL) {
7031 			thread_interrupt_level(interruptible_state);
7032 		}
7033 
7034 		vm_map_lock(map);
7035 
7036 		if (last_timestamp + 1 != map->timestamp) {
7037 			/*
7038 			 * Find the entry again.  It could have been clipped
7039 			 * after we unlocked the map.
7040 			 */
7041 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7042 			    &first_entry)) {
7043 				panic("vm_map_wire: re-lookup failed");
7044 			}
7045 
7046 			entry = first_entry;
7047 		}
7048 
7049 		last_timestamp = map->timestamp;
7050 
7051 		while ((entry != vm_map_to_entry(map)) &&
7052 		    (entry->vme_start < tmp_entry.vme_end)) {
7053 			assert(entry->in_transition);
7054 			entry->in_transition = FALSE;
7055 			if (entry->needs_wakeup) {
7056 				entry->needs_wakeup = FALSE;
7057 				need_wakeup = TRUE;
7058 			}
7059 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
7060 				subtract_wire_counts(map, entry, user_wire);
7061 			}
7062 			entry = entry->vme_next;
7063 		}
7064 
7065 		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
7066 			goto done;
7067 		}
7068 
7069 		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7070 		    (tmp_entry.vme_end != end) &&    /* AND, we are not at the end of the requested range */
7071 		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7072 			/* found a "new" hole */
7073 			s = tmp_entry.vme_end;
7074 			rc = KERN_INVALID_ADDRESS;
7075 			goto done;
7076 		}
7077 
7078 		s = entry->vme_start;
7079 	} /* end while loop through map entries */
7080 
7081 done:
7082 	if (rc == KERN_SUCCESS) {
7083 		/* repair any damage we may have made to the VM map */
7084 		vm_map_simplify_range(map, start, end);
7085 	}
7086 
7087 	vm_map_unlock(map);
7088 
7089 	/*
7090 	 * wake up anybody waiting on entries we wired.
7091 	 */
7092 	if (need_wakeup) {
7093 		vm_map_entry_wakeup(map);
7094 	}
7095 
7096 	if (rc != KERN_SUCCESS) {
7097 		/* undo what has been wired so far */
7098 		vm_map_unwire_nested(map, start, s, user_wire,
7099 		    map_pmap, pmap_addr);
7100 		if (physpage_p) {
7101 			*physpage_p = 0;
7102 		}
7103 	}
7104 
7105 	return rc;
7106 }
7107 
7108 kern_return_t
vm_map_wire_external(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,boolean_t user_wire)7109 vm_map_wire_external(
7110 	vm_map_t                map,
7111 	vm_map_offset_t         start,
7112 	vm_map_offset_t         end,
7113 	vm_prot_t               caller_prot,
7114 	boolean_t               user_wire)
7115 {
7116 	kern_return_t   kret;
7117 
7118 	kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(),
7119 	    user_wire, (pmap_t)NULL, 0, NULL);
7120 	return kret;
7121 }
7122 
7123 kern_return_t
vm_map_wire_kernel(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire)7124 vm_map_wire_kernel(
7125 	vm_map_t                map,
7126 	vm_map_offset_t         start,
7127 	vm_map_offset_t         end,
7128 	vm_prot_t               caller_prot,
7129 	vm_tag_t                tag,
7130 	boolean_t               user_wire)
7131 {
7132 	kern_return_t   kret;
7133 
7134 	kret = vm_map_wire_nested(map, start, end, caller_prot, tag,
7135 	    user_wire, (pmap_t)NULL, 0, NULL);
7136 	return kret;
7137 }
7138 
7139 kern_return_t
vm_map_wire_and_extract_external(vm_map_t map,vm_map_offset_t start,vm_prot_t caller_prot,boolean_t user_wire,ppnum_t * physpage_p)7140 vm_map_wire_and_extract_external(
7141 	vm_map_t        map,
7142 	vm_map_offset_t start,
7143 	vm_prot_t       caller_prot,
7144 	boolean_t       user_wire,
7145 	ppnum_t         *physpage_p)
7146 {
7147 	kern_return_t   kret;
7148 
7149 	kret = vm_map_wire_nested(map,
7150 	    start,
7151 	    start + VM_MAP_PAGE_SIZE(map),
7152 	    caller_prot,
7153 	    vm_tag_bt(),
7154 	    user_wire,
7155 	    (pmap_t)NULL,
7156 	    0,
7157 	    physpage_p);
7158 	if (kret != KERN_SUCCESS &&
7159 	    physpage_p != NULL) {
7160 		*physpage_p = 0;
7161 	}
7162 	return kret;
7163 }
7164 
7165 kern_return_t
vm_map_wire_and_extract_kernel(vm_map_t map,vm_map_offset_t start,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire,ppnum_t * physpage_p)7166 vm_map_wire_and_extract_kernel(
7167 	vm_map_t        map,
7168 	vm_map_offset_t start,
7169 	vm_prot_t       caller_prot,
7170 	vm_tag_t        tag,
7171 	boolean_t       user_wire,
7172 	ppnum_t         *physpage_p)
7173 {
7174 	kern_return_t   kret;
7175 
7176 	kret = vm_map_wire_nested(map,
7177 	    start,
7178 	    start + VM_MAP_PAGE_SIZE(map),
7179 	    caller_prot,
7180 	    tag,
7181 	    user_wire,
7182 	    (pmap_t)NULL,
7183 	    0,
7184 	    physpage_p);
7185 	if (kret != KERN_SUCCESS &&
7186 	    physpage_p != NULL) {
7187 		*physpage_p = 0;
7188 	}
7189 	return kret;
7190 }
7191 
/*
 *	vm_map_unwire:
 *
 *	Sets the pageability of the specified address range in the target
 *	as pageable.  Regions specified must have been wired previously.
 *
 *	The map must not be locked, but a reference must remain to the map
 *	throughout the call.
 *
 *	Kernel will panic on failures.  User unwire ignores holes and
 *	unwired and intransition entries to avoid losing memory by leaving
 *	it unwired.
 */
/*
 *	vm_map_unwire_nested:
 *
 *	Worker for vm_map_unwire().  When "map_pmap" is non-NULL this is a
 *	recursive call unwiring a submap's range on behalf of a parent map,
 *	and physical unwiring is applied to "map_pmap" starting at
 *	"pmap_addr" instead of the submap's own pmap.
 *
 *	Locking protocol: the map lock is dropped around vm_fault_unwire()
 *	and recursive submap calls; "in_transition" pins the affected
 *	entries while unlocked, and "last_timestamp" is compared against
 *	map->timestamp to decide whether a re-lookup is needed after
 *	relocking.
 */
static kern_return_t
vm_map_unwire_nested(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	boolean_t               user_wire,
	pmap_t                  map_pmap,
	vm_map_offset_t         pmap_addr)
{
	vm_map_entry_t          entry;
	struct vm_map_entry     *first_entry, tmp_entry;
	boolean_t               need_wakeup;
	boolean_t               main_map = FALSE;
	unsigned int            last_timestamp;

	vm_map_lock(map);
	if (map_pmap == NULL) {
		/*
		 * NOTE(review): "main_map" is set but not read anywhere in
		 * this function in this revision — presumably kept for
		 * symmetry with vm_map_wire_nested(); confirm before removal.
		 */
		main_map = TRUE;
	}
	last_timestamp = map->timestamp;

	VM_MAP_RANGE_CHECK(map, start, end);
	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));

	if (start == end) {
		/* We unwired what the caller asked for: zero pages */
		vm_map_unlock(map);
		return KERN_SUCCESS;
	}

	if (vm_map_lookup_entry(map, start, &first_entry)) {
		entry = first_entry;
		/*
		 * vm_map_clip_start will be done later.
		 * We don't want to unnest any nested sub maps here !
		 */
	} else {
		if (!user_wire) {
			panic("vm_map_unwire: start not found");
		}
		/*	Start address is not in map. */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	if (entry->superpage_size) {
		/* superpages are always wired */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	need_wakeup = FALSE;
	/* walk every entry overlapping [start, end) */
	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
		if (entry->in_transition) {
			/*
			 * 1)
			 * Another thread is wiring down this entry. Note
			 * that if it is not for the other thread we would
			 * be unwiring an unwired entry.  This is not
			 * permitted.  If we wait, we will be unwiring memory
			 * we did not wire.
			 *
			 * 2)
			 * Another thread is unwiring this entry.  We did not
			 * have a reference to it, because if we did, this
			 * entry will not be getting unwired now.
			 */
			if (!user_wire) {
				/*
				 * XXX FBDP
				 * This could happen:  there could be some
				 * overlapping vslock/vsunlock operations
				 * going on.
				 * We should probably just wait and retry,
				 * but then we have to be careful that this
				 * entry could get "simplified" after
				 * "in_transition" gets unset and before
				 * we re-lookup the entry, so we would
				 * have to re-clip the entry to avoid
				 * re-unwiring what we have already unwired...
				 * See vm_map_wire_nested().
				 *
				 * Or we could just ignore "in_transition"
				 * here and proceed to decement the wired
				 * count(s) on this entry.  That should be fine
				 * as long as "wired_count" doesn't drop all
				 * the way to 0 (and we should panic if THAT
				 * happens).
				 */
				panic("vm_map_unwire: in_transition entry");
			}

			entry = entry->vme_next;
			continue;
		}

		if (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;
			pmap_t          pmap;

			vm_map_clip_start(map, entry, start);
			vm_map_clip_end(map, entry, end);

			/* translate this entry's range into submap offsets */
			sub_start = VME_OFFSET(entry);
			sub_end = entry->vme_end - entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			if (map_pmap == NULL) {
				/*
				 * Top-level (non-recursive) call: pick the
				 * pmap the submap's pages are actually
				 * mapped in.
				 */
				if (entry->use_pmap) {
					pmap = VME_SUBMAP(entry)->pmap;
					pmap_addr = sub_start;
				} else {
					pmap = map->pmap;
					pmap_addr = start;
				}
				if (entry->wired_count == 0 ||
				    (user_wire && entry->user_wired_count == 0)) {
					if (!user_wire) {
						panic("vm_map_unwire: entry is unwired");
					}
					entry = entry->vme_next;
					continue;
				}

				/*
				 * Check for holes
				 * Holes: Next entry should be contiguous unless
				 * this is the end of the region.
				 */
				if (((entry->vme_end < end) &&
				    ((entry->vme_next == vm_map_to_entry(map)) ||
				    (entry->vme_next->vme_start
				    > entry->vme_end)))) {
					if (!user_wire) {
						panic("vm_map_unwire: non-contiguous region");
					}
/*
 *                                       entry = entry->vme_next;
 *                                       continue;
 */
				}

				subtract_wire_counts(map, entry, user_wire);

				if (entry->wired_count != 0) {
					/* still wired by someone else: done with this entry */
					entry = entry->vme_next;
					continue;
				}

				entry->in_transition = TRUE;
				tmp_entry = *entry;/* see comment in vm_map_wire() */

				/*
				 * We can unlock the map now. The in_transition state
				 * guarantees existance of the entry.
				 */
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, pmap, pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;

				/*
				 * clear transition bit for all constituent entries
				 * that were in the original entry (saved in
				 * tmp_entry).  Also check for waiters.
				 */
				while ((entry != vm_map_to_entry(map)) &&
				    (entry->vme_start < tmp_entry.vme_end)) {
					assert(entry->in_transition);
					entry->in_transition = FALSE;
					if (entry->needs_wakeup) {
						entry->needs_wakeup = FALSE;
						need_wakeup = TRUE;
					}
					entry = entry->vme_next;
				}
				continue;
			} else {
				/*
				 * Recursive call: unwire the nested submap
				 * against the caller-provided pmap, then fall
				 * through to the common wired-count handling
				 * below for this entry.
				 */
				tmp_entry = *entry;
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, map_pmap,
				    pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;
			}
		}


		if ((entry->wired_count == 0) ||
		    (user_wire && entry->user_wired_count == 0)) {
			if (!user_wire) {
				panic("vm_map_unwire: entry is unwired");
			}

			entry = entry->vme_next;
			continue;
		}

		assert(entry->wired_count > 0 &&
		    (!user_wire || entry->user_wired_count > 0));

		vm_map_clip_start(map, entry, start);
		vm_map_clip_end(map, entry, end);

		/*
		 * Check for holes
		 * Holes: Next entry should be contiguous unless
		 *	  this is the end of the region.
		 */
		if (((entry->vme_end < end) &&
		    ((entry->vme_next == vm_map_to_entry(map)) ||
		    (entry->vme_next->vme_start > entry->vme_end)))) {
			if (!user_wire) {
				panic("vm_map_unwire: non-contiguous region");
			}
			entry = entry->vme_next;
			continue;
		}

		subtract_wire_counts(map, entry, user_wire);

		if (entry->wired_count != 0) {
			/* other wirings remain: no physical unwire yet */
			entry = entry->vme_next;
			continue;
		}

		if (entry->zero_wired_pages) {
			entry->zero_wired_pages = FALSE;
		}

		entry->in_transition = TRUE;
		tmp_entry = *entry;     /* see comment in vm_map_wire() */

		/*
		 * We can unlock the map now. The in_transition state
		 * guarantees existance of the entry.
		 */
		vm_map_unlock(map);
		if (map_pmap) {
			vm_fault_unwire(map,
			    &tmp_entry, FALSE, map_pmap, pmap_addr);
		} else {
			vm_fault_unwire(map,
			    &tmp_entry, FALSE, map->pmap,
			    tmp_entry.vme_start);
		}
		vm_map_lock(map);

		if (last_timestamp + 1 != map->timestamp) {
			/*
			 * Find the entry again.  It could have been clipped
			 * or deleted after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
			    &first_entry)) {
				if (!user_wire) {
					panic("vm_map_unwire: re-lookup failed");
				}
				entry = first_entry->vme_next;
			} else {
				entry = first_entry;
			}
		}
		last_timestamp = map->timestamp;

		/*
		 * clear transition bit for all constituent entries that
		 * were in the original entry (saved in tmp_entry).  Also
		 * check for waiters.
		 */
		while ((entry != vm_map_to_entry(map)) &&
		    (entry->vme_start < tmp_entry.vme_end)) {
			assert(entry->in_transition);
			entry->in_transition = FALSE;
			if (entry->needs_wakeup) {
				entry->needs_wakeup = FALSE;
				need_wakeup = TRUE;
			}
			entry = entry->vme_next;
		}
	}

	/*
	 * We might have fragmented the address space when we wired this
	 * range of addresses.  Attempt to re-coalesce these VM map entries
	 * with their neighbors now that they're no longer wired.
	 * Under some circumstances, address space fragmentation can
	 * prevent VM object shadow chain collapsing, which can cause
	 * swap space leaks.
	 */
	vm_map_simplify_range(map, start, end);

	vm_map_unlock(map);
	/*
	 * wake up anybody waiting on entries that we have unwired.
	 */
	if (need_wakeup) {
		vm_map_entry_wakeup(map);
	}
	return KERN_SUCCESS;
}
7545 
7546 kern_return_t
vm_map_unwire(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,boolean_t user_wire)7547 vm_map_unwire(
7548 	vm_map_t                map,
7549 	vm_map_offset_t         start,
7550 	vm_map_offset_t         end,
7551 	boolean_t               user_wire)
7552 {
7553 	return vm_map_unwire_nested(map, start, end,
7554 	           user_wire, (pmap_t)NULL, 0);
7555 }
7556 
7557 
7558 /*
7559  *	vm_map_entry_zap:	[ internal use only ]
7560  *
7561  *	Remove the entry from the target map
7562  *	and put it on a zap list.
7563  */
7564 static void
vm_map_entry_zap(vm_map_t map,vm_map_entry_t entry,vm_map_zap_t zap)7565 vm_map_entry_zap(
7566 	vm_map_t                map,
7567 	vm_map_entry_t          entry,
7568 	vm_map_zap_t            zap)
7569 {
7570 	vm_map_offset_t s, e;
7571 
7572 	s = entry->vme_start;
7573 	e = entry->vme_end;
7574 	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7575 	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7576 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7577 		assert(page_aligned(s));
7578 		assert(page_aligned(e));
7579 	}
7580 	if (entry->map_aligned == TRUE) {
7581 		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7582 		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7583 	}
7584 	assert(entry->wired_count == 0);
7585 	assert(entry->user_wired_count == 0);
7586 	assert(!entry->vme_permanent);
7587 
7588 	vm_map_store_entry_unlink(map, entry, false);
7589 	map->size -= e - s;
7590 
7591 	vm_map_zap_append(zap, entry);
7592 }
7593 
/*
 *	vm_map_submap_pmap_clean:
 *
 *	Remove physical mappings for the portion of "sub_map" that backs
 *	the range [start, end) of "map", where "offset" is the submap
 *	offset corresponding to "start".  Recurses into nested submaps.
 *	Takes the submap's lock for reading; the caller is expected to
 *	hold whatever lock protects "map"'s entry.
 */
static void
vm_map_submap_pmap_clean(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_t        sub_map,
	vm_map_offset_t offset)
{
	vm_map_offset_t submap_start;
	vm_map_offset_t submap_end;
	vm_map_size_t   remove_size;
	vm_map_entry_t  entry;

	/* translate the parent range into submap coordinates */
	submap_end = offset + (end - start);
	submap_start = offset;

	vm_map_lock_read(sub_map);
	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
		/* first entry may only partially overlap: trim both ends */
		remove_size = (entry->vme_end - entry->vme_start);
		if (offset > entry->vme_start) {
			remove_size -= offset - entry->vme_start;
		}


		if (submap_end < entry->vme_end) {
			remove_size -=
			    entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			/* nested submap: recurse with the trimmed range */
			vm_map_submap_pmap_clean(
				sub_map,
				start,
				start + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				/*
				 * Object may be mapped by other pmaps too:
				 * strip mappings object-wide rather than
				 * only from this map's pmap.
				 */
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					(VME_OFFSET(entry) +
					offset -
					entry->vme_start),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				pmap_remove(map->pmap,
				    (addr64_t)start,
				    (addr64_t)(start + remove_size));
			}
		}
	}

	/*
	 * NOTE(review): this advance is executed even when the lookup above
	 * failed — presumably relying on vm_map_lookup_entry() leaving
	 * "entry" at the predecessor of "offset" on failure, so vme_next is
	 * the first entry in range; confirm against vm_map_lookup_entry().
	 */
	entry = entry->vme_next;

	/* handle the remaining, fully-aligned-at-start entries in range */
	while ((entry != vm_map_to_entry(sub_map))
	    && (entry->vme_start < submap_end)) {
		remove_size = (entry->vme_end - entry->vme_start);
		if (submap_end < entry->vme_end) {
			remove_size -= entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			vm_map_submap_pmap_clean(
				sub_map,
				(start + entry->vme_start) - offset,
				((start + entry->vme_start) - offset) + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					VME_OFFSET(entry),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				pmap_remove(map->pmap,
				    (addr64_t)((start + entry->vme_start)
				    - offset),
				    (addr64_t)(((start + entry->vme_start)
				    - offset) + remove_size));
			}
		}
		entry = entry->vme_next;
	}
	vm_map_unlock_read(sub_map);
	return;
}
7693 
/*
 *     virt_memory_guard_ast:
 *
 *     Handle the AST callout for a virtual memory guard:
 *     raise an EXC_GUARD exception and terminate the task
 *     if configured to do so.
 */
void
virt_memory_guard_ast(
	thread_t thread,
	mach_exception_data_type_t code,
	mach_exception_data_type_t subcode)
{
	task_t task = get_threadtask(thread);
	assert(task != kernel_task);
	assert(task == current_task());
	kern_return_t sync_exception_result;
	uint32_t behavior;

	behavior = task->task_exc_guard;

	/* Is delivery enabled */
	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
		return;
	}

	/* If only once, make sure we're that once */
	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;

		/* atomically clear DELIVER so only one thread delivers */
		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
			break;
		}
		/* CAS lost a race: reload and re-check delivery is still enabled */
		behavior = task->task_exc_guard;
		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
			return;
		}
	}

	/* Raise exception synchronously and see if handler claimed it */
	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode);

	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
		/*
		 * If Synchronous EXC_GUARD delivery was successful then
		 * kill the process and return, else kill the process
		 * and deliver the exception via EXC_CORPSE_NOTIFY.
		 */
		if (sync_exception_result == KERN_SUCCESS) {
			task_bsdtask_kill(current_task());
		} else {
			exit_with_guard_exception(current_proc(), code, subcode);
		}
	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
		/*
		 * If the synchronous EXC_GUARD delivery was not successful,
		 * raise a simulated crash.
		 */
		if (sync_exception_result != KERN_SUCCESS) {
			task_violated_guard(code, subcode, NULL, FALSE);
		}
	}
}
7757 
7758 /*
7759  *     vm_map_guard_exception:
7760  *
7761  *     Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7762  *
7763  *     Right now, we do this when we find nothing mapped, or a
7764  *     gap in the mapping when a user address space deallocate
7765  *     was requested. We report the address of the first gap found.
7766  */
7767 static void
vm_map_guard_exception(vm_map_offset_t gap_start,unsigned reason)7768 vm_map_guard_exception(
7769 	vm_map_offset_t gap_start,
7770 	unsigned reason)
7771 {
7772 	mach_exception_code_t code = 0;
7773 	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7774 	unsigned int target = 0; /* should we pass in pid associated with map? */
7775 	mach_exception_data_type_t subcode = (uint64_t)gap_start;
7776 	boolean_t fatal = FALSE;
7777 
7778 	task_t task = current_task_early();
7779 
7780 	/* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7781 	if (task == NULL || task == kernel_task) {
7782 		return;
7783 	}
7784 
7785 	EXC_GUARD_ENCODE_TYPE(code, guard_type);
7786 	EXC_GUARD_ENCODE_FLAVOR(code, reason);
7787 	EXC_GUARD_ENCODE_TARGET(code, target);
7788 
7789 	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7790 		fatal = TRUE;
7791 	}
7792 	thread_guard_violation(current_thread(), code, subcode, fatal);
7793 }
7794 
7795 static kern_return_t
vm_map_delete_submap_recurse(vm_map_t submap,vm_map_offset_t submap_start,vm_map_offset_t submap_end)7796 vm_map_delete_submap_recurse(
7797 	vm_map_t submap,
7798 	vm_map_offset_t submap_start,
7799 	vm_map_offset_t submap_end)
7800 {
7801 	vm_map_entry_t submap_entry;
7802 
7803 	/*
7804 	 * Verify that the submap does not contain any "permanent" entries
7805 	 * within the specified range.
7806 	 * We do not care about gaps.
7807 	 */
7808 
7809 	vm_map_lock(submap);
7810 
7811 	if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7812 		submap_entry = submap_entry->vme_next;
7813 	}
7814 
7815 	for (;
7816 	    submap_entry != vm_map_to_entry(submap) &&
7817 	    submap_entry->vme_start < submap_end;
7818 	    submap_entry = submap_entry->vme_next) {
7819 		if (submap_entry->vme_permanent) {
7820 			/* "permanent" entry -> fail */
7821 			vm_map_unlock(submap);
7822 			return KERN_PROTECTION_FAILURE;
7823 		}
7824 	}
7825 	/* no "permanent" entries in the range -> success */
7826 	vm_map_unlock(submap);
7827 	return KERN_SUCCESS;
7828 }
7829 
/* Out-of-line __abortlike helper: "start" is not aligned to the map's page size. */
__abortlike
static void
__vm_map_delete_misaligned_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
}
7840 
/* Out-of-line __abortlike helper: vm_map_delete() failed with "kr" where failure is not tolerated. */
__abortlike
static void
__vm_map_delete_failed_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	kern_return_t           kr)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
	    map, (uint64_t)start, (uint64_t)end, kr);
}
7852 
/* Out-of-line __abortlike helper: no map entry found at "where" inside [start, end). */
__abortlike
static void
__vm_map_delete_gap_panic(
	vm_map_t                map,
	vm_map_offset_t         where,
	vm_map_offset_t         start,
	vm_map_offset_t         end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
}
7864 
/* Out-of-line __abortlike helper: attempted removal of a "permanent" entry. */
__abortlike
static void
__vm_map_delete_permanent_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_map_entry_t          entry)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): "
	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
	    map, (uint64_t)start, (uint64_t)end, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end);
}
7879 
/*
 * State bits carried across the main loop of vm_map_delete().
 */
__options_decl(vm_map_delete_state_t, uint32_t, {
	VMDS_NONE               = 0x0000,

	VMDS_FOUND_GAP          = 0x0001,       /* a hole was found in [start, end) */
	VMDS_GAPS_OK            = 0x0002,       /* map is terminated / unreferenced: gaps allowed */

	VMDS_KERNEL_PMAP        = 0x0004,       /* map uses the kernel pmap: gaps and permanent entries panic */
	VMDS_NEEDS_LOOKUP       = 0x0008,       /* map lock was dropped: re-lookup the entry */
	VMDS_NEEDS_WAKEUP       = 0x0010,       /* waiters on processed entries need a wakeup */
});
7890 
7891 /*
7892  *	vm_map_delete:	[ internal use only ]
7893  *
7894  *	Deallocates the given address range from the target map.
7895  *	Removes all user wirings. Unwires one kernel wiring if
7896  *	VM_MAP_REMOVE_KUNWIRE is set.  Waits for kernel wirings to go
7897  *	away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set.  Sleeps
7898  *	interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
7899  *
7900  *
7901  *	When the map is a kernel map, then any error in removing mappings
7902  *	will lead to a panic so that clients do not have to repeat the panic
7903  *	code at each call site.  If VM_MAP_REMOVE_INTERRUPTIBLE
7904  *	is also passed, then KERN_ABORTED will not lead to a panic.
7905  *
7906  *	This routine is called with map locked and leaves map locked.
7907  */
7908 static kmem_return_t
vm_map_delete(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard,vm_map_zap_t zap_list)7909 vm_map_delete(
7910 	vm_map_t                map,
7911 	vm_map_offset_t         start,
7912 	vm_map_offset_t         end,
7913 	vmr_flags_t             flags,
7914 	kmem_guard_t            guard,
7915 	vm_map_zap_t            zap_list)
7916 {
7917 	vm_map_entry_t          entry, next;
7918 	int                     interruptible;
7919 	vm_map_offset_t         gap_start = 0;
7920 	vm_map_offset_t         clear_in_transition_end = 0;
7921 	__unused vm_map_offset_t save_start = start;
7922 	__unused vm_map_offset_t save_end = end;
7923 	vm_map_delete_state_t   state = VMDS_NONE;
7924 	kmem_return_t           ret = { };
7925 
7926 	if (vm_map_pmap(map) == kernel_pmap) {
7927 		state |= VMDS_KERNEL_PMAP;
7928 	}
7929 
7930 	if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
7931 		state |= VMDS_GAPS_OK;
7932 	}
7933 
7934 	interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
7935 	    THREAD_ABORTSAFE : THREAD_UNINT;
7936 
7937 	if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
7938 	    (start & VM_MAP_PAGE_MASK(map))) {
7939 		__vm_map_delete_misaligned_panic(map, start, end);
7940 	}
7941 
7942 	if ((state & VMDS_GAPS_OK) == 0) {
7943 		/*
7944 		 * If the map isn't terminated then all deletions must have
7945 		 * no gaps, and be within the [min, max) of the map.
7946 		 *
7947 		 * We got here without VM_MAP_RANGE_CHECK() being called,
7948 		 * and hence must validate bounds manually.
7949 		 *
7950 		 * It is worth noting that because vm_deallocate() will
7951 		 * round_page() the deallocation size, it's possible for "end"
7952 		 * to be 0 here due to overflow. We hence must treat it as being
7953 		 * beyond vm_map_max(map).
7954 		 *
		 * Similarly, end < start means some wrap around happened,
7956 		 * which should cause an error or panic.
7957 		 */
7958 		if (end == 0 || end > vm_map_max(map)) {
7959 			state |= VMDS_FOUND_GAP;
7960 			gap_start = vm_map_max(map);
7961 			if (state & VMDS_KERNEL_PMAP) {
7962 				__vm_map_delete_gap_panic(map,
7963 				    gap_start, start, end);
7964 			}
7965 			goto out;
7966 		}
7967 
7968 		if (end < start) {
7969 			if (state & VMDS_KERNEL_PMAP) {
7970 				__vm_map_delete_gap_panic(map,
7971 				    vm_map_max(map), start, end);
7972 			}
7973 			ret.kmr_return = KERN_INVALID_ARGUMENT;
7974 			goto out;
7975 		}
7976 
7977 		if (start < vm_map_min(map)) {
7978 			state |= VMDS_FOUND_GAP;
7979 			gap_start = start;
7980 			if (state & VMDS_KERNEL_PMAP) {
7981 				__vm_map_delete_gap_panic(map,
7982 				    gap_start, start, end);
7983 			}
7984 			goto out;
7985 		}
7986 	} else {
7987 		/*
7988 		 * If the map is terminated, we must accept start/end
7989 		 * being beyond the boundaries of the map as this is
7990 		 * how some of the mappings like commpage mappings
7991 		 * can be destroyed (they're outside of those bounds).
7992 		 *
7993 		 * end < start is still something we can't cope with,
7994 		 * so just bail.
7995 		 */
7996 		if (end < start) {
7997 			goto out;
7998 		}
7999 	}
8000 
8001 
8002 	/*
8003 	 *	Find the start of the region.
8004 	 *
8005 	 *	If in a superpage, extend the range
8006 	 *	to include the start of the mapping.
8007 	 */
8008 	while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8009 		if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8010 			start = SUPERPAGE_ROUND_DOWN(start);
8011 		} else {
8012 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8013 			break;
8014 		}
8015 	}
8016 
8017 	if (entry->superpage_size) {
8018 		end = SUPERPAGE_ROUND_UP(end);
8019 	}
8020 
8021 	/*
8022 	 *	Step through all entries in this region
8023 	 */
8024 	for (vm_map_offset_t s = start; s < end;) {
8025 		/*
8026 		 * At this point, we have deleted all the memory entries
8027 		 * in [start, s) and are proceeding with the [s, end) range.
8028 		 *
8029 		 * This loop might drop the map lock, and it is possible that
8030 		 * some memory was already reallocated within [start, s)
8031 		 * and we don't want to mess with those entries.
8032 		 *
8033 		 * Some of those entries could even have been re-assembled
8034 		 * with an entry after "s" (in vm_map_simplify_entry()), so
8035 		 * we may have to vm_map_clip_start() again.
8036 		 *
8037 		 * When clear_in_transition_end is set, the we had marked
8038 		 * [start, clear_in_transition_end) as "in_transition"
8039 		 * during a previous iteration and we need to clear it.
8040 		 */
8041 
8042 		/*
8043 		 * Step 1: If needed (because we dropped locks),
8044 		 *         lookup the entry again.
8045 		 *
8046 		 *         If we're coming back from unwiring (Step 5),
8047 		 *         we also need to mark the entries as no longer
8048 		 *         in transition after that.
8049 		 */
8050 
8051 		if (state & VMDS_NEEDS_LOOKUP) {
8052 			state &= ~VMDS_NEEDS_LOOKUP;
8053 
8054 			if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8055 				SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8056 			}
8057 		}
8058 
8059 		if (clear_in_transition_end) {
8060 			for (vm_map_entry_t it = entry;
8061 			    it != vm_map_to_entry(map) &&
8062 			    it->vme_start < clear_in_transition_end;
8063 			    it = it->vme_next) {
8064 				assert(it->in_transition);
8065 				it->in_transition = FALSE;
8066 				if (it->needs_wakeup) {
8067 					it->needs_wakeup = FALSE;
8068 					state |= VMDS_NEEDS_WAKEUP;
8069 				}
8070 			}
8071 
8072 			clear_in_transition_end = 0;
8073 		}
8074 
8075 
8076 		/*
8077 		 * Step 2: Perform various policy checks
8078 		 *         before we do _anything_ to this entry.
8079 		 */
8080 
8081 		if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8082 			if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8083 				/*
8084 				 * Either we found a gap already,
8085 				 * or we are tearing down a map,
8086 				 * keep going.
8087 				 */
8088 			} else if (state & VMDS_KERNEL_PMAP) {
8089 				__vm_map_delete_gap_panic(map, s, start, end);
8090 			} else if (vm_map_round_page(s, VM_MAP_PAGE_MASK(map)) < end) {
8091 				/*
8092 				 * The vm_map_round_page() is needed since an entry
8093 				 * can be less than VM_MAP_PAGE_MASK() sized.
8094 				 *
8095 				 * For example, devices which have h/w 4K pages,
8096 				 * but entry sizes are all now 16K.
8097 				 */
8098 				state |= VMDS_FOUND_GAP;
8099 				gap_start = s;
8100 			}
8101 
8102 			if (entry == vm_map_to_entry(map) ||
8103 			    end <= entry->vme_start) {
8104 				break;
8105 			}
8106 
8107 			s = entry->vme_start;
8108 		}
8109 
8110 		if (state & VMDS_KERNEL_PMAP) {
8111 			/*
8112 			 * In the kernel map and its submaps,
8113 			 * permanent entries never die, even
8114 			 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8115 			 */
8116 			if (entry->vme_permanent) {
8117 				__vm_map_delete_permanent_panic(map, start, end, entry);
8118 			}
8119 
8120 			if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8121 				end = entry->vme_end;
8122 				flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8123 			}
8124 
8125 			/*
8126 			 * In the kernel map and its submaps,
8127 			 * the removal of an atomic/guarded entry is strict.
8128 			 *
8129 			 * An atomic entry is processed only if it was
8130 			 * specifically targeted.
8131 			 *
8132 			 * We might have deleted non-atomic entries before
			 * we reach this point however...
8134 			 */
8135 			kmem_entry_validate_guard(map, entry,
8136 			    start, end - start, guard);
8137 		}
8138 
8139 		/*
8140 		 * Step 2.1: handle "permanent" and "submap" entries
8141 		 * *before* clipping to avoid triggering some unnecessary
8142 		 * un-nesting of the shared region.
8143 		 */
8144 		if (entry->vme_permanent && entry->is_sub_map) {
8145 //			printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8146 			/*
8147 			 * Un-mapping a "permanent" mapping of a user-space
8148 			 * submap is not allowed unless...
8149 			 */
8150 			if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8151 				/*
8152 				 * a. explicitly requested by the kernel caller.
8153 				 */
8154 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8155 			} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8156 			    developer_mode_state()) {
8157 				/*
8158 				 * b. we're in "developer" mode (for
8159 				 *    breakpoints, dtrace probes, ...).
8160 				 */
8161 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8162 			} else if (map->terminated) {
8163 				/*
8164 				 * c. this is the final address space cleanup.
8165 				 */
8166 //				printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8167 			} else {
8168 				vm_map_offset_t submap_start, submap_end;
8169 				kern_return_t submap_kr;
8170 
8171 				/*
8172 				 * Check if there are any "permanent" mappings
8173 				 * in this range in the submap.
8174 				 */
8175 				if (entry->in_transition) {
8176 					/* can that even happen ? */
8177 					goto in_transition;
8178 				}
8179 				/* compute the clipped range in the submap */
8180 				submap_start = s - entry->vme_start;
8181 				submap_start += VME_OFFSET(entry);
8182 				submap_end = end - entry->vme_start;
8183 				submap_end += VME_OFFSET(entry);
8184 				submap_kr = vm_map_delete_submap_recurse(
8185 					VME_SUBMAP(entry),
8186 					submap_start,
8187 					submap_end);
8188 				if (submap_kr != KERN_SUCCESS) {
8189 					/*
8190 					 * There are some "permanent" mappings
8191 					 * in the submap: we are not allowed
8192 					 * to remove this range.
8193 					 */
8194 					printf("%d[%s] removing permanent submap entry "
8195 					    "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8196 					    proc_selfpid(),
8197 					    (get_bsdtask_info(current_task())
8198 					    ? proc_name_address(get_bsdtask_info(current_task()))
8199 					    : "?"), entry,
8200 					    (uint64_t)entry->vme_start,
8201 					    (uint64_t)entry->vme_end,
8202 					    entry->protection,
8203 					    entry->max_protection);
8204 					DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8205 					    vm_map_entry_t, entry,
8206 					    vm_map_offset_t, entry->vme_start,
8207 					    vm_map_offset_t, entry->vme_end,
8208 					    vm_prot_t, entry->protection,
8209 					    vm_prot_t, entry->max_protection,
8210 					    int, VME_ALIAS(entry));
8211 					ret.kmr_return = KERN_PROTECTION_FAILURE;
8212 					goto out;
8213 				}
8214 				/* no permanent mappings: proceed */
8215 			}
8216 		}
8217 
8218 		/*
8219 		 * Step 3: Perform any clipping needed.
8220 		 *
8221 		 *         After this, "entry" starts at "s", ends before "end"
8222 		 */
8223 
8224 		if (entry->vme_start < s) {
8225 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8226 			    entry->map_aligned &&
8227 			    !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8228 				/*
8229 				 * The entry will no longer be map-aligned
8230 				 * after clipping and the caller said it's OK.
8231 				 */
8232 				entry->map_aligned = FALSE;
8233 			}
8234 			vm_map_clip_start(map, entry, s);
8235 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8236 		}
8237 
8238 		if (end < entry->vme_end) {
8239 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8240 			    entry->map_aligned &&
8241 			    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8242 				/*
8243 				 * The entry will no longer be map-aligned
8244 				 * after clipping and the caller said it's OK.
8245 				 */
8246 				entry->map_aligned = FALSE;
8247 			}
8248 			vm_map_clip_end(map, entry, end);
8249 		}
8250 
8251 		if (entry->vme_permanent && entry->is_sub_map) {
8252 			/*
8253 			 * We already went through step 2.1 which did not deny
8254 			 * the removal of this "permanent" and "is_sub_map"
8255 			 * entry.
8256 			 * Now that we've clipped what we actually want to
8257 			 * delete, undo the "permanent" part to allow the
8258 			 * removal to proceed.
8259 			 */
8260 			DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8261 			    vm_map_entry_t, entry,
8262 			    vm_map_offset_t, entry->vme_start,
8263 			    vm_map_offset_t, entry->vme_end,
8264 			    vm_prot_t, entry->protection,
8265 			    vm_prot_t, entry->max_protection,
8266 			    int, VME_ALIAS(entry));
8267 			entry->vme_permanent = false;
8268 		}
8269 
8270 		assert(s == entry->vme_start);
8271 		assert(entry->vme_end <= end);
8272 
8273 
8274 		/*
8275 		 * Step 4: If the entry is in flux, wait for this to resolve.
8276 		 */
8277 
8278 		if (entry->in_transition) {
8279 			wait_result_t wait_result;
8280 
8281 in_transition:
8282 			/*
8283 			 * Another thread is wiring/unwiring this entry.
8284 			 * Let the other thread know we are waiting.
8285 			 */
8286 
8287 			entry->needs_wakeup = TRUE;
8288 
8289 			/*
8290 			 * wake up anybody waiting on entries that we have
8291 			 * already unwired/deleted.
8292 			 */
8293 			if (state & VMDS_NEEDS_WAKEUP) {
8294 				vm_map_entry_wakeup(map);
8295 				state &= ~VMDS_NEEDS_WAKEUP;
8296 			}
8297 
8298 			wait_result = vm_map_entry_wait(map, interruptible);
8299 
8300 			if (interruptible &&
8301 			    wait_result == THREAD_INTERRUPTED) {
8302 				/*
8303 				 * We do not clear the needs_wakeup flag,
8304 				 * since we cannot tell if we were the only one.
8305 				 */
8306 				ret.kmr_return = KERN_ABORTED;
8307 				return ret;
8308 			}
8309 
8310 			/*
8311 			 * The entry could have been clipped or it
8312 			 * may not exist anymore.  Look it up again.
8313 			 */
8314 			state |= VMDS_NEEDS_LOOKUP;
8315 			continue;
8316 		}
8317 
8318 
8319 		/*
8320 		 * Step 5: Handle wiring
8321 		 */
8322 
8323 		if (entry->wired_count) {
8324 			struct vm_map_entry tmp_entry;
8325 			boolean_t           user_wire;
8326 			unsigned int        last_timestamp;
8327 
8328 			user_wire = entry->user_wired_count > 0;
8329 
8330 			/*
8331 			 *      Remove a kernel wiring if requested
8332 			 */
8333 			if (flags & VM_MAP_REMOVE_KUNWIRE) {
8334 				entry->wired_count--;
8335 			}
8336 
8337 			/*
8338 			 *	Remove all user wirings for proper accounting
8339 			 */
8340 			while (entry->user_wired_count) {
8341 				subtract_wire_counts(map, entry, user_wire);
8342 			}
8343 
8344 			/*
8345 			 * All our DMA I/O operations in IOKit are currently
8346 			 * done by wiring through the map entries of the task
8347 			 * requesting the I/O.
8348 			 *
8349 			 * Because of this, we must always wait for kernel wirings
8350 			 * to go away on the entries before deleting them.
8351 			 *
8352 			 * Any caller who wants to actually remove a kernel wiring
8353 			 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8354 			 * properly remove one wiring instead of blasting through
8355 			 * them all.
8356 			 */
8357 			if (entry->wired_count != 0) {
8358 				assert(map != kernel_map);
8359 				/*
8360 				 * Cannot continue.  Typical case is when
8361 				 * a user thread has physical io pending on
8362 				 * on this page.  Either wait for the
8363 				 * kernel wiring to go away or return an
8364 				 * error.
8365 				 */
8366 				wait_result_t wait_result;
8367 
8368 				entry->needs_wakeup = TRUE;
8369 				wait_result = vm_map_entry_wait(map,
8370 				    interruptible);
8371 
8372 				if (interruptible &&
8373 				    wait_result == THREAD_INTERRUPTED) {
8374 					/*
8375 					 * We do not clear the
8376 					 * needs_wakeup flag, since we
8377 					 * cannot tell if we were the
8378 					 * only one.
8379 					 */
8380 					ret.kmr_return = KERN_ABORTED;
8381 					return ret;
8382 				}
8383 
8384 
8385 				/*
8386 				 * The entry could have been clipped or
8387 				 * it may not exist anymore.  Look it
8388 				 * up again.
8389 				 */
8390 				state |= VMDS_NEEDS_LOOKUP;
8391 				continue;
8392 			}
8393 
8394 			/*
8395 			 * We can unlock the map now.
8396 			 *
8397 			 * The entry might be split once we unlock the map,
8398 			 * but we need the range as defined by this entry
8399 			 * to be stable. So we must make a local copy.
8400 			 *
8401 			 * The underlying objects do not change during clips,
			 * and the in_transition state guarantees existence
8403 			 * of the entry.
8404 			 */
8405 			last_timestamp = map->timestamp;
8406 			entry->in_transition = TRUE;
8407 			tmp_entry = *entry;
8408 			vm_map_unlock(map);
8409 
8410 			if (tmp_entry.is_sub_map) {
8411 				vm_map_t sub_map;
8412 				vm_map_offset_t sub_start, sub_end;
8413 				pmap_t pmap;
8414 				vm_map_offset_t pmap_addr;
8415 
8416 
8417 				sub_map = VME_SUBMAP(&tmp_entry);
8418 				sub_start = VME_OFFSET(&tmp_entry);
8419 				sub_end = sub_start + (tmp_entry.vme_end -
8420 				    tmp_entry.vme_start);
8421 				if (tmp_entry.use_pmap) {
8422 					pmap = sub_map->pmap;
8423 					pmap_addr = tmp_entry.vme_start;
8424 				} else {
8425 					pmap = map->pmap;
8426 					pmap_addr = tmp_entry.vme_start;
8427 				}
8428 				(void) vm_map_unwire_nested(sub_map,
8429 				    sub_start, sub_end,
8430 				    user_wire,
8431 				    pmap, pmap_addr);
8432 			} else {
8433 				if (tmp_entry.vme_kernel_object) {
8434 					pmap_protect_options(
8435 						map->pmap,
8436 						tmp_entry.vme_start,
8437 						tmp_entry.vme_end,
8438 						VM_PROT_NONE,
8439 						PMAP_OPTIONS_REMOVE,
8440 						NULL);
8441 				}
8442 				vm_fault_unwire(map, &tmp_entry,
8443 				    tmp_entry.vme_kernel_object,
8444 				    map->pmap, tmp_entry.vme_start);
8445 			}
8446 
8447 			vm_map_lock(map);
8448 
8449 			/*
8450 			 * Unwiring happened, we can now go back to deleting
8451 			 * them (after we clear the in_transition bit for the range).
8452 			 */
8453 			if (last_timestamp + 1 != map->timestamp) {
8454 				state |= VMDS_NEEDS_LOOKUP;
8455 			}
8456 			clear_in_transition_end = tmp_entry.vme_end;
8457 			continue;
8458 		}
8459 
8460 		assert(entry->wired_count == 0);
8461 		assert(entry->user_wired_count == 0);
8462 
8463 
8464 		/*
8465 		 * Step 6: Entry is unwired and ready for us to delete !
8466 		 */
8467 
8468 		if (!entry->vme_permanent) {
8469 			/*
8470 			 * Typical case: the entry really shouldn't be permanent
8471 			 */
8472 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8473 		    (entry->protection & VM_PROT_EXECUTE) &&
8474 		    developer_mode_state()) {
8475 			/*
8476 			 * Allow debuggers to undo executable mappings
8477 			 * when developer mode is on.
8478 			 */
8479 #if 0
8480 			printf("FBDP %d[%s] removing permanent executable entry "
8481 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8482 			    proc_selfpid(),
8483 			    (current_task()->bsd_info
8484 			    ? proc_name_address(current_task()->bsd_info)
8485 			    : "?"), entry,
8486 			    (uint64_t)entry->vme_start,
8487 			    (uint64_t)entry->vme_end,
8488 			    entry->protection,
8489 			    entry->max_protection);
8490 #endif
8491 			entry->vme_permanent = FALSE;
8492 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8493 #if 0
8494 			printf("FBDP %d[%s] removing permanent entry "
8495 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8496 			    proc_selfpid(),
8497 			    (current_task()->bsd_info
8498 			    ? proc_name_address(current_task()->bsd_info)
8499 			    : "?"), entry,
8500 			    (uint64_t)entry->vme_start,
8501 			    (uint64_t)entry->vme_end,
8502 			    entry->protection,
8503 			    entry->max_protection);
8504 #endif
8505 			entry->vme_permanent = FALSE;
8506 		} else {
8507 			DTRACE_VM6(vm_map_delete_permanent,
8508 			    vm_map_entry_t, entry,
8509 			    vm_map_offset_t, entry->vme_start,
8510 			    vm_map_offset_t, entry->vme_end,
8511 			    vm_prot_t, entry->protection,
8512 			    vm_prot_t, entry->max_protection,
8513 			    int, VME_ALIAS(entry));
8514 		}
8515 
8516 		if (entry->is_sub_map) {
8517 			assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8518 			    "map %p (%d) entry %p submap %p (%d)\n",
8519 			    map, VM_MAP_PAGE_SHIFT(map), entry,
8520 			    VME_SUBMAP(entry),
8521 			    VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8522 			if (entry->use_pmap) {
8523 #ifndef NO_NESTED_PMAP
8524 				int pmap_flags;
8525 
8526 				if (map->terminated) {
8527 					/*
8528 					 * This is the final cleanup of the
8529 					 * address space being terminated.
8530 					 * No new mappings are expected and
8531 					 * we don't really need to unnest the
8532 					 * shared region (and lose the "global"
8533 					 * pmap mappings, if applicable).
8534 					 *
8535 					 * Tell the pmap layer that we're
8536 					 * "clean" wrt nesting.
8537 					 */
8538 					pmap_flags = PMAP_UNNEST_CLEAN;
8539 				} else {
8540 					/*
8541 					 * We're unmapping part of the nested
8542 					 * shared region, so we can't keep the
8543 					 * nested pmap.
8544 					 */
8545 					pmap_flags = 0;
8546 				}
8547 				pmap_unnest_options(
8548 					map->pmap,
8549 					(addr64_t)entry->vme_start,
8550 					entry->vme_end - entry->vme_start,
8551 					pmap_flags);
8552 #endif  /* NO_NESTED_PMAP */
8553 				if (map->mapped_in_other_pmaps &&
8554 				    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8555 					/* clean up parent map/maps */
8556 					vm_map_submap_pmap_clean(
8557 						map, entry->vme_start,
8558 						entry->vme_end,
8559 						VME_SUBMAP(entry),
8560 						VME_OFFSET(entry));
8561 				}
8562 			} else {
8563 				vm_map_submap_pmap_clean(
8564 					map, entry->vme_start, entry->vme_end,
8565 					VME_SUBMAP(entry),
8566 					VME_OFFSET(entry));
8567 			}
8568 		} else if (entry->vme_kernel_object ||
8569 		    VME_OBJECT(entry) == compressor_object) {
8570 			/*
8571 			 * nothing to do
8572 			 */
8573 		} else if (map->mapped_in_other_pmaps &&
8574 		    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8575 			vm_object_pmap_protect_options(
8576 				VME_OBJECT(entry), VME_OFFSET(entry),
8577 				entry->vme_end - entry->vme_start,
8578 				PMAP_NULL,
8579 				PAGE_SIZE,
8580 				entry->vme_start,
8581 				VM_PROT_NONE,
8582 				PMAP_OPTIONS_REMOVE);
8583 		} else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8584 		    (state & VMDS_KERNEL_PMAP)) {
8585 			/* Remove translations associated
8586 			 * with this range unless the entry
8587 			 * does not have an object, or
8588 			 * it's the kernel map or a descendant
8589 			 * since the platform could potentially
8590 			 * create "backdoor" mappings invisible
8591 			 * to the VM. It is expected that
8592 			 * objectless, non-kernel ranges
8593 			 * do not have such VM invisible
8594 			 * translations.
8595 			 */
8596 			pmap_remove_options(map->pmap,
8597 			    (addr64_t)entry->vme_start,
8598 			    (addr64_t)entry->vme_end,
8599 			    PMAP_OPTIONS_REMOVE);
8600 		}
8601 
8602 #if DEBUG
8603 		/*
8604 		 * All pmap mappings for this map entry must have been
8605 		 * cleared by now.
8606 		 */
8607 		assert(pmap_is_empty(map->pmap,
8608 		    entry->vme_start,
8609 		    entry->vme_end));
8610 #endif /* DEBUG */
8611 
8612 		if (entry->iokit_acct) {
8613 			/* alternate accounting */
8614 			DTRACE_VM4(vm_map_iokit_unmapped_region,
8615 			    vm_map_t, map,
8616 			    vm_map_offset_t, entry->vme_start,
8617 			    vm_map_offset_t, entry->vme_end,
8618 			    int, VME_ALIAS(entry));
8619 			vm_map_iokit_unmapped_region(map,
8620 			    (entry->vme_end -
8621 			    entry->vme_start));
8622 			entry->iokit_acct = FALSE;
8623 			entry->use_pmap = FALSE;
8624 		}
8625 
8626 		s = entry->vme_end;
8627 		next = entry->vme_next;
8628 		ret.kmr_size += entry->vme_end - entry->vme_start;
8629 
8630 		if (entry->vme_permanent) {
8631 			/*
8632 			 * A permanent entry can not be removed, so leave it
8633 			 * in place but remove all access permissions.
8634 			 */
8635 			if (!entry->pmap_cs_associated) {
8636 				printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8637 				    __FUNCTION__, __LINE__,
8638 				    proc_selfpid(),
8639 				    (get_bsdtask_info(current_task())
8640 				    ? proc_name_address(get_bsdtask_info(current_task()))
8641 				    : "?"),
8642 				    map,
8643 				    entry,
8644 				    (uint64_t)entry->vme_start,
8645 				    (uint64_t)entry->vme_end,
8646 				    entry->is_sub_map,
8647 				    entry->protection,
8648 				    entry->max_protection);
8649 			}
8650 			DTRACE_VM6(vm_map_delete_permanent_prot_none,
8651 			    vm_map_entry_t, entry,
8652 			    vm_map_offset_t, entry->vme_start,
8653 			    vm_map_offset_t, entry->vme_end,
8654 			    vm_prot_t, entry->protection,
8655 			    vm_prot_t, entry->max_protection,
8656 			    int, VME_ALIAS(entry));
8657 			entry->protection = VM_PROT_NONE;
8658 			entry->max_protection = VM_PROT_NONE;
8659 		} else {
8660 			vm_map_entry_zap(map, entry, zap_list);
8661 		}
8662 
8663 		entry = next;
8664 
8665 		if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8666 			unsigned int last_timestamp = map->timestamp++;
8667 
8668 			if (lck_rw_lock_yield_exclusive(&map->lock,
8669 			    LCK_RW_YIELD_ANY_WAITER)) {
8670 				if (last_timestamp != map->timestamp + 1) {
8671 					state |= VMDS_NEEDS_LOOKUP;
8672 				}
8673 			} else {
8674 				/* we didn't yield, undo our change */
8675 				map->timestamp--;
8676 			}
8677 		}
8678 	}
8679 
8680 	if (map->wait_for_space) {
8681 		thread_wakeup((event_t) map);
8682 	}
8683 
8684 	if (state & VMDS_NEEDS_WAKEUP) {
8685 		vm_map_entry_wakeup(map);
8686 	}
8687 
8688 out:
8689 	if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8690 		__vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8691 	}
8692 
8693 	if (state & VMDS_FOUND_GAP) {
8694 		DTRACE_VM3(kern_vm_deallocate_gap,
8695 		    vm_map_offset_t, gap_start,
8696 		    vm_map_offset_t, save_start,
8697 		    vm_map_offset_t, save_end);
8698 		if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8699 			ret.kmr_return = KERN_INVALID_VALUE;
8700 		} else {
8701 			vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8702 		}
8703 	}
8704 
8705 	return ret;
8706 }
8707 
8708 kmem_return_t
vm_map_remove_and_unlock(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8709 vm_map_remove_and_unlock(
8710 	vm_map_t        map,
8711 	vm_map_offset_t start,
8712 	vm_map_offset_t end,
8713 	vmr_flags_t     flags,
8714 	kmem_guard_t    guard)
8715 {
8716 	kmem_return_t ret;
8717 	VM_MAP_ZAP_DECLARE(zap);
8718 
8719 	ret = vm_map_delete(map, start, end, flags, guard, &zap);
8720 	vm_map_unlock(map);
8721 
8722 	vm_map_zap_dispose(&zap);
8723 
8724 	return ret;
8725 }
8726 
8727 /*
8728  *	vm_map_remove_guard:
8729  *
8730  *	Remove the given address range from the target map.
8731  *	This is the exported form of vm_map_delete.
8732  */
8733 kmem_return_t
vm_map_remove_guard(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8734 vm_map_remove_guard(
8735 	vm_map_t        map,
8736 	vm_map_offset_t start,
8737 	vm_map_offset_t end,
8738 	vmr_flags_t     flags,
8739 	kmem_guard_t    guard)
8740 {
8741 	vm_map_lock(map);
8742 	return vm_map_remove_and_unlock(map, start, end, flags, guard);
8743 }
8744 
8745 /*
8746  *	vm_map_terminate:
8747  *
8748  *	Clean out a task's map.
8749  */
8750 kern_return_t
vm_map_terminate(vm_map_t map)8751 vm_map_terminate(
8752 	vm_map_t        map)
8753 {
8754 	vm_map_lock(map);
8755 	map->terminated = TRUE;
8756 	vm_map_disable_hole_optimization(map);
8757 	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
8758 	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
8759 	return KERN_SUCCESS;
8760 }
8761 
8762 /*
8763  *	Routine:	vm_map_copy_allocate
8764  *
8765  *	Description:
8766  *		Allocates and initializes a map copy object.
8767  */
8768 static vm_map_copy_t
vm_map_copy_allocate(void)8769 vm_map_copy_allocate(void)
8770 {
8771 	vm_map_copy_t new_copy;
8772 
8773 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
8774 	new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
8775 	vm_map_copy_first_entry(new_copy) = vm_map_copy_to_entry(new_copy);
8776 	vm_map_copy_last_entry(new_copy) = vm_map_copy_to_entry(new_copy);
8777 	return new_copy;
8778 }
8779 
8780 /*
8781  *	Routine:	vm_map_copy_discard
8782  *
8783  *	Description:
8784  *		Dispose of a map copy object (returned by
8785  *		vm_map_copyin).
8786  */
void
vm_map_copy_discard(
	vm_map_copy_t   copy)
{
	if (copy == VM_MAP_COPY_NULL) {
		return;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	switch (copy->type) {
	case VM_MAP_COPY_ENTRY_LIST:
		/*
		 * Unlink and dispose of each entry, releasing the
		 * submap or VM object reference that it holds.
		 */
		while (vm_map_copy_first_entry(copy) !=
		    vm_map_copy_to_entry(copy)) {
			vm_map_entry_t  entry = vm_map_copy_first_entry(copy);

			vm_map_copy_entry_unlink(copy, entry);
			if (entry->is_sub_map) {
				vm_map_deallocate(VME_SUBMAP(entry));
			} else {
				vm_object_deallocate(VME_OBJECT(entry));
			}
			vm_map_copy_entry_dispose(entry);
		}
		break;
	case VM_MAP_COPY_OBJECT:
		vm_object_deallocate(copy->cpy_object);
		break;
	case VM_MAP_COPY_KERNEL_BUFFER:

		/*
		 * The data buffer was allocated with kalloc_data() and is
		 * released here with kfree_data(); the vm_map_copy_t itself
		 * comes from its zone and is released by the zfree_id()
		 * below.  Kernel-buffer copies are bounded in size and
		 * always have a zero offset; anything else is corruption.
		 */
		if (copy->size > msg_ool_size_small || copy->offset) {
			panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
			    (long long)copy->size, (long long)copy->offset);
		}
		kfree_data(copy->cpy_kdata, copy->size);
	}
	/* Finally, return the copy object itself to its zone. */
	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
}
8834 
8835 /*
8836  *	Routine:	vm_map_copy_copy
8837  *
8838  *	Description:
8839  *			Move the information in a map copy object to
8840  *			a new map copy object, leaving the old one
8841  *			empty.
8842  *
8843  *			This is used by kernel routines that need
8844  *			to look at out-of-line data (in copyin form)
8845  *			before deciding whether to return SUCCESS.
8846  *			If the routine returns FAILURE, the original
8847  *			copy object will be deallocated; therefore,
8848  *			these routines must make a copy of the copy
8849  *			object and leave the original empty so that
8850  *			deallocation will not fail.
8851  */
vm_map_copy_t
vm_map_copy_copy(
	vm_map_copy_t   copy)
{
	vm_map_copy_t   new_copy;

	if (copy == VM_MAP_COPY_NULL) {
		return VM_MAP_COPY_NULL;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 * Allocate a new copy object, and copy the information
	 * from the old one into it.
	 */

	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
#if __has_feature(ptrauth_calls)
	/*
	 * NOTE(review): cpy_kdata appears to be a pointer-authenticated
	 * field; re-assigning it here (rather than relying on the raw bits
	 * copied by memcpy() above) presumably re-signs it for its new
	 * storage location — confirm against the cpy_kdata declaration.
	 */
	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		new_copy->cpy_kdata = copy->cpy_kdata;
	}
#endif

	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * The links in the entry chain must be
		 * changed to point to the new copy object.
		 */
		vm_map_copy_first_entry(copy)->vme_prev
		        = vm_map_copy_to_entry(new_copy);
		vm_map_copy_last_entry(copy)->vme_next
		        = vm_map_copy_to_entry(new_copy);
	}

	/*
	 * Change the old copy object into one that contains
	 * nothing to be deallocated: an object-type copy with a
	 * null object is safe to pass to vm_map_copy_discard().
	 */
	copy->type = VM_MAP_COPY_OBJECT;
	copy->cpy_object = VM_OBJECT_NULL;

	/*
	 * Return the new object.
	 */
	return new_copy;
}
8904 
/*
 * Returns TRUE if "entry" may be overwritten in place in "dst_map":
 * the entry must be writable and, on non-x86_64 platforms, must not be
 * a code-signing-enforced JIT region, a TPRO entry (arm64e), or a
 * non-submap "permanent" entry.  For a permanent submap entry, the
 * submap's own contents must be checked separately by the caller
 * (e.g. with vm_map_overwrite_submap_recurse()).
 */
static boolean_t
vm_map_entry_is_overwritable(
	vm_map_t        dst_map __unused,
	vm_map_entry_t  entry)
{
	if (!(entry->protection & VM_PROT_WRITE)) {
		/* can't overwrite if not writable */
		return FALSE;
	}
#if !__x86_64__
	if (entry->used_for_jit &&
	    vm_map_cs_enforcement(dst_map) &&
	    !dst_map->cs_debugged) {
		/*
		 * Can't overwrite a JIT region while cs_enforced
		 * and not cs_debugged.
		 */
		return FALSE;
	}

#if __arm64e__
	/* Do not allow overwrite HW assisted TPRO entries */
	if (entry->used_for_tpro) {
		return FALSE;
	}
#endif /* __arm64e__ */

	if (entry->vme_permanent) {
		if (entry->is_sub_map) {
			/*
			 * We can't tell if the submap contains "permanent"
			 * entries within the range targeted by the caller.
			 * The caller will have to check for that with
			 * vm_map_overwrite_submap_recurse() for example.
			 */
		} else {
			/*
			 * Do not allow overwriting of a "permanent"
			 * entry.
			 */
			DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
			    vm_map_entry_t, entry,
			    vm_map_offset_t, entry->vme_start,
			    vm_map_offset_t, entry->vme_end,
			    vm_prot_t, entry->protection,
			    vm_prot_t, entry->max_protection,
			    int, VME_ALIAS(entry));
			return FALSE;
		}
	}
#endif /* !__x86_64__ */
	return TRUE;
}
8958 
/*
 * Verify, without modifying anything, that the destination range
 * [dst_addr, dst_addr + dst_size) of "dst_map" — descending recursively
 * into any submaps it contains — is contiguous, writable, and
 * overwritable.
 *
 * Returns:
 *	KERN_SUCCESS            whole range may be overwritten
 *	KERN_INVALID_ADDRESS    lookup failed or range is not contiguous
 *	KERN_PROTECTION_FAILURE an entry is not writable/overwritable
 *	KERN_FAILURE            a permanent object was found after a
 *	                        submap had been encountered
 *
 * The map lock is taken and released internally; the map is returned
 * unlocked on every path.
 */
static kern_return_t
vm_map_overwrite_submap_recurse(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_size_t   dst_size)
{
	vm_map_offset_t dst_end;
	vm_map_entry_t  tmp_entry;
	vm_map_entry_t  entry;
	kern_return_t   result;
	boolean_t       encountered_sub_map = FALSE;



	/*
	 *	Verify that the destination is all writeable
	 *	initially.  We have to trunc the destination
	 *	address and round the copy size or we'll end up
	 *	splitting entries in strange ways.
	 */

	dst_end = vm_map_round_page(dst_addr + dst_size,
	    VM_MAP_PAGE_MASK(dst_map));
	vm_map_lock(dst_map);

start_pass_1:
	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}

	vm_map_clip_start(dst_map,
	    tmp_entry,
	    vm_map_trunc_page(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)));
	if (tmp_entry->is_sub_map) {
		/* clipping did unnest if needed */
		assert(!tmp_entry->use_pmap);
	}

	for (entry = tmp_entry;;) {
		vm_map_entry_t  next;

		next = entry->vme_next;
		while (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;

			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				/* the wait dropped the lock; restart from scratch */
				goto start_pass_1;
			}

			encountered_sub_map = TRUE;
			sub_start = VME_OFFSET(entry);

			/*
			 * Compute the portion of this submap entry that
			 * overlaps the requested range, expressed as
			 * offsets within the submap.
			 */
			if (entry->vme_end < dst_end) {
				sub_end = entry->vme_end;
			} else {
				sub_end = dst_end;
			}
			sub_end -= entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			vm_map_unlock(dst_map);

			/* recurse into the submap with the map unlocked */
			result = vm_map_overwrite_submap_recurse(
				VME_SUBMAP(entry),
				sub_start,
				sub_end - sub_start);

			if (result != KERN_SUCCESS) {
				return result;
			}
			/*
			 * NOTE(review): "entry" is dereferenced here after the
			 * map lock was dropped above — confirm that this read
			 * of vme_end is safe against concurrent map mutation.
			 */
			if (dst_end <= entry->vme_end) {
				return KERN_SUCCESS;
			}
			/* re-lock and re-lookup where the submap entry ended */
			vm_map_lock(dst_map);
			if (!vm_map_lookup_entry(dst_map, local_end,
			    &tmp_entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			entry = tmp_entry;
			next = entry->vme_next;
		}

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 *	If the entry is in transition, we must wait
		 *	for it to exit that state.  Anything could happen
		 *	when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto start_pass_1;
		}

/*
 *		our range is contained completely within this map entry
 */
		if (dst_end <= entry->vme_end) {
			vm_map_unlock(dst_map);
			return KERN_SUCCESS;
		}
/*
 *		check that range specified is contiguous region
 */
		if ((next == vm_map_to_entry(dst_map)) ||
		    (next->vme_start != entry->vme_end)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/*
		 *	Check for permanent objects in the destination.
		 *	Only fatal when a submap was encountered earlier.
		 */
		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
		    ((!VME_OBJECT(entry)->internal) ||
		    (VME_OBJECT(entry)->true_share))) {
			if (encountered_sub_map) {
				vm_map_unlock(dst_map);
				return KERN_FAILURE;
			}
		}


		entry = next;
	}/* for */
	/* NOTREACHED: the loop above only exits via return statements */
	vm_map_unlock(dst_map);
	return KERN_SUCCESS;
}
9111 
9112 /*
9113  *	Routine:	vm_map_copy_overwrite
9114  *
9115  *	Description:
9116  *		Copy the memory described by the map copy
9117  *		object (copy; returned by vm_map_copyin) onto
9118  *		the specified destination region (dst_map, dst_addr).
9119  *		The destination must be writeable.
9120  *
9121  *		Unlike vm_map_copyout, this routine actually
9122  *		writes over previously-mapped memory.  If the
9123  *		previous mapping was to a permanent (user-supplied)
9124  *		memory object, it is preserved.
9125  *
9126  *		The attributes (protection and inheritance) of the
9127  *		destination region are preserved.
9128  *
9129  *		If successful, consumes the copy object.
9130  *		Otherwise, the caller is responsible for it.
9131  *
9132  *	Implementation notes:
9133  *		To overwrite aligned temporary virtual memory, it is
9134  *		sufficient to remove the previous mapping and insert
9135  *		the new copy.  This replacement is done either on
9136  *		the whole region (if no permanent virtual memory
9137  *		objects are embedded in the destination region) or
9138  *		in individual map entries.
9139  *
 *		To overwrite permanent virtual memory, it is necessary
9141  *		to copy each page, as the external memory management
9142  *		interface currently does not provide any optimizations.
9143  *
9144  *		Unaligned memory also has to be copied.  It is possible
9145  *		to use 'vm_trickery' to copy the aligned data.  This is
9146  *		not done but not hard to implement.
9147  *
9148  *		Once a page of permanent memory has been overwritten,
9149  *		it is impossible to interrupt this function; otherwise,
9150  *		the call would be neither atomic nor location-independent.
9151  *		The kernel-state portion of a user thread must be
9152  *		interruptible.
9153  *
9154  *		It may be expensive to forward all requests that might
9155  *		overwrite permanent memory (vm_write, vm_copy) to
9156  *		uninterruptible kernel threads.  This routine may be
9157  *		called by interruptible threads; however, success is
9158  *		not guaranteed -- if the request cannot be performed
9159  *		atomically and interruptibly, an error indication is
9160  *		returned.
9161  *
9162  *		Callers of this function must call vm_map_copy_require on
9163  *		previously created vm_map_copy_t or pass a newly created
9164  *		one to ensure that it hasn't been forged.
9165  */
9166 
9167 static kern_return_t
vm_map_copy_overwrite_nested(vm_map_t dst_map,vm_map_address_t dst_addr,vm_map_copy_t copy,boolean_t interruptible,pmap_t pmap,boolean_t discard_on_success)9168 vm_map_copy_overwrite_nested(
9169 	vm_map_t                dst_map,
9170 	vm_map_address_t        dst_addr,
9171 	vm_map_copy_t           copy,
9172 	boolean_t               interruptible,
9173 	pmap_t                  pmap,
9174 	boolean_t               discard_on_success)
9175 {
9176 	vm_map_offset_t         dst_end;
9177 	vm_map_entry_t          tmp_entry;
9178 	vm_map_entry_t          entry;
9179 	kern_return_t           kr;
9180 	boolean_t               aligned = TRUE;
9181 	boolean_t               contains_permanent_objects = FALSE;
9182 	boolean_t               encountered_sub_map = FALSE;
9183 	vm_map_offset_t         base_addr;
9184 	vm_map_size_t           copy_size;
9185 	vm_map_size_t           total_size;
9186 	uint16_t                copy_page_shift;
9187 
9188 	/*
9189 	 *	Check for special kernel buffer allocated
9190 	 *	by new_ipc_kmsg_copyin.
9191 	 */
9192 
9193 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9194 		return vm_map_copyout_kernel_buffer(
9195 			dst_map, &dst_addr,
9196 			copy, copy->size, TRUE, discard_on_success);
9197 	}
9198 
9199 	/*
9200 	 *      Only works for entry lists at the moment.  Will
9201 	 *	support page lists later.
9202 	 */
9203 
9204 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9205 
9206 	if (copy->size == 0) {
9207 		if (discard_on_success) {
9208 			vm_map_copy_discard(copy);
9209 		}
9210 		return KERN_SUCCESS;
9211 	}
9212 
9213 	copy_page_shift = copy->cpy_hdr.page_shift;
9214 
9215 	/*
9216 	 *	Verify that the destination is all writeable
9217 	 *	initially.  We have to trunc the destination
9218 	 *	address and round the copy size or we'll end up
9219 	 *	splitting entries in strange ways.
9220 	 */
9221 
9222 	if (!VM_MAP_PAGE_ALIGNED(copy->size,
9223 	    VM_MAP_PAGE_MASK(dst_map)) ||
9224 	    !VM_MAP_PAGE_ALIGNED(copy->offset,
9225 	    VM_MAP_PAGE_MASK(dst_map)) ||
9226 	    !VM_MAP_PAGE_ALIGNED(dst_addr,
9227 	    VM_MAP_PAGE_MASK(dst_map)) ||
9228 	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9229 		aligned = FALSE;
9230 		dst_end = vm_map_round_page(dst_addr + copy->size,
9231 		    VM_MAP_PAGE_MASK(dst_map));
9232 	} else {
9233 		dst_end = dst_addr + copy->size;
9234 	}
9235 
9236 	vm_map_lock(dst_map);
9237 
9238 	/* LP64todo - remove this check when vm_map_commpage64()
9239 	 * no longer has to stuff in a map_entry for the commpage
9240 	 * above the map's max_offset.
9241 	 */
9242 	if (dst_addr >= dst_map->max_offset) {
9243 		vm_map_unlock(dst_map);
9244 		return KERN_INVALID_ADDRESS;
9245 	}
9246 
9247 start_pass_1:
9248 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9249 		vm_map_unlock(dst_map);
9250 		return KERN_INVALID_ADDRESS;
9251 	}
9252 	vm_map_clip_start(dst_map,
9253 	    tmp_entry,
9254 	    vm_map_trunc_page(dst_addr,
9255 	    VM_MAP_PAGE_MASK(dst_map)));
9256 	for (entry = tmp_entry;;) {
9257 		vm_map_entry_t  next = entry->vme_next;
9258 
9259 		while (entry->is_sub_map) {
9260 			vm_map_offset_t sub_start;
9261 			vm_map_offset_t sub_end;
9262 			vm_map_offset_t local_end;
9263 
9264 			if (entry->in_transition) {
9265 				/*
9266 				 * Say that we are waiting, and wait for entry.
9267 				 */
9268 				entry->needs_wakeup = TRUE;
9269 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9270 
9271 				goto start_pass_1;
9272 			}
9273 
9274 			local_end = entry->vme_end;
9275 			if (!(entry->needs_copy)) {
9276 				/* if needs_copy we are a COW submap */
9277 				/* in such a case we just replace so */
9278 				/* there is no need for the follow-  */
9279 				/* ing check.                        */
9280 				encountered_sub_map = TRUE;
9281 				sub_start = VME_OFFSET(entry);
9282 
9283 				if (entry->vme_end < dst_end) {
9284 					sub_end = entry->vme_end;
9285 				} else {
9286 					sub_end = dst_end;
9287 				}
9288 				sub_end -= entry->vme_start;
9289 				sub_end += VME_OFFSET(entry);
9290 				vm_map_unlock(dst_map);
9291 
9292 				kr = vm_map_overwrite_submap_recurse(
9293 					VME_SUBMAP(entry),
9294 					sub_start,
9295 					sub_end - sub_start);
9296 				if (kr != KERN_SUCCESS) {
9297 					return kr;
9298 				}
9299 				vm_map_lock(dst_map);
9300 			}
9301 
9302 			if (dst_end <= entry->vme_end) {
9303 				goto start_overwrite;
9304 			}
9305 			if (!vm_map_lookup_entry(dst_map, local_end,
9306 			    &entry)) {
9307 				vm_map_unlock(dst_map);
9308 				return KERN_INVALID_ADDRESS;
9309 			}
9310 			next = entry->vme_next;
9311 		}
9312 
9313 		if (!(entry->protection & VM_PROT_WRITE)) {
9314 			vm_map_unlock(dst_map);
9315 			return KERN_PROTECTION_FAILURE;
9316 		}
9317 
9318 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9319 			vm_map_unlock(dst_map);
9320 			return KERN_PROTECTION_FAILURE;
9321 		}
9322 
9323 		/*
9324 		 *	If the entry is in transition, we must wait
9325 		 *	for it to exit that state.  Anything could happen
9326 		 *	when we unlock the map, so start over.
9327 		 */
9328 		if (entry->in_transition) {
9329 			/*
9330 			 * Say that we are waiting, and wait for entry.
9331 			 */
9332 			entry->needs_wakeup = TRUE;
9333 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9334 
9335 			goto start_pass_1;
9336 		}
9337 
9338 /*
9339  *		our range is contained completely within this map entry
9340  */
9341 		if (dst_end <= entry->vme_end) {
9342 			break;
9343 		}
9344 /*
9345  *		check that range specified is contiguous region
9346  */
9347 		if ((next == vm_map_to_entry(dst_map)) ||
9348 		    (next->vme_start != entry->vme_end)) {
9349 			vm_map_unlock(dst_map);
9350 			return KERN_INVALID_ADDRESS;
9351 		}
9352 
9353 
9354 		/*
9355 		 *	Check for permanent objects in the destination.
9356 		 */
9357 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9358 		    ((!VME_OBJECT(entry)->internal) ||
9359 		    (VME_OBJECT(entry)->true_share))) {
9360 			contains_permanent_objects = TRUE;
9361 		}
9362 
9363 		entry = next;
9364 	}/* for */
9365 
9366 start_overwrite:
9367 	/*
9368 	 *	If there are permanent objects in the destination, then
9369 	 *	the copy cannot be interrupted.
9370 	 */
9371 
9372 	if (interruptible && contains_permanent_objects) {
9373 		vm_map_unlock(dst_map);
9374 		return KERN_FAILURE;   /* XXX */
9375 	}
9376 
9377 	/*
9378 	 *
9379 	 *	Make a second pass, overwriting the data
9380 	 *	At the beginning of each loop iteration,
9381 	 *	the next entry to be overwritten is "tmp_entry"
9382 	 *	(initially, the value returned from the lookup above),
9383 	 *	and the starting address expected in that entry
9384 	 *	is "start".
9385 	 */
9386 
9387 	total_size = copy->size;
9388 	if (encountered_sub_map) {
9389 		copy_size = 0;
9390 		/* re-calculate tmp_entry since we've had the map */
9391 		/* unlocked */
9392 		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9393 			vm_map_unlock(dst_map);
9394 			return KERN_INVALID_ADDRESS;
9395 		}
9396 	} else {
9397 		copy_size = copy->size;
9398 	}
9399 
9400 	base_addr = dst_addr;
9401 	while (TRUE) {
9402 		/* deconstruct the copy object and do in parts */
		/* only in sub_map, interruptible case */
9404 		vm_map_entry_t  copy_entry;
9405 		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
9406 		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
9407 		int             nentries;
9408 		int             remaining_entries = 0;
9409 		vm_map_offset_t new_offset = 0;
9410 
9411 		for (entry = tmp_entry; copy_size == 0;) {
9412 			vm_map_entry_t  next;
9413 
9414 			next = entry->vme_next;
9415 
9416 			/* tmp_entry and base address are moved along */
9417 			/* each time we encounter a sub-map.  Otherwise */
			/* entry can outpace tmp_entry, and the copy_size */
9419 			/* may reflect the distance between them */
9420 			/* if the current entry is found to be in transition */
9421 			/* we will start over at the beginning or the last */
9422 			/* encounter of a submap as dictated by base_addr */
9423 			/* we will zero copy_size accordingly. */
9424 			if (entry->in_transition) {
9425 				/*
9426 				 * Say that we are waiting, and wait for entry.
9427 				 */
9428 				entry->needs_wakeup = TRUE;
9429 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9430 
9431 				if (!vm_map_lookup_entry(dst_map, base_addr,
9432 				    &tmp_entry)) {
9433 					vm_map_unlock(dst_map);
9434 					return KERN_INVALID_ADDRESS;
9435 				}
9436 				copy_size = 0;
9437 				entry = tmp_entry;
9438 				continue;
9439 			}
9440 			if (entry->is_sub_map) {
9441 				vm_map_offset_t sub_start;
9442 				vm_map_offset_t sub_end;
9443 				vm_map_offset_t local_end;
9444 
9445 				if (entry->needs_copy) {
9446 					/* if this is a COW submap */
9447 					/* just back the range with a */
9448 					/* anonymous entry */
9449 					assert(!entry->vme_permanent);
9450 					if (entry->vme_end < dst_end) {
9451 						sub_end = entry->vme_end;
9452 					} else {
9453 						sub_end = dst_end;
9454 					}
9455 					if (entry->vme_start < base_addr) {
9456 						sub_start = base_addr;
9457 					} else {
9458 						sub_start = entry->vme_start;
9459 					}
9460 					vm_map_clip_end(
9461 						dst_map, entry, sub_end);
9462 					vm_map_clip_start(
9463 						dst_map, entry, sub_start);
9464 					assert(!entry->use_pmap);
9465 					assert(!entry->iokit_acct);
9466 					entry->use_pmap = TRUE;
9467 					vm_map_deallocate(VME_SUBMAP(entry));
9468 					assert(!entry->vme_permanent);
9469 					VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9470 					VME_OFFSET_SET(entry, 0);
9471 					entry->is_shared = FALSE;
9472 					entry->needs_copy = FALSE;
9473 					entry->protection = VM_PROT_DEFAULT;
9474 					entry->max_protection = VM_PROT_ALL;
9475 					entry->wired_count = 0;
9476 					entry->user_wired_count = 0;
9477 					if (entry->inheritance
9478 					    == VM_INHERIT_SHARE) {
9479 						entry->inheritance = VM_INHERIT_COPY;
9480 					}
9481 					continue;
9482 				}
9483 				/* first take care of any non-sub_map */
9484 				/* entries to send */
9485 				if (base_addr < entry->vme_start) {
9486 					/* stuff to send */
9487 					copy_size =
9488 					    entry->vme_start - base_addr;
9489 					break;
9490 				}
9491 				sub_start = VME_OFFSET(entry);
9492 
9493 				if (entry->vme_end < dst_end) {
9494 					sub_end = entry->vme_end;
9495 				} else {
9496 					sub_end = dst_end;
9497 				}
9498 				sub_end -= entry->vme_start;
9499 				sub_end += VME_OFFSET(entry);
9500 				local_end = entry->vme_end;
9501 				vm_map_unlock(dst_map);
9502 				copy_size = sub_end - sub_start;
9503 
9504 				/* adjust the copy object */
9505 				if (total_size > copy_size) {
9506 					vm_map_size_t   local_size = 0;
9507 					vm_map_size_t   entry_size;
9508 
9509 					nentries = 1;
9510 					new_offset = copy->offset;
9511 					copy_entry = vm_map_copy_first_entry(copy);
9512 					while (copy_entry !=
9513 					    vm_map_copy_to_entry(copy)) {
9514 						entry_size = copy_entry->vme_end -
9515 						    copy_entry->vme_start;
9516 						if ((local_size < copy_size) &&
9517 						    ((local_size + entry_size)
9518 						    >= copy_size)) {
9519 							vm_map_copy_clip_end(copy,
9520 							    copy_entry,
9521 							    copy_entry->vme_start +
9522 							    (copy_size - local_size));
9523 							entry_size = copy_entry->vme_end -
9524 							    copy_entry->vme_start;
9525 							local_size += entry_size;
9526 							new_offset += entry_size;
9527 						}
9528 						if (local_size >= copy_size) {
9529 							next_copy = copy_entry->vme_next;
9530 							copy_entry->vme_next =
9531 							    vm_map_copy_to_entry(copy);
9532 							previous_prev =
9533 							    copy->cpy_hdr.links.prev;
9534 							copy->cpy_hdr.links.prev = copy_entry;
9535 							copy->size = copy_size;
9536 							remaining_entries =
9537 							    copy->cpy_hdr.nentries;
9538 							remaining_entries -= nentries;
9539 							copy->cpy_hdr.nentries = nentries;
9540 							break;
9541 						} else {
9542 							local_size += entry_size;
9543 							new_offset += entry_size;
9544 							nentries++;
9545 						}
9546 						copy_entry = copy_entry->vme_next;
9547 					}
9548 				}
9549 
9550 				if ((entry->use_pmap) && (pmap == NULL)) {
9551 					kr = vm_map_copy_overwrite_nested(
9552 						VME_SUBMAP(entry),
9553 						sub_start,
9554 						copy,
9555 						interruptible,
9556 						VME_SUBMAP(entry)->pmap,
9557 						TRUE);
9558 				} else if (pmap != NULL) {
9559 					kr = vm_map_copy_overwrite_nested(
9560 						VME_SUBMAP(entry),
9561 						sub_start,
9562 						copy,
9563 						interruptible, pmap,
9564 						TRUE);
9565 				} else {
9566 					kr = vm_map_copy_overwrite_nested(
9567 						VME_SUBMAP(entry),
9568 						sub_start,
9569 						copy,
9570 						interruptible,
9571 						dst_map->pmap,
9572 						TRUE);
9573 				}
9574 				if (kr != KERN_SUCCESS) {
9575 					if (next_copy != NULL) {
9576 						copy->cpy_hdr.nentries +=
9577 						    remaining_entries;
9578 						copy->cpy_hdr.links.prev->vme_next =
9579 						    next_copy;
9580 						copy->cpy_hdr.links.prev
9581 						        = previous_prev;
9582 						copy->size = total_size;
9583 					}
9584 					return kr;
9585 				}
9586 				if (dst_end <= local_end) {
9587 					return KERN_SUCCESS;
9588 				}
9589 				/* otherwise copy no longer exists, it was */
9590 				/* destroyed after successful copy_overwrite */
9591 				copy = vm_map_copy_allocate();
9592 				copy->type = VM_MAP_COPY_ENTRY_LIST;
9593 				copy->offset = new_offset;
9594 				copy->cpy_hdr.page_shift = copy_page_shift;
9595 
9596 				/*
9597 				 * XXX FBDP
9598 				 * this does not seem to deal with
9599 				 * the VM map store (R&B tree)
9600 				 */
9601 
9602 				total_size -= copy_size;
9603 				copy_size = 0;
9604 				/* put back remainder of copy in container */
9605 				if (next_copy != NULL) {
9606 					copy->cpy_hdr.nentries = remaining_entries;
9607 					copy->cpy_hdr.links.next = next_copy;
9608 					copy->cpy_hdr.links.prev = previous_prev;
9609 					copy->size = total_size;
9610 					next_copy->vme_prev =
9611 					    vm_map_copy_to_entry(copy);
9612 					next_copy = NULL;
9613 				}
9614 				base_addr = local_end;
9615 				vm_map_lock(dst_map);
9616 				if (!vm_map_lookup_entry(dst_map,
9617 				    local_end, &tmp_entry)) {
9618 					vm_map_unlock(dst_map);
9619 					return KERN_INVALID_ADDRESS;
9620 				}
9621 				entry = tmp_entry;
9622 				continue;
9623 			}
9624 			if (dst_end <= entry->vme_end) {
9625 				copy_size = dst_end - base_addr;
9626 				break;
9627 			}
9628 
9629 			if ((next == vm_map_to_entry(dst_map)) ||
9630 			    (next->vme_start != entry->vme_end)) {
9631 				vm_map_unlock(dst_map);
9632 				return KERN_INVALID_ADDRESS;
9633 			}
9634 
9635 			entry = next;
9636 		}/* for */
9637 
9638 		next_copy = NULL;
9639 		nentries = 1;
9640 
9641 		/* adjust the copy object */
9642 		if (total_size > copy_size) {
9643 			vm_map_size_t   local_size = 0;
9644 			vm_map_size_t   entry_size;
9645 
9646 			new_offset = copy->offset;
9647 			copy_entry = vm_map_copy_first_entry(copy);
9648 			while (copy_entry != vm_map_copy_to_entry(copy)) {
9649 				entry_size = copy_entry->vme_end -
9650 				    copy_entry->vme_start;
9651 				if ((local_size < copy_size) &&
9652 				    ((local_size + entry_size)
9653 				    >= copy_size)) {
9654 					vm_map_copy_clip_end(copy, copy_entry,
9655 					    copy_entry->vme_start +
9656 					    (copy_size - local_size));
9657 					entry_size = copy_entry->vme_end -
9658 					    copy_entry->vme_start;
9659 					local_size += entry_size;
9660 					new_offset += entry_size;
9661 				}
9662 				if (local_size >= copy_size) {
9663 					next_copy = copy_entry->vme_next;
9664 					copy_entry->vme_next =
9665 					    vm_map_copy_to_entry(copy);
9666 					previous_prev =
9667 					    copy->cpy_hdr.links.prev;
9668 					copy->cpy_hdr.links.prev = copy_entry;
9669 					copy->size = copy_size;
9670 					remaining_entries =
9671 					    copy->cpy_hdr.nentries;
9672 					remaining_entries -= nentries;
9673 					copy->cpy_hdr.nentries = nentries;
9674 					break;
9675 				} else {
9676 					local_size += entry_size;
9677 					new_offset += entry_size;
9678 					nentries++;
9679 				}
9680 				copy_entry = copy_entry->vme_next;
9681 			}
9682 		}
9683 
9684 		if (aligned) {
9685 			pmap_t  local_pmap;
9686 
9687 			if (pmap) {
9688 				local_pmap = pmap;
9689 			} else {
9690 				local_pmap = dst_map->pmap;
9691 			}
9692 
9693 			if ((kr =  vm_map_copy_overwrite_aligned(
9694 				    dst_map, tmp_entry, copy,
9695 				    base_addr, local_pmap)) != KERN_SUCCESS) {
9696 				if (next_copy != NULL) {
9697 					copy->cpy_hdr.nentries +=
9698 					    remaining_entries;
9699 					copy->cpy_hdr.links.prev->vme_next =
9700 					    next_copy;
9701 					copy->cpy_hdr.links.prev =
9702 					    previous_prev;
9703 					copy->size += copy_size;
9704 				}
9705 				return kr;
9706 			}
9707 			vm_map_unlock(dst_map);
9708 		} else {
9709 			/*
9710 			 * Performance gain:
9711 			 *
9712 			 * if the copy and dst address are misaligned but the same
9713 			 * offset within the page we can copy_not_aligned the
9714 			 * misaligned parts and copy aligned the rest.  If they are
9715 			 * aligned but len is unaligned we simply need to copy
9716 			 * the end bit unaligned.  We'll need to split the misaligned
9717 			 * bits of the region in this case !
9718 			 */
9719 			/* ALWAYS UNLOCKS THE dst_map MAP */
9720 			kr = vm_map_copy_overwrite_unaligned(
9721 				dst_map,
9722 				tmp_entry,
9723 				copy,
9724 				base_addr,
9725 				discard_on_success);
9726 			if (kr != KERN_SUCCESS) {
9727 				if (next_copy != NULL) {
9728 					copy->cpy_hdr.nentries +=
9729 					    remaining_entries;
9730 					copy->cpy_hdr.links.prev->vme_next =
9731 					    next_copy;
9732 					copy->cpy_hdr.links.prev =
9733 					    previous_prev;
9734 					copy->size += copy_size;
9735 				}
9736 				return kr;
9737 			}
9738 		}
9739 		total_size -= copy_size;
9740 		if (total_size == 0) {
9741 			break;
9742 		}
9743 		base_addr += copy_size;
9744 		copy_size = 0;
9745 		copy->offset = new_offset;
9746 		if (next_copy != NULL) {
9747 			copy->cpy_hdr.nentries = remaining_entries;
9748 			copy->cpy_hdr.links.next = next_copy;
9749 			copy->cpy_hdr.links.prev = previous_prev;
9750 			next_copy->vme_prev = vm_map_copy_to_entry(copy);
9751 			copy->size = total_size;
9752 		}
9753 		vm_map_lock(dst_map);
9754 		while (TRUE) {
9755 			if (!vm_map_lookup_entry(dst_map,
9756 			    base_addr, &tmp_entry)) {
9757 				vm_map_unlock(dst_map);
9758 				return KERN_INVALID_ADDRESS;
9759 			}
9760 			if (tmp_entry->in_transition) {
9761 				entry->needs_wakeup = TRUE;
9762 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9763 			} else {
9764 				break;
9765 			}
9766 		}
9767 		vm_map_clip_start(dst_map,
9768 		    tmp_entry,
9769 		    vm_map_trunc_page(base_addr,
9770 		    VM_MAP_PAGE_MASK(dst_map)));
9771 
9772 		entry = tmp_entry;
9773 	} /* while */
9774 
9775 	/*
9776 	 *	Throw away the vm_map_copy object
9777 	 */
9778 	if (discard_on_success) {
9779 		vm_map_copy_discard(copy);
9780 	}
9781 
9782 	return KERN_SUCCESS;
9783 }/* vm_map_copy_overwrite */
9784 
9785 kern_return_t
vm_map_copy_overwrite(vm_map_t dst_map,vm_map_offset_t dst_addr,vm_map_copy_t copy,vm_map_size_t copy_size,boolean_t interruptible)9786 vm_map_copy_overwrite(
9787 	vm_map_t        dst_map,
9788 	vm_map_offset_t dst_addr,
9789 	vm_map_copy_t   copy,
9790 	vm_map_size_t   copy_size,
9791 	boolean_t       interruptible)
9792 {
9793 	vm_map_size_t   head_size, tail_size;
9794 	vm_map_copy_t   head_copy, tail_copy;
9795 	vm_map_offset_t head_addr, tail_addr;
9796 	vm_map_entry_t  entry;
9797 	kern_return_t   kr;
9798 	vm_map_offset_t effective_page_mask, effective_page_size;
9799 	uint16_t        copy_page_shift;
9800 
9801 	head_size = 0;
9802 	tail_size = 0;
9803 	head_copy = NULL;
9804 	tail_copy = NULL;
9805 	head_addr = 0;
9806 	tail_addr = 0;
9807 
9808 	/*
9809 	 *	Check for null copy object.
9810 	 */
9811 	if (copy == VM_MAP_COPY_NULL) {
9812 		return KERN_SUCCESS;
9813 	}
9814 
9815 	/*
9816 	 * Assert that the vm_map_copy is coming from the right
9817 	 * zone and hasn't been forged
9818 	 */
9819 	vm_map_copy_require(copy);
9820 
9821 	if (interruptible ||
9822 	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
9823 		/*
9824 		 * We can't split the "copy" map if we're interruptible
9825 		 * or if we don't have a "copy" map...
9826 		 */
9827 blunt_copy:
9828 		return vm_map_copy_overwrite_nested(dst_map,
9829 		           dst_addr,
9830 		           copy,
9831 		           interruptible,
9832 		           (pmap_t) NULL,
9833 		           TRUE);
9834 	}
9835 
9836 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
9837 	if (copy_page_shift < PAGE_SHIFT ||
9838 	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
9839 		goto blunt_copy;
9840 	}
9841 
9842 	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
9843 		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
9844 	} else {
9845 		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
9846 		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
9847 		    effective_page_mask);
9848 	}
9849 	effective_page_size = effective_page_mask + 1;
9850 
9851 	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
9852 		/*
9853 		 * Too small to bother with optimizing...
9854 		 */
9855 		goto blunt_copy;
9856 	}
9857 
9858 	if ((dst_addr & effective_page_mask) !=
9859 	    (copy->offset & effective_page_mask)) {
9860 		/*
9861 		 * Incompatible mis-alignment of source and destination...
9862 		 */
9863 		goto blunt_copy;
9864 	}
9865 
9866 	/*
9867 	 * Proper alignment or identical mis-alignment at the beginning.
9868 	 * Let's try and do a small unaligned copy first (if needed)
9869 	 * and then an aligned copy for the rest.
9870 	 */
9871 	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
9872 		head_addr = dst_addr;
9873 		head_size = (effective_page_size -
9874 		    (copy->offset & effective_page_mask));
9875 		head_size = MIN(head_size, copy_size);
9876 	}
9877 	if (!vm_map_page_aligned(copy->offset + copy_size,
9878 	    effective_page_mask)) {
9879 		/*
9880 		 * Mis-alignment at the end.
9881 		 * Do an aligned copy up to the last page and
9882 		 * then an unaligned copy for the remaining bytes.
9883 		 */
9884 		tail_size = ((copy->offset + copy_size) &
9885 		    effective_page_mask);
9886 		tail_size = MIN(tail_size, copy_size);
9887 		tail_addr = dst_addr + copy_size - tail_size;
9888 		assert(tail_addr >= head_addr + head_size);
9889 	}
9890 	assert(head_size + tail_size <= copy_size);
9891 
9892 	if (head_size + tail_size == copy_size) {
9893 		/*
9894 		 * It's all unaligned, no optimization possible...
9895 		 */
9896 		goto blunt_copy;
9897 	}
9898 
9899 	/*
9900 	 * Can't optimize if there are any submaps in the
9901 	 * destination due to the way we free the "copy" map
9902 	 * progressively in vm_map_copy_overwrite_nested()
9903 	 * in that case.
9904 	 */
9905 	vm_map_lock_read(dst_map);
9906 	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
9907 		vm_map_unlock_read(dst_map);
9908 		goto blunt_copy;
9909 	}
9910 	for (;
9911 	    (entry != vm_map_to_entry(dst_map) &&
9912 	    entry->vme_start < dst_addr + copy_size);
9913 	    entry = entry->vme_next) {
9914 		if (entry->is_sub_map) {
9915 			vm_map_unlock_read(dst_map);
9916 			goto blunt_copy;
9917 		}
9918 	}
9919 	vm_map_unlock_read(dst_map);
9920 
9921 	if (head_size) {
9922 		/*
9923 		 * Unaligned copy of the first "head_size" bytes, to reach
9924 		 * a page boundary.
9925 		 */
9926 
9927 		/*
9928 		 * Extract "head_copy" out of "copy".
9929 		 */
9930 		head_copy = vm_map_copy_allocate();
9931 		head_copy->type = VM_MAP_COPY_ENTRY_LIST;
9932 		head_copy->cpy_hdr.entries_pageable =
9933 		    copy->cpy_hdr.entries_pageable;
9934 		vm_map_store_init(&head_copy->cpy_hdr);
9935 		head_copy->cpy_hdr.page_shift = copy_page_shift;
9936 
9937 		entry = vm_map_copy_first_entry(copy);
9938 		if (entry->vme_end < copy->offset + head_size) {
9939 			head_size = entry->vme_end - copy->offset;
9940 		}
9941 
9942 		head_copy->offset = copy->offset;
9943 		head_copy->size = head_size;
9944 		copy->offset += head_size;
9945 		copy->size -= head_size;
9946 		copy_size -= head_size;
9947 		assert(copy_size > 0);
9948 
9949 		vm_map_copy_clip_end(copy, entry, copy->offset);
9950 		vm_map_copy_entry_unlink(copy, entry);
9951 		vm_map_copy_entry_link(head_copy,
9952 		    vm_map_copy_to_entry(head_copy),
9953 		    entry);
9954 
9955 		/*
9956 		 * Do the unaligned copy.
9957 		 */
9958 		kr = vm_map_copy_overwrite_nested(dst_map,
9959 		    head_addr,
9960 		    head_copy,
9961 		    interruptible,
9962 		    (pmap_t) NULL,
9963 		    FALSE);
9964 		if (kr != KERN_SUCCESS) {
9965 			goto done;
9966 		}
9967 	}
9968 
9969 	if (tail_size) {
9970 		/*
9971 		 * Extract "tail_copy" out of "copy".
9972 		 */
9973 		tail_copy = vm_map_copy_allocate();
9974 		tail_copy->type = VM_MAP_COPY_ENTRY_LIST;
9975 		tail_copy->cpy_hdr.entries_pageable =
9976 		    copy->cpy_hdr.entries_pageable;
9977 		vm_map_store_init(&tail_copy->cpy_hdr);
9978 		tail_copy->cpy_hdr.page_shift = copy_page_shift;
9979 
9980 		tail_copy->offset = copy->offset + copy_size - tail_size;
9981 		tail_copy->size = tail_size;
9982 
9983 		copy->size -= tail_size;
9984 		copy_size -= tail_size;
9985 		assert(copy_size > 0);
9986 
9987 		entry = vm_map_copy_last_entry(copy);
9988 		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
9989 		entry = vm_map_copy_last_entry(copy);
9990 		vm_map_copy_entry_unlink(copy, entry);
9991 		vm_map_copy_entry_link(tail_copy,
9992 		    vm_map_copy_last_entry(tail_copy),
9993 		    entry);
9994 	}
9995 
9996 	/*
9997 	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
9998 	 * we want to avoid TOCTOU issues w.r.t copy->size but
9999 	 * we don't need to change vm_map_copy_overwrite_nested()
10000 	 * and all other vm_map_copy_overwrite variants.
10001 	 *
10002 	 * So we assign the original copy_size that was passed into
10003 	 * this routine back to copy.
10004 	 *
10005 	 * This use of local 'copy_size' passed into this routine is
10006 	 * to try and protect against TOCTOU attacks where the kernel
10007 	 * has been exploited. We don't expect this to be an issue
10008 	 * during normal system operation.
10009 	 */
10010 	assertf(copy->size == copy_size,
10011 	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
10012 	copy->size = copy_size;
10013 
10014 	/*
10015 	 * Copy most (or possibly all) of the data.
10016 	 */
10017 	kr = vm_map_copy_overwrite_nested(dst_map,
10018 	    dst_addr + head_size,
10019 	    copy,
10020 	    interruptible,
10021 	    (pmap_t) NULL,
10022 	    FALSE);
10023 	if (kr != KERN_SUCCESS) {
10024 		goto done;
10025 	}
10026 
10027 	if (tail_size) {
10028 		kr = vm_map_copy_overwrite_nested(dst_map,
10029 		    tail_addr,
10030 		    tail_copy,
10031 		    interruptible,
10032 		    (pmap_t) NULL,
10033 		    FALSE);
10034 	}
10035 
10036 done:
10037 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
10038 	if (kr == KERN_SUCCESS) {
10039 		/*
10040 		 * Discard all the copy maps.
10041 		 */
10042 		if (head_copy) {
10043 			vm_map_copy_discard(head_copy);
10044 			head_copy = NULL;
10045 		}
10046 		vm_map_copy_discard(copy);
10047 		if (tail_copy) {
10048 			vm_map_copy_discard(tail_copy);
10049 			tail_copy = NULL;
10050 		}
10051 	} else {
10052 		/*
10053 		 * Re-assemble the original copy map.
10054 		 */
10055 		if (head_copy) {
10056 			entry = vm_map_copy_first_entry(head_copy);
10057 			vm_map_copy_entry_unlink(head_copy, entry);
10058 			vm_map_copy_entry_link(copy,
10059 			    vm_map_copy_to_entry(copy),
10060 			    entry);
10061 			copy->offset -= head_size;
10062 			copy->size += head_size;
10063 			vm_map_copy_discard(head_copy);
10064 			head_copy = NULL;
10065 		}
10066 		if (tail_copy) {
10067 			entry = vm_map_copy_last_entry(tail_copy);
10068 			vm_map_copy_entry_unlink(tail_copy, entry);
10069 			vm_map_copy_entry_link(copy,
10070 			    vm_map_copy_last_entry(copy),
10071 			    entry);
10072 			copy->size += tail_size;
10073 			vm_map_copy_discard(tail_copy);
10074 			tail_copy = NULL;
10075 		}
10076 	}
10077 	return kr;
10078 }
10079 
10080 
10081 /*
10082  *	Routine: vm_map_copy_overwrite_unaligned	[internal use only]
10083  *
10084  *	Decription:
10085  *	Physically copy unaligned data
10086  *
10087  *	Implementation:
10088  *	Unaligned parts of pages have to be physically copied.  We use
10089  *	a modified form of vm_fault_copy (which understands none-aligned
10090  *	page offsets and sizes) to do the copy.  We attempt to copy as
10091  *	much memory in one go as possibly, however vm_fault_copy copies
10092  *	within 1 memory object so we have to find the smaller of "amount left"
10093  *	"source object data size" and "target object data size".  With
10094  *	unaligned data we don't need to split regions, therefore the source
10095  *	(copy) object should be one map entry, the target range may be split
10096  *	over multiple map entries however.  In any event we are pessimistic
10097  *	about these assumptions.
10098  *
10099  *	Callers of this function must call vm_map_copy_require on
10100  *	previously created vm_map_copy_t or pass a newly created
10101  *	one to ensure that it hasn't been forged.
10102  *
10103  *	Assumptions:
10104  *	dst_map is locked on entry and is return locked on success,
10105  *	unlocked on error.
10106  */
10107 
static kern_return_t
vm_map_copy_overwrite_unaligned(
	vm_map_t        dst_map,
	vm_map_entry_t  entry,
	vm_map_copy_t   copy,
	vm_map_offset_t start,
	boolean_t       discard_on_success)
{
	vm_map_entry_t          copy_entry;      /* current source entry in "copy" */
	vm_map_entry_t          copy_entry_next;
	vm_map_version_t        version;         /* map timestamp to detect changes while unlocked */
	vm_object_t             dst_object;
	vm_object_offset_t      dst_offset;
	vm_object_offset_t      src_offset;      /* byte offset into the current source entry */
	vm_object_offset_t      entry_offset;
	vm_map_offset_t         entry_end;
	vm_map_size_t           src_size,
	    dst_size,
	    copy_size,
	    amount_left;
	kern_return_t           kr = KERN_SUCCESS;


	copy_entry = vm_map_copy_first_entry(copy);

	/* caller holds dst_map locked for writing; downgrade — we only read it */
	vm_map_lock_write_to_read(dst_map);

	/* unaligned start: data begins part-way into the first source page */
	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
	amount_left = copy->size;
/*
 *	unaligned so we never clipped this entry, we need the offset into
 *	the vm_object not just the data.
 */
	while (amount_left > 0) {
		if (entry == vm_map_to_entry(dst_map)) {
			vm_map_unlock_read(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/* "start" must be within the current map entry */
		assert((start >= entry->vme_start) && (start < entry->vme_end));

		dst_offset = start - entry->vme_start;

		dst_size = entry->vme_end - start;

		src_size = copy_entry->vme_end -
		    (copy_entry->vme_start + src_offset);

		/*
		 * One pass can cover at most the smaller of: what remains in
		 * the destination entry, what remains in the source entry,
		 * and the overall amount left.
		 */
		if (dst_size < src_size) {
/*
 *			we can only copy dst_size bytes before
 *			we have to get the next destination entry
 */
			copy_size = dst_size;
		} else {
/*
 *			we can only copy src_size bytes before
 *			we have to get the next source copy entry
 */
			copy_size = src_size;
		}

		if (copy_size > amount_left) {
			copy_size = amount_left;
		}
/*
 *		Entry needs copy, create a shadow shadow object for
 *		Copy on write region.
 */
		if (entry->needs_copy &&
		    ((entry->protection & VM_PROT_WRITE) != 0)) {
			/* upgrading the lock can fail and drop it; re-take and re-lookup */
			if (vm_map_lock_read_to_write(dst_map)) {
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			VME_OBJECT_SHADOW(entry,
			    (vm_map_size_t)(entry->vme_end
			    - entry->vme_start),
			    vm_map_always_shadow(dst_map));
			entry->needs_copy = FALSE;
			vm_map_lock_write_to_read(dst_map);
		}
		dst_object = VME_OBJECT(entry);
/*
 *		unlike with the virtual (aligned) copy we're going
 *		to fault on it therefore we need a target object.
 */
		if (dst_object == VM_OBJECT_NULL) {
			if (vm_map_lock_read_to_write(dst_map)) {
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			dst_object = vm_object_allocate((vm_map_size_t)
			    entry->vme_end - entry->vme_start);
			VME_OBJECT_SET(entry, dst_object, false, 0);
			VME_OFFSET_SET(entry, 0);
			assert(entry->use_pmap);
			vm_map_lock_write_to_read(dst_map);
		}
/*
 *		Take an object reference and unlock map. The "entry" may
 *		disappear or change when the map is unlocked.
 */
		vm_object_reference(dst_object);
		version.main_timestamp = dst_map->timestamp;
		entry_offset = VME_OFFSET(entry);
		entry_end = entry->vme_end;
		vm_map_unlock_read(dst_map);
/*
 *		Copy as much as possible in one pass
 */
		/*
		 * NOTE: copy_size is passed by reference — presumably so
		 * vm_fault_copy() can report the amount actually copied,
		 * which the bookkeeping below relies on.
		 */
		kr = vm_fault_copy(
			VME_OBJECT(copy_entry),
			VME_OFFSET(copy_entry) + src_offset,
			&copy_size,
			dst_object,
			entry_offset + dst_offset,
			dst_map,
			&version,
			THREAD_UNINT );

		start += copy_size;
		src_offset += copy_size;
		amount_left -= copy_size;
/*
 *		Release the object reference
 */
		vm_object_deallocate(dst_object);
/*
 *		If a hard error occurred, return it now
 */
		if (kr != KERN_SUCCESS) {
			return kr;
		}

		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
		    || amount_left == 0) {
/*
 *			all done with this copy entry, dispose.
 */
			copy_entry_next = copy_entry->vme_next;

			if (discard_on_success) {
				vm_map_copy_entry_unlink(copy, copy_entry);
				assert(!copy_entry->is_sub_map);
				vm_object_deallocate(VME_OBJECT(copy_entry));
				vm_map_copy_entry_dispose(copy_entry);
			}

			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
			    amount_left) {
/*
 *				not finished copying but run out of source
 */
				return KERN_INVALID_ADDRESS;
			}

			copy_entry = copy_entry_next;

			src_offset = 0;
		}

		if (amount_left == 0) {
			return KERN_SUCCESS;
		}

		vm_map_lock_read(dst_map);
		if (version.main_timestamp == dst_map->timestamp) {
			if (start == entry_end) {
/*
 *				destination region is split.  Use the version
 *				information to avoid a lookup in the normal
 *				case.
 */
				entry = entry->vme_next;
/*
 *				should be contiguous. Fail if we encounter
 *				a hole in the destination.
 */
				if (start != entry->vme_start) {
					vm_map_unlock_read(dst_map);
					return KERN_INVALID_ADDRESS;
				}
			}
		} else {
/*
 *			Map version check failed.
 *			we must lookup the entry because somebody
 *			might have changed the map behind our backs.
 */
RetryLookup:
			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
				vm_map_unlock_read(dst_map);
				return KERN_INVALID_ADDRESS;
			}
		}
	}/* while */

	return KERN_SUCCESS;
}/* vm_map_copy_overwrite_unaligned */
10309 
10310 /*
10311  *	Routine: vm_map_copy_overwrite_aligned	[internal use only]
10312  *
10313  *	Description:
10314  *	Does all the vm_trickery possible for whole pages.
10315  *
10316  *	Implementation:
10317  *
10318  *	If there are no permanent objects in the destination,
10319  *	and the source and destination map entry zones match,
10320  *	and the destination map entry is not shared,
10321  *	then the map entries can be deleted and replaced
10322  *	with those from the copy.  The following code is the
10323  *	basic idea of what to do, but there are lots of annoying
10324  *	little details about getting protection and inheritance
10325  *	right.  Should add protection, inheritance, and sharing checks
10326  *	to the above pass and make sure that no wiring is involved.
10327  *
10328  *	Callers of this function must call vm_map_copy_require on
10329  *	previously created vm_map_copy_t or pass a newly created
10330  *	one to ensure that it hasn't been forged.
10331  */
10332 
/*
 * Debug/telemetry counters: each is bumped when
 * vm_map_copy_overwrite_aligned() abandons the optimized entry-swap
 * path and falls back to "slow_copy" (physical copy), keyed by the
 * reason for the fallback.
 */
int vm_map_copy_overwrite_aligned_src_not_internal = 0;
int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
int vm_map_copy_overwrite_aligned_src_large = 0;
10336 
10337 static kern_return_t
vm_map_copy_overwrite_aligned(vm_map_t dst_map,vm_map_entry_t tmp_entry,vm_map_copy_t copy,vm_map_offset_t start,__unused pmap_t pmap)10338 vm_map_copy_overwrite_aligned(
10339 	vm_map_t        dst_map,
10340 	vm_map_entry_t  tmp_entry,
10341 	vm_map_copy_t   copy,
10342 	vm_map_offset_t start,
10343 	__unused pmap_t pmap)
10344 {
10345 	vm_object_t     object;
10346 	vm_map_entry_t  copy_entry;
10347 	vm_map_size_t   copy_size;
10348 	vm_map_size_t   size;
10349 	vm_map_entry_t  entry;
10350 
10351 	while ((copy_entry = vm_map_copy_first_entry(copy))
10352 	    != vm_map_copy_to_entry(copy)) {
10353 		copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10354 
10355 		entry = tmp_entry;
10356 		if (entry->is_sub_map) {
10357 			/* unnested when clipped earlier */
10358 			assert(!entry->use_pmap);
10359 		}
10360 		if (entry == vm_map_to_entry(dst_map)) {
10361 			vm_map_unlock(dst_map);
10362 			return KERN_INVALID_ADDRESS;
10363 		}
10364 		size = (entry->vme_end - entry->vme_start);
10365 		/*
10366 		 *	Make sure that no holes popped up in the
10367 		 *	address map, and that the protection is
10368 		 *	still valid, in case the map was unlocked
10369 		 *	earlier.
10370 		 */
10371 
10372 		if ((entry->vme_start != start) || ((entry->is_sub_map)
10373 		    && !entry->needs_copy)) {
10374 			vm_map_unlock(dst_map);
10375 			return KERN_INVALID_ADDRESS;
10376 		}
10377 		assert(entry != vm_map_to_entry(dst_map));
10378 
10379 		/*
10380 		 *	Check protection again
10381 		 */
10382 
10383 		if (!(entry->protection & VM_PROT_WRITE)) {
10384 			vm_map_unlock(dst_map);
10385 			return KERN_PROTECTION_FAILURE;
10386 		}
10387 
10388 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10389 			vm_map_unlock(dst_map);
10390 			return KERN_PROTECTION_FAILURE;
10391 		}
10392 
10393 		/*
10394 		 *	Adjust to source size first
10395 		 */
10396 
10397 		if (copy_size < size) {
10398 			if (entry->map_aligned &&
10399 			    !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10400 			    VM_MAP_PAGE_MASK(dst_map))) {
10401 				/* no longer map-aligned */
10402 				entry->map_aligned = FALSE;
10403 			}
10404 			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10405 			size = copy_size;
10406 		}
10407 
10408 		/*
10409 		 *	Adjust to destination size
10410 		 */
10411 
10412 		if (size < copy_size) {
10413 			vm_map_copy_clip_end(copy, copy_entry,
10414 			    copy_entry->vme_start + size);
10415 			copy_size = size;
10416 		}
10417 
10418 		assert((entry->vme_end - entry->vme_start) == size);
10419 		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10420 		assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10421 
10422 		/*
10423 		 *	If the destination contains temporary unshared memory,
10424 		 *	we can perform the copy by throwing it away and
10425 		 *	installing the source data.
10426 		 */
10427 
10428 		object = VME_OBJECT(entry);
10429 		if ((!entry->is_shared &&
10430 		    ((object == VM_OBJECT_NULL) ||
10431 		    (object->internal && !object->true_share))) ||
10432 		    entry->needs_copy) {
10433 			vm_object_t     old_object = VME_OBJECT(entry);
10434 			vm_object_offset_t      old_offset = VME_OFFSET(entry);
10435 			vm_object_offset_t      offset;
10436 
10437 			/*
10438 			 * Ensure that the source and destination aren't
10439 			 * identical
10440 			 */
10441 			if (old_object == VME_OBJECT(copy_entry) &&
10442 			    old_offset == VME_OFFSET(copy_entry)) {
10443 				vm_map_copy_entry_unlink(copy, copy_entry);
10444 				vm_map_copy_entry_dispose(copy_entry);
10445 
10446 				if (old_object != VM_OBJECT_NULL) {
10447 					vm_object_deallocate(old_object);
10448 				}
10449 
10450 				start = tmp_entry->vme_end;
10451 				tmp_entry = tmp_entry->vme_next;
10452 				continue;
10453 			}
10454 
10455 #if XNU_TARGET_OS_OSX
10456 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10457 #define __TRADEOFF1_COPY_SIZE (128 * 1024)      /* 128 KB */
10458 			if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10459 			    VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10460 			    copy_size <= __TRADEOFF1_COPY_SIZE) {
10461 				/*
10462 				 * Virtual vs. Physical copy tradeoff #1.
10463 				 *
10464 				 * Copying only a few pages out of a large
10465 				 * object:  do a physical copy instead of
10466 				 * a virtual copy, to avoid possibly keeping
10467 				 * the entire large object alive because of
10468 				 * those few copy-on-write pages.
10469 				 */
10470 				vm_map_copy_overwrite_aligned_src_large++;
10471 				goto slow_copy;
10472 			}
10473 #endif /* XNU_TARGET_OS_OSX */
10474 
10475 			if ((dst_map->pmap != kernel_pmap) &&
10476 			    (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10477 			    (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10478 				vm_object_t new_object, new_shadow;
10479 
10480 				/*
10481 				 * We're about to map something over a mapping
10482 				 * established by malloc()...
10483 				 */
10484 				new_object = VME_OBJECT(copy_entry);
10485 				if (new_object != VM_OBJECT_NULL) {
10486 					vm_object_lock_shared(new_object);
10487 				}
10488 				while (new_object != VM_OBJECT_NULL &&
10489 #if XNU_TARGET_OS_OSX
10490 				    !new_object->true_share &&
10491 				    new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10492 #endif /* XNU_TARGET_OS_OSX */
10493 				    new_object->internal) {
10494 					new_shadow = new_object->shadow;
10495 					if (new_shadow == VM_OBJECT_NULL) {
10496 						break;
10497 					}
10498 					vm_object_lock_shared(new_shadow);
10499 					vm_object_unlock(new_object);
10500 					new_object = new_shadow;
10501 				}
10502 				if (new_object != VM_OBJECT_NULL) {
10503 					if (!new_object->internal) {
10504 						/*
10505 						 * The new mapping is backed
10506 						 * by an external object.  We
10507 						 * don't want malloc'ed memory
10508 						 * to be replaced with such a
10509 						 * non-anonymous mapping, so
10510 						 * let's go off the optimized
10511 						 * path...
10512 						 */
10513 						vm_map_copy_overwrite_aligned_src_not_internal++;
10514 						vm_object_unlock(new_object);
10515 						goto slow_copy;
10516 					}
10517 #if XNU_TARGET_OS_OSX
10518 					if (new_object->true_share ||
10519 					    new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10520 						/*
10521 						 * Same if there's a "true_share"
10522 						 * object in the shadow chain, or
10523 						 * an object with a non-default
10524 						 * (SYMMETRIC) copy strategy.
10525 						 */
10526 						vm_map_copy_overwrite_aligned_src_not_symmetric++;
10527 						vm_object_unlock(new_object);
10528 						goto slow_copy;
10529 					}
10530 #endif /* XNU_TARGET_OS_OSX */
10531 					vm_object_unlock(new_object);
10532 				}
10533 				/*
10534 				 * The new mapping is still backed by
10535 				 * anonymous (internal) memory, so it's
10536 				 * OK to substitute it for the original
10537 				 * malloc() mapping.
10538 				 */
10539 			}
10540 
10541 			if (old_object != VM_OBJECT_NULL) {
10542 				assert(!entry->vme_permanent);
10543 				if (entry->is_sub_map) {
10544 					if (entry->use_pmap) {
10545 #ifndef NO_NESTED_PMAP
10546 						pmap_unnest(dst_map->pmap,
10547 						    (addr64_t)entry->vme_start,
10548 						    entry->vme_end - entry->vme_start);
10549 #endif  /* NO_NESTED_PMAP */
10550 						if (dst_map->mapped_in_other_pmaps) {
10551 							/* clean up parent */
10552 							/* map/maps */
10553 							vm_map_submap_pmap_clean(
10554 								dst_map, entry->vme_start,
10555 								entry->vme_end,
10556 								VME_SUBMAP(entry),
10557 								VME_OFFSET(entry));
10558 						}
10559 					} else {
10560 						vm_map_submap_pmap_clean(
10561 							dst_map, entry->vme_start,
10562 							entry->vme_end,
10563 							VME_SUBMAP(entry),
10564 							VME_OFFSET(entry));
10565 					}
10566 					vm_map_deallocate(VME_SUBMAP(entry));
10567 				} else {
10568 					if (dst_map->mapped_in_other_pmaps) {
10569 						vm_object_pmap_protect_options(
10570 							VME_OBJECT(entry),
10571 							VME_OFFSET(entry),
10572 							entry->vme_end
10573 							- entry->vme_start,
10574 							PMAP_NULL,
10575 							PAGE_SIZE,
10576 							entry->vme_start,
10577 							VM_PROT_NONE,
10578 							PMAP_OPTIONS_REMOVE);
10579 					} else {
10580 						pmap_remove_options(
10581 							dst_map->pmap,
10582 							(addr64_t)(entry->vme_start),
10583 							(addr64_t)(entry->vme_end),
10584 							PMAP_OPTIONS_REMOVE);
10585 					}
10586 					vm_object_deallocate(old_object);
10587 				}
10588 			}
10589 
10590 			if (entry->iokit_acct) {
10591 				/* keep using iokit accounting */
10592 				entry->use_pmap = FALSE;
10593 			} else {
10594 				/* use pmap accounting */
10595 				entry->use_pmap = TRUE;
10596 			}
10597 			assert(!entry->vme_permanent);
10598 			VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
10599 			object = VME_OBJECT(entry);
10600 			entry->needs_copy = copy_entry->needs_copy;
10601 			entry->wired_count = 0;
10602 			entry->user_wired_count = 0;
10603 			offset = VME_OFFSET(copy_entry);
10604 			VME_OFFSET_SET(entry, offset);
10605 
10606 			vm_map_copy_entry_unlink(copy, copy_entry);
10607 			vm_map_copy_entry_dispose(copy_entry);
10608 
10609 			/*
10610 			 * we could try to push pages into the pmap at this point, BUT
10611 			 * this optimization only saved on average 2 us per page if ALL
10612 			 * the pages in the source were currently mapped
10613 			 * and ALL the pages in the dest were touched, if there were fewer
10614 			 * than 2/3 of the pages touched, this optimization actually cost more cycles
10615 			 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
10616 			 */
10617 
10618 			/*
10619 			 *	Set up for the next iteration.  The map
10620 			 *	has not been unlocked, so the next
10621 			 *	address should be at the end of this
10622 			 *	entry, and the next map entry should be
10623 			 *	the one following it.
10624 			 */
10625 
10626 			start = tmp_entry->vme_end;
10627 			tmp_entry = tmp_entry->vme_next;
10628 		} else {
10629 			vm_map_version_t        version;
10630 			vm_object_t             dst_object;
10631 			vm_object_offset_t      dst_offset;
10632 			kern_return_t           r;
10633 
10634 slow_copy:
10635 			if (entry->needs_copy) {
10636 				VME_OBJECT_SHADOW(entry,
10637 				    (entry->vme_end -
10638 				    entry->vme_start),
10639 				    vm_map_always_shadow(dst_map));
10640 				entry->needs_copy = FALSE;
10641 			}
10642 
10643 			dst_object = VME_OBJECT(entry);
10644 			dst_offset = VME_OFFSET(entry);
10645 
10646 			/*
10647 			 *	Take an object reference, and record
10648 			 *	the map version information so that the
10649 			 *	map can be safely unlocked.
10650 			 */
10651 
10652 			if (dst_object == VM_OBJECT_NULL) {
10653 				/*
10654 				 * We would usually have just taken the
10655 				 * optimized path above if the destination
10656 				 * object has not been allocated yet.  But we
10657 				 * now disable that optimization if the copy
10658 				 * entry's object is not backed by anonymous
10659 				 * memory to avoid replacing malloc'ed
10660 				 * (i.e. re-usable) anonymous memory with a
10661 				 * not-so-anonymous mapping.
10662 				 * So we have to handle this case here and
10663 				 * allocate a new VM object for this map entry.
10664 				 */
10665 				dst_object = vm_object_allocate(
10666 					entry->vme_end - entry->vme_start);
10667 				dst_offset = 0;
10668 				VME_OBJECT_SET(entry, dst_object, false, 0);
10669 				VME_OFFSET_SET(entry, dst_offset);
10670 				assert(entry->use_pmap);
10671 			}
10672 
10673 			vm_object_reference(dst_object);
10674 
10675 			/* account for unlock bumping up timestamp */
10676 			version.main_timestamp = dst_map->timestamp + 1;
10677 
10678 			vm_map_unlock(dst_map);
10679 
10680 			/*
10681 			 *	Copy as much as possible in one pass
10682 			 */
10683 
10684 			copy_size = size;
10685 			r = vm_fault_copy(
10686 				VME_OBJECT(copy_entry),
10687 				VME_OFFSET(copy_entry),
10688 				&copy_size,
10689 				dst_object,
10690 				dst_offset,
10691 				dst_map,
10692 				&version,
10693 				THREAD_UNINT );
10694 
10695 			/*
10696 			 *	Release the object reference
10697 			 */
10698 
10699 			vm_object_deallocate(dst_object);
10700 
10701 			/*
10702 			 *	If a hard error occurred, return it now
10703 			 */
10704 
10705 			if (r != KERN_SUCCESS) {
10706 				return r;
10707 			}
10708 
10709 			if (copy_size != 0) {
10710 				/*
10711 				 *	Dispose of the copied region
10712 				 */
10713 
10714 				vm_map_copy_clip_end(copy, copy_entry,
10715 				    copy_entry->vme_start + copy_size);
10716 				vm_map_copy_entry_unlink(copy, copy_entry);
10717 				vm_object_deallocate(VME_OBJECT(copy_entry));
10718 				vm_map_copy_entry_dispose(copy_entry);
10719 			}
10720 
10721 			/*
10722 			 *	Pick up in the destination map where we left off.
10723 			 *
10724 			 *	Use the version information to avoid a lookup
10725 			 *	in the normal case.
10726 			 */
10727 
10728 			start += copy_size;
10729 			vm_map_lock(dst_map);
10730 			if (version.main_timestamp == dst_map->timestamp &&
10731 			    copy_size != 0) {
10732 				/* We can safely use saved tmp_entry value */
10733 
10734 				if (tmp_entry->map_aligned &&
10735 				    !VM_MAP_PAGE_ALIGNED(
10736 					    start,
10737 					    VM_MAP_PAGE_MASK(dst_map))) {
10738 					/* no longer map-aligned */
10739 					tmp_entry->map_aligned = FALSE;
10740 				}
10741 				vm_map_clip_end(dst_map, tmp_entry, start);
10742 				tmp_entry = tmp_entry->vme_next;
10743 			} else {
10744 				/* Must do lookup of tmp_entry */
10745 
10746 				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
10747 					vm_map_unlock(dst_map);
10748 					return KERN_INVALID_ADDRESS;
10749 				}
10750 				if (tmp_entry->map_aligned &&
10751 				    !VM_MAP_PAGE_ALIGNED(
10752 					    start,
10753 					    VM_MAP_PAGE_MASK(dst_map))) {
10754 					/* no longer map-aligned */
10755 					tmp_entry->map_aligned = FALSE;
10756 				}
10757 				vm_map_clip_start(dst_map, tmp_entry, start);
10758 			}
10759 		}
10760 	}/* while */
10761 
10762 	return KERN_SUCCESS;
10763 }/* vm_map_copy_overwrite_aligned */
10764 
10765 /*
10766  *	Routine: vm_map_copyin_kernel_buffer [internal use only]
10767  *
10768  *	Description:
10769  *		Copy in data to a kernel buffer from space in the
10770  *		source map. The original space may be optionally
10771  *		deallocated.
10772  *
10773  *		If successful, returns a new copy object.
10774  */
10775 static kern_return_t
vm_map_copyin_kernel_buffer(vm_map_t src_map,vm_map_offset_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)10776 vm_map_copyin_kernel_buffer(
10777 	vm_map_t        src_map,
10778 	vm_map_offset_t src_addr,
10779 	vm_map_size_t   len,
10780 	boolean_t       src_destroy,
10781 	vm_map_copy_t   *copy_result)
10782 {
10783 	kern_return_t kr;
10784 	vm_map_copy_t copy;
10785 
10786 	if (len > msg_ool_size_small) {
10787 		return KERN_INVALID_ARGUMENT;
10788 	}
10789 
10790 	copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
10791 	copy->cpy_kdata = kalloc_data(len, Z_WAITOK);
10792 	if (copy->cpy_kdata == NULL) {
10793 		zfree_id(ZONE_ID_VM_MAP_COPY, copy);
10794 		return KERN_RESOURCE_SHORTAGE;
10795 	}
10796 
10797 	copy->type = VM_MAP_COPY_KERNEL_BUFFER;
10798 	copy->size = len;
10799 	copy->offset = 0;
10800 
10801 	kr = copyinmap(src_map, src_addr, copy->cpy_kdata, (vm_size_t)len);
10802 	if (kr != KERN_SUCCESS) {
10803 		kfree_data(copy->cpy_kdata, len);
10804 		zfree_id(ZONE_ID_VM_MAP_COPY, copy);
10805 		return kr;
10806 	}
10807 
10808 	if (src_destroy) {
10809 		vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
10810 
10811 		if (src_map == kernel_map) {
10812 			flags |= VM_MAP_REMOVE_KUNWIRE;
10813 		}
10814 
10815 		(void)vm_map_remove_guard(src_map,
10816 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
10817 		    vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
10818 		    flags, KMEM_GUARD_NONE);
10819 	}
10820 
10821 	*copy_result = copy;
10822 	return KERN_SUCCESS;
10823 }
10824 
10825 /*
10826  *	Routine: vm_map_copyout_kernel_buffer	[internal use only]
10827  *
10828  *	Description:
10829  *		Copy out data from a kernel buffer into space in the
10830  *		destination map. The space may be optionally dynamically
10831  *		allocated.
10832  *
10833  *		If successful, consumes the copy object.
10834  *		Otherwise, the caller is responsible for it.
10835  *
10836  *		Callers of this function must call vm_map_copy_require on
10837  *		previously created vm_map_copy_t or pass a newly created
10838  *		one to ensure that it hasn't been forged.
10839  */
/* debug counter: bumped when copyout into a non-current map fails */
10840 static int vm_map_copyout_kernel_buffer_failures = 0;
10841 static kern_return_t
vm_map_copyout_kernel_buffer(vm_map_t map,vm_map_address_t * addr,vm_map_copy_t copy,vm_map_size_t copy_size,boolean_t overwrite,boolean_t consume_on_success)10842 vm_map_copyout_kernel_buffer(
10843 	vm_map_t                map,
10844 	vm_map_address_t        *addr,  /* IN/OUT */
10845 	vm_map_copy_t           copy,
10846 	vm_map_size_t           copy_size,
10847 	boolean_t               overwrite,
10848 	boolean_t               consume_on_success)
10849 {
10850 	kern_return_t kr = KERN_SUCCESS;
10851 	thread_t thread = current_thread();
10852 
10853 	assert(copy->size == copy_size);
10854 
10855 	/*
10856 	 * check for corrupted vm_map_copy structure
10857 	 */
10858 	if (copy_size > msg_ool_size_small || copy->offset) {
10859 		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
10860 		    (long long)copy->size, (long long)copy->offset);
10861 	}
10862 
10863 	if (!overwrite) {
10864 		/*
10865 		 * Allocate space in the target map for the data
10866 		 */
10867 		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
10868 
10869 		if (map == kernel_map) {
10870 			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
10871 		}
10872 		*addr = 0;
10873 		kr = vm_map_enter(map,
10874 		    addr,
10875 		    vm_map_round_page(copy_size,
10876 		    VM_MAP_PAGE_MASK(map)),
10877 		    (vm_map_offset_t) 0,
10878 		    VM_FLAGS_ANYWHERE,
10879 		    vmk_flags,
10880 		    VM_KERN_MEMORY_NONE,
10881 		    VM_OBJECT_NULL,
10882 		    (vm_object_offset_t) 0,
10883 		    FALSE,
10884 		    VM_PROT_DEFAULT,
10885 		    VM_PROT_ALL,
10886 		    VM_INHERIT_DEFAULT);
10887 		if (kr != KERN_SUCCESS) {
10888 			return kr;
10889 		}
10890 #if KASAN
10891 		if (map->pmap == kernel_pmap) {
10892 			kasan_notify_address(*addr, copy->size);
10893 		}
10894 #endif
10895 	}
10896 
10897 	/*
10898 	 * Copyout the data from the kernel buffer to the target map.
10899 	 */
10900 	if (thread->map == map) {
10901 		/*
10902 		 * If the target map is the current map, just do
10903 		 * the copy.
10904 		 */
10905 		assert((vm_size_t)copy_size == copy_size);
10906 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
10907 			kr = KERN_INVALID_ADDRESS;
10908 		}
10909 	} else {
10910 		vm_map_t oldmap;
10911 
10912 		/*
10913 		 * If the target map is another map, assume the
10914 		 * target's address space identity for the duration
10915 		 * of the copy.
10916 		 */
10917 		vm_map_reference(map);
10918 		oldmap = vm_map_switch(map);
10919 
10920 		assert((vm_size_t)copy_size == copy_size);
10921 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
10922 			vm_map_copyout_kernel_buffer_failures++;
10923 			kr = KERN_INVALID_ADDRESS;
10924 		}
10925 
10926 		/* restore the original address space and drop our map ref */
10927 		(void) vm_map_switch(oldmap);
10928 		vm_map_deallocate(map);
10929 	}
10930 
10931 	if (kr != KERN_SUCCESS) {
10932 		/* the copy failed, clean up */
10933 		if (!overwrite) {
10934 			/*
10935 			 * Deallocate the space we allocated in the target map.
10936 			 */
10937 			(void) vm_map_remove(map,
10938 			    vm_map_trunc_page(*addr,
10939 			    VM_MAP_PAGE_MASK(map)),
10940 			    vm_map_round_page((*addr +
10941 			    vm_map_round_page(copy_size,
10942 			    VM_MAP_PAGE_MASK(map))),
10943 			    VM_MAP_PAGE_MASK(map)));
10944 			*addr = 0;
10945 		}
10946 	} else {
10947 		/* copy was successful, discard the copy structure */
10948 		if (consume_on_success) {
10949 			kfree_data(copy->cpy_kdata, copy_size);
10950 			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
10951 		}
10952 	}
10953 
10954 	return kr;
10955 }
10955 
10956 /*
10957  *	Routine:	vm_map_copy_insert      [internal use only]
10958  *
10959  *	Description:
10960  *		Link a copy chain ("copy") into a map at the
10961  *		specified location (after "where").
10962  *
10963  *		Callers of this function must call vm_map_copy_require on
10964  *		previously created vm_map_copy_t or pass a newly created
10965  *		one to ensure that it hasn't been forged.
10966  *	Side effects:
10967  *		The copy chain is destroyed.
10968  */
10969 static void
vm_map_copy_insert(vm_map_t map,vm_map_entry_t after_where,vm_map_copy_t copy)10970 vm_map_copy_insert(
10971 	vm_map_t        map,
10972 	vm_map_entry_t  after_where,
10973 	vm_map_copy_t   copy)
10974 {
10975 	vm_map_entry_t  entry;
10976 
10977 	while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
10978 		entry = vm_map_copy_first_entry(copy);
10979 		vm_map_copy_entry_unlink(copy, entry);
10980 		vm_map_store_entry_link(map, after_where, entry,
10981 		    VM_MAP_KERNEL_FLAGS_NONE);
10982 		after_where = entry;
10983 	}
10984 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
10985 }
10986 
10987 /*
10988  * Callers of this function must call vm_map_copy_require on
10989  * previously created vm_map_copy_t or pass a newly created
10990  * one to ensure that it hasn't been forged.
10991  */
10992 void
vm_map_copy_remap(vm_map_t map,vm_map_entry_t where,vm_map_copy_t copy,vm_map_offset_t adjustment,vm_prot_t cur_prot,vm_prot_t max_prot,vm_inherit_t inheritance)10993 vm_map_copy_remap(
10994 	vm_map_t        map,
10995 	vm_map_entry_t  where,
10996 	vm_map_copy_t   copy,
10997 	vm_map_offset_t adjustment,
10998 	vm_prot_t       cur_prot,
10999 	vm_prot_t       max_prot,
11000 	vm_inherit_t    inheritance)
11001 {
11002 	vm_map_entry_t  copy_entry, new_entry;
11003 
11004 	for (copy_entry = vm_map_copy_first_entry(copy);
11005 	    copy_entry != vm_map_copy_to_entry(copy);
11006 	    copy_entry = copy_entry->vme_next) {
11007 		/* get a new VM map entry for the map */
11008 		new_entry = vm_map_entry_create(map);
11009 		/* copy the "copy entry" to the new entry */
11010 		vm_map_entry_copy(map, new_entry, copy_entry);
11011 		/* adjust "start" and "end" */
11012 		new_entry->vme_start += adjustment;
11013 		new_entry->vme_end += adjustment;
11014 		/* clear some attributes */
11015 		new_entry->inheritance = inheritance;
11016 		new_entry->protection = cur_prot;
11017 		new_entry->max_protection = max_prot;
11018 		new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11019 		/* take an extra reference on the entry's "object" */
11020 		if (new_entry->is_sub_map) {
11021 			assert(!new_entry->use_pmap); /* not nested */
11022 			vm_map_reference(VME_SUBMAP(new_entry));
11023 		} else {
11024 			vm_object_reference(VME_OBJECT(new_entry));
11025 		}
11026 		/* insert the new entry in the map */
11027 		vm_map_store_entry_link(map, where, new_entry,
11028 		    VM_MAP_KERNEL_FLAGS_NONE);
11029 		/* continue inserting the "copy entries" after the new entry */
11030 		where = new_entry;
11031 	}
11032 }
11033 
11034 
11035 /*
11036  * Returns true if *size matches (or is in the range of) copy->size.
11037  * Upon returning true, the *size field is updated with the actual size of the
11038  * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11039  */
11040 boolean_t
vm_map_copy_validate_size(vm_map_t dst_map,vm_map_copy_t copy,vm_map_size_t * size)11041 vm_map_copy_validate_size(
11042 	vm_map_t                dst_map,
11043 	vm_map_copy_t           copy,
11044 	vm_map_size_t           *size)
11045 {
11046 	if (copy == VM_MAP_COPY_NULL) {
11047 		return FALSE;
11048 	}
11049 
11050 	/*
11051 	 * Assert that the vm_map_copy is coming from the right
11052 	 * zone and hasn't been forged
11053 	 */
11054 	vm_map_copy_require(copy);
11055 
11056 	vm_map_size_t copy_sz = copy->size;
11057 	vm_map_size_t sz = *size;
11058 	switch (copy->type) {
11059 	case VM_MAP_COPY_OBJECT:
11060 	case VM_MAP_COPY_KERNEL_BUFFER:
11061 		if (sz == copy_sz) {
11062 			return TRUE;
11063 		}
11064 		break;
11065 	case VM_MAP_COPY_ENTRY_LIST:
11066 		/*
11067 		 * potential page-size rounding prevents us from exactly
11068 		 * validating this flavor of vm_map_copy, but we can at least
11069 		 * assert that it's within a range.
11070 		 */
11071 		if (copy_sz >= sz &&
11072 		    copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11073 			*size = copy_sz;
11074 			return TRUE;
11075 		}
11076 		break;
11077 	default:
11078 		break;
11079 	}
11080 	return FALSE;
11081 }
11082 
11083 /*
11084  *	Routine:	vm_map_copyout_size
11085  *
11086  *	Description:
11087  *		Copy out a copy chain ("copy") into newly-allocated
11088  *		space in the destination map. Uses a prevalidated
11089  *		size for the copy object (vm_map_copy_validate_size).
11090  *
11091  *		If successful, consumes the copy object.
11092  *		Otherwise, the caller is responsible for it.
11093  */
11094 kern_return_t
vm_map_copyout_size(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_t copy_size)11095 vm_map_copyout_size(
11096 	vm_map_t                dst_map,
11097 	vm_map_address_t        *dst_addr,      /* OUT */
11098 	vm_map_copy_t           copy,
11099 	vm_map_size_t           copy_size)
11100 {
11101 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11102 	           TRUE,                     /* consume_on_success */
11103 	           VM_PROT_DEFAULT,
11104 	           VM_PROT_ALL,
11105 	           VM_INHERIT_DEFAULT);
11106 }
11107 
11108 /*
11109  *	Routine:	vm_map_copyout
11110  *
11111  *	Description:
11112  *		Copy out a copy chain ("copy") into newly-allocated
11113  *		space in the destination map.
11114  *
11115  *		If successful, consumes the copy object.
11116  *		Otherwise, the caller is responsible for it.
11117  */
11118 kern_return_t
vm_map_copyout(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy)11119 vm_map_copyout(
11120 	vm_map_t                dst_map,
11121 	vm_map_address_t        *dst_addr,      /* OUT */
11122 	vm_map_copy_t           copy)
11123 {
11124 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11125 	           TRUE,                     /* consume_on_success */
11126 	           VM_PROT_DEFAULT,
11127 	           VM_PROT_ALL,
11128 	           VM_INHERIT_DEFAULT);
11129 }
11130 
/*
 *	Routine:	vm_map_copyout_internal
 *
 *	Copy out the entries of "copy" into newly-allocated space in
 *	"dst_map", returning the chosen start address in *dst_addr.
 *	"copy_size" must match copy->size or KERN_FAILURE is returned.
 *	If "consume_on_success" is TRUE, the copy's entries are linked
 *	directly into dst_map and the copy object is consumed on success;
 *	the inserted entries are reset to VM_PROT_DEFAULT / VM_PROT_ALL /
 *	VM_INHERIT_DEFAULT.  If FALSE, the entries are cloned into dst_map
 *	with the caller's cur_protection / max_protection / inheritance,
 *	and the caller keeps ownership of "copy".
 */
11131 kern_return_t
vm_map_copyout_internal(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_t copy_size,boolean_t consume_on_success,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)11132 vm_map_copyout_internal(
11133 	vm_map_t                dst_map,
11134 	vm_map_address_t        *dst_addr,      /* OUT */
11135 	vm_map_copy_t           copy,
11136 	vm_map_size_t           copy_size,
11137 	boolean_t               consume_on_success,
11138 	vm_prot_t               cur_protection,
11139 	vm_prot_t               max_protection,
11140 	vm_inherit_t            inheritance)
11141 {
11142 	vm_map_size_t           size;
11143 	vm_map_size_t           adjustment;
11144 	vm_map_offset_t         start;
11145 	vm_object_offset_t      vm_copy_start;
11146 	vm_map_entry_t          last;
11147 	vm_map_entry_t          entry;
11148 	vm_map_copy_t           original_copy;
11149 	kern_return_t           kr;
11150 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
11151 
11152 	/*
11153 	 *	Check for null copy object.
11154 	 */
11155 
11156 	if (copy == VM_MAP_COPY_NULL) {
11157 		*dst_addr = 0;
11158 		return KERN_SUCCESS;
11159 	}
11160 
11161 	/*
11162 	 * Assert that the vm_map_copy is coming from the right
11163 	 * zone and hasn't been forged
11164 	 */
11165 	vm_map_copy_require(copy);
11166 
11167 	if (copy->size != copy_size) {
11168 		*dst_addr = 0;
11169 		return KERN_FAILURE;
11170 	}
11171 
11172 	/*
11173 	 *	Check for special copy object, created
11174 	 *	by vm_map_copyin_object.
11175 	 */
11176 
11177 	if (copy->type == VM_MAP_COPY_OBJECT) {
11178 		vm_object_t             object = copy->cpy_object;
11179 		vm_object_offset_t      offset;
11180 
11181 		offset = vm_object_trunc_page(copy->offset);
11182 		size = vm_map_round_page((copy_size +
11183 		    (vm_map_size_t)(copy->offset -
11184 		    offset)),
11185 		    VM_MAP_PAGE_MASK(dst_map));
11186 		*dst_addr = 0;
11187 		kr = vm_map_enter(dst_map, dst_addr, size,
11188 		    (vm_map_offset_t) 0, VM_FLAGS_ANYWHERE,
11189 		    VM_MAP_KERNEL_FLAGS_NONE,
11190 		    VM_KERN_MEMORY_NONE,
11191 		    object, offset, FALSE,
11192 		    VM_PROT_DEFAULT, VM_PROT_ALL,
11193 		    VM_INHERIT_DEFAULT);
11194 		if (kr != KERN_SUCCESS) {
11195 			return kr;
11196 		}
11197 		/* Account for non-pagealigned copy object */
11198 		*dst_addr += (vm_map_offset_t)(copy->offset - offset);
11199 		if (consume_on_success) {
11200 			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11201 		}
11202 		return KERN_SUCCESS;
11203 	}
11204 
11205 	/*
11206 	 *	Check for special kernel buffer allocated
11207 	 *	by new_ipc_kmsg_copyin.
11208 	 */
11209 
11210 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11211 		return vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11212 		           copy, copy_size, FALSE,
11213 		           consume_on_success);
11214 	}
11215 
	/*
	 * If the copy was made with a different page size than dst_map's,
	 * adjust it first (this may produce a new copy object; remember
	 * the original so it can be discarded later).
	 */
11216 	original_copy = copy;
11217 	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11218 		vm_map_copy_t target_copy;
11219 		vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11220 
11221 		target_copy = VM_MAP_COPY_NULL;
11222 		DEBUG4K_ADJUST("adjusting...\n");
11223 		kr = vm_map_copy_adjust_to_target(
11224 			copy,
11225 			0, /* offset */
11226 			copy->size, /* size */
11227 			dst_map,
11228 			TRUE, /* copy */
11229 			&target_copy,
11230 			&overmap_start,
11231 			&overmap_end,
11232 			&trimmed_start);
11233 		if (kr != KERN_SUCCESS) {
11234 			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11235 			return kr;
11236 		}
11237 		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11238 		if (target_copy != copy) {
11239 			copy = target_copy;
11240 		}
11241 		copy_size = copy->size;
11242 	}
11243 
11244 	/*
11245 	 *	Find space for the data
11246 	 */
11247 
11248 	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11249 	    VM_MAP_COPY_PAGE_MASK(copy));
11250 	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11251 	    VM_MAP_COPY_PAGE_MASK(copy))
11252 	    - vm_copy_start;
11253 
11254 
11255 	if (dst_map == kernel_map) {
11256 		vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
11257 	}
11258 
11259 	vm_map_lock(dst_map);
11260 	kr = vm_map_locate_space(dst_map, size, 0, vmk_flags,
11261 	    &start, &last);
11262 	if (kr != KERN_SUCCESS) {
11263 		vm_map_unlock(dst_map);
11264 		return kr;
11265 	}
11266 
	/* delta from the copy's own addresses to the chosen destination */
11267 	adjustment = start - vm_copy_start;
11268 	if (!consume_on_success) {
11269 		/*
11270 		 * We're not allowed to consume "copy", so we'll have to
11271 		 * copy its map entries into the destination map below.
11272 		 * No need to re-allocate map entries from the correct
11273 		 * (pageable or not) zone, since we'll get new map entries
11274 		 * during the transfer.
11275 		 * We'll also adjust the map entries's "start" and "end"
11276 		 * during the transfer, to keep "copy"'s entries consistent
11277 		 * with its "offset".
11278 		 */
11279 		goto after_adjustments;
11280 	}
11281 
11282 	/*
11283 	 *	Since we're going to just drop the map
11284 	 *	entries from the copy into the destination
11285 	 *	map, they must come from the same pool.
11286 	 */
11287 
11288 	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11289 		/*
11290 		 * Mismatches occur when dealing with the default
11291 		 * pager.
11292 		 */
11293 		vm_map_entry_t  next, new;
11294 
11295 		/*
11296 		 * Find the zone that the copies were allocated from
11297 		 */
11298 
11299 		entry = vm_map_copy_first_entry(copy);
11300 
11301 		/*
11302 		 * Reinitialize the copy so that vm_map_copy_entry_link
11303 		 * will work.
11304 		 */
11305 		vm_map_store_copy_reset(copy, entry);
11306 		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11307 
11308 		/*
11309 		 * Copy each entry.
11310 		 */
11311 		while (entry != vm_map_copy_to_entry(copy)) {
11312 			new = vm_map_copy_entry_create(copy);
11313 			vm_map_entry_copy_full(new, entry);
11314 			new->vme_no_copy_on_read = FALSE;
11315 			assert(!new->iokit_acct);
11316 			if (new->is_sub_map) {
11317 				/* clr address space specifics */
11318 				new->use_pmap = FALSE;
11319 			}
11320 			vm_map_copy_entry_link(copy,
11321 			    vm_map_copy_last_entry(copy),
11322 			    new);
11323 			next = entry->vme_next;
11324 			vm_map_entry_dispose(entry);
11325 			entry = next;
11326 		}
11327 	}
11328 
11329 	/*
11330 	 *	Adjust the addresses in the copy chain, and
11331 	 *	reset the region attributes.
11332 	 */
11333 
11334 	for (entry = vm_map_copy_first_entry(copy);
11335 	    entry != vm_map_copy_to_entry(copy);
11336 	    entry = entry->vme_next) {
11337 		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11338 			/*
11339 			 * We're injecting this copy entry into a map that
11340 			 * has the standard page alignment, so clear
11341 			 * "map_aligned" (which might have been inherited
11342 			 * from the original map entry).
11343 			 */
11344 			entry->map_aligned = FALSE;
11345 		}
11346 
11347 		entry->vme_start += adjustment;
11348 		entry->vme_end += adjustment;
11349 
11350 		if (entry->map_aligned) {
11351 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11352 			    VM_MAP_PAGE_MASK(dst_map)));
11353 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11354 			    VM_MAP_PAGE_MASK(dst_map)));
11355 		}
11356 
		/* consuming path: reset to default attributes */
11357 		entry->inheritance = VM_INHERIT_DEFAULT;
11358 		entry->protection = VM_PROT_DEFAULT;
11359 		entry->max_protection = VM_PROT_ALL;
11360 		entry->behavior = VM_BEHAVIOR_DEFAULT;
11361 
11362 		/*
11363 		 * If the entry is now wired,
11364 		 * map the pages into the destination map.
11365 		 */
11366 		if (entry->wired_count != 0) {
11367 			vm_map_offset_t va;
11368 			vm_object_offset_t       offset;
11369 			vm_object_t object;
11370 			vm_prot_t prot;
11371 			int     type_of_fault;
11372 
11373 			/* TODO4K would need to use actual page size */
11374 			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11375 
11376 			object = VME_OBJECT(entry);
11377 			offset = VME_OFFSET(entry);
11378 			va = entry->vme_start;
11379 
11380 			pmap_pageable(dst_map->pmap,
11381 			    entry->vme_start,
11382 			    entry->vme_end,
11383 			    TRUE);
11384 
			/* fault in each wired page of the entry, one at a time */
11385 			while (va < entry->vme_end) {
11386 				vm_page_t       m;
11387 				struct vm_object_fault_info fault_info = {};
11388 
11389 				/*
11390 				 * Look up the page in the object.
11391 				 * Assert that the page will be found in the
11392 				 * top object:
11393 				 * either
11394 				 *	the object was newly created by
11395 				 *	vm_object_copy_slowly, and has
11396 				 *	copies of all of the pages from
11397 				 *	the source object
11398 				 * or
11399 				 *	the object was moved from the old
11400 				 *	map entry; because the old map
11401 				 *	entry was wired, all of the pages
11402 				 *	were in the top-level object.
11403 				 *	(XXX not true if we wire pages for
11404 				 *	 reading)
11405 				 */
11406 				vm_object_lock(object);
11407 
11408 				m = vm_page_lookup(object, offset);
11409 				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11410 				    m->vmp_absent) {
11411 					panic("vm_map_copyout: wiring %p", m);
11412 				}
11413 
11414 				prot = entry->protection;
11415 
11416 				if (override_nx(dst_map, VME_ALIAS(entry)) &&
11417 				    prot) {
11418 					prot |= VM_PROT_EXECUTE;
11419 				}
11420 
11421 				type_of_fault = DBG_CACHE_HIT_FAULT;
11422 
11423 				fault_info.user_tag = VME_ALIAS(entry);
11424 				fault_info.pmap_options = 0;
11425 				if (entry->iokit_acct ||
11426 				    (!entry->is_sub_map && !entry->use_pmap)) {
11427 					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11428 				}
11429 
11430 				vm_fault_enter(m,
11431 				    dst_map->pmap,
11432 				    va,
11433 				    PAGE_SIZE, 0,
11434 				    prot,
11435 				    prot,
11436 				    VM_PAGE_WIRED(m),
11437 				    FALSE,            /* change_wiring */
11438 				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
11439 				    &fault_info,
11440 				    NULL,             /* need_retry */
11441 				    &type_of_fault);
11442 
11443 				vm_object_unlock(object);
11444 
11445 				offset += PAGE_SIZE_64;
11446 				va += PAGE_SIZE;
11447 			}
11448 		}
11449 	}
11450 
11451 after_adjustments:
11452 
11453 	/*
11454 	 *	Correct the page alignment for the result
11455 	 */
11456 
11457 	*dst_addr = start + (copy->offset - vm_copy_start);
11458 
11459 #if KASAN
11460 	kasan_notify_address(*dst_addr, size);
11461 #endif
11462 
11463 	/*
11464 	 *	Update the hints and the map size
11465 	 */
11466 
11467 	if (consume_on_success) {
11468 		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11469 	} else {
11470 		SAVE_HINT_MAP_WRITE(dst_map, last);
11471 	}
11472 
11473 	dst_map->size += size;
11474 
11475 	/*
11476 	 *	Link in the copy
11477 	 */
11478 
11479 	if (consume_on_success) {
11480 		vm_map_copy_insert(dst_map, last, copy);
11481 		if (copy != original_copy) {
			/* a page-size-adjusted copy was made: drop the original */
11482 			vm_map_copy_discard(original_copy);
11483 			original_copy = VM_MAP_COPY_NULL;
11484 		}
11485 	} else {
11486 		vm_map_copy_remap(dst_map, last, copy, adjustment,
11487 		    cur_protection, max_protection,
11488 		    inheritance);
11489 		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
			/* caller keeps "original_copy"; discard the adjusted one */
11490 			vm_map_copy_discard(copy);
11491 			copy = original_copy;
11492 		}
11493 	}
11494 
11495 
11496 	vm_map_unlock(dst_map);
11497 
11498 	/*
11499 	 * XXX	If wiring_required, call vm_map_pageable
11500 	 */
11501 
11502 	return KERN_SUCCESS;
11503 }
11504 
11505 /*
11506  *	Routine:	vm_map_copyin
11507  *
11508  *	Description:
11509  *		see vm_map_copyin_common.  Exported via Unsupported.exports.
11510  *
11511  */
11512 
11513 #undef vm_map_copyin
11514 
11515 kern_return_t
vm_map_copyin(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)11516 vm_map_copyin(
11517 	vm_map_t                        src_map,
11518 	vm_map_address_t        src_addr,
11519 	vm_map_size_t           len,
11520 	boolean_t                       src_destroy,
11521 	vm_map_copy_t           *copy_result)   /* OUT */
11522 {
11523 	return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11524 	           FALSE, copy_result, FALSE);
11525 }
11526 
11527 /*
11528  *	Routine:	vm_map_copyin_common
11529  *
11530  *	Description:
11531  *		Copy the specified region (src_addr, len) from the
11532  *		source address space (src_map), possibly removing
11533  *		the region from the source address space (src_destroy).
11534  *
11535  *	Returns:
11536  *		A vm_map_copy_t object (copy_result), suitable for
11537  *		insertion into another address space (using vm_map_copyout),
11538  *		copying over another address space region (using
11539  *		vm_map_copy_overwrite).  If the copy is unused, it
11540  *		should be destroyed (using vm_map_copy_discard).
11541  *
11542  *	In/out conditions:
11543  *		The source map should not be locked on entry.
11544  */
11545 
/*
 * Node used by vm_map_copyin_internal to remember a parent map while
 * it descends into a submap; nodes are chained into a stack via "next".
 * NOTE(review): field semantics inferred from the copyin traversal —
 * confirm against vm_map_copyin_internal's submap handling.
 */
11546 typedef struct submap_map {
11547 	vm_map_t        parent_map;     /* map we descended from */
11548 	vm_map_offset_t base_start;     /* start of the range in parent_map */
11549 	vm_map_offset_t base_end;       /* end of the range in parent_map */
11550 	vm_map_size_t   base_len;       /* length of that range */
11551 	struct submap_map *next;        /* next (outer) saved parent */
11552 } submap_map_t;
11553 
11554 kern_return_t
vm_map_copyin_common(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,__unused boolean_t src_volatile,vm_map_copy_t * copy_result,boolean_t use_maxprot)11555 vm_map_copyin_common(
11556 	vm_map_t        src_map,
11557 	vm_map_address_t src_addr,
11558 	vm_map_size_t   len,
11559 	boolean_t       src_destroy,
11560 	__unused boolean_t      src_volatile,
11561 	vm_map_copy_t   *copy_result,   /* OUT */
11562 	boolean_t       use_maxprot)
11563 {
11564 	int flags;
11565 
11566 	flags = 0;
11567 	if (src_destroy) {
11568 		flags |= VM_MAP_COPYIN_SRC_DESTROY;
11569 	}
11570 	if (use_maxprot) {
11571 		flags |= VM_MAP_COPYIN_USE_MAXPROT;
11572 	}
11573 	return vm_map_copyin_internal(src_map,
11574 	           src_addr,
11575 	           len,
11576 	           flags,
11577 	           copy_result);
11578 }
/*
 *	Routine:	vm_map_copyin_internal
 *
 *	Description:
 *		Copy the region [src_addr, src_addr + len) out of "src_map"
 *		into a new VM_MAP_COPY_ENTRY_LIST copy object, returned via
 *		"*copy_result".  Copy-on-write optimizations are attempted
 *		first; wired source memory (and, under debug4k_no_cow_copyin,
 *		small-page maps) falls back to a physical copy.  Nested
 *		submaps are traversed via a stack of submap_map_t nodes.
 *		With VM_MAP_COPYIN_SRC_DESTROY the source range is removed
 *		once the copy succeeds.  Small copies may instead be handled
 *		by vm_map_copyin_kernel_buffer() (see below).
 *
 *	In/out conditions:
 *		The source map must not be locked on entry; it is unlocked
 *		again before returning.
 */
kern_return_t
vm_map_copyin_internal(
	vm_map_t        src_map,
	vm_map_address_t src_addr,
	vm_map_size_t   len,
	int             flags,
	vm_map_copy_t   *copy_result)   /* OUT */
{
	vm_map_entry_t  tmp_entry;      /* Result of last map lookup --
	                                 * in multi-level lookup, this
	                                 * entry contains the actual
	                                 * vm_object/offset.
	                                 */
	vm_map_entry_t  new_entry = VM_MAP_ENTRY_NULL;  /* Map entry for copy */

	vm_map_offset_t src_start;      /* Start of current entry --
	                                 * where copy is taking place now
	                                 */
	vm_map_offset_t src_end;        /* End of entire region to be
	                                 * copied */
	vm_map_offset_t src_base;
	vm_map_t        base_map = src_map;     /* top-level map, for submap walk */
	boolean_t       map_share = FALSE;      /* TRUE while inside a submap */
	submap_map_t    *parent_maps = NULL;    /* stack of traversed submaps */

	vm_map_copy_t   copy;           /* Resulting copy */
	vm_map_address_t copy_addr;
	vm_map_size_t   copy_size;
	boolean_t       src_destroy;
	boolean_t       use_maxprot;
	boolean_t       preserve_purgeable;
	boolean_t       entry_was_shared;
	vm_map_entry_t  saved_src_entry;

	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
		return KERN_INVALID_ARGUMENT;
	}

#if CONFIG_KERNEL_TBI
	/* canonicalize a kernel address' tag byte before any range math */
	if (src_map->pmap == kernel_pmap) {
		src_addr = VM_KERNEL_TBI_FILL(src_addr);
	}
#endif /* CONFIG_KERNEL_TBI */

	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
	preserve_purgeable =
	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;

	/*
	 *	Check for copies of zero bytes.
	 */

	if (len == 0) {
		*copy_result = VM_MAP_COPY_NULL;
		return KERN_SUCCESS;
	}

	/*
	 *	Check that the end address doesn't overflow
	 */
	src_end = src_addr + len;
	if (src_end < src_addr) {
		return KERN_INVALID_ADDRESS;
	}

	/*
	 *	Compute (page aligned) start and end of region
	 */
	src_start = vm_map_trunc_page(src_addr,
	    VM_MAP_PAGE_MASK(src_map));
	src_end = vm_map_round_page(src_end,
	    VM_MAP_PAGE_MASK(src_map));

	/*
	 * If the copy is sufficiently small, use a kernel buffer instead
	 * of making a virtual copy.  The theory being that the cost of
	 * setting up VM (and taking C-O-W faults) dominates the copy costs
	 * for small regions.
	 */
	if ((len <= msg_ool_size_small) &&
	    !use_maxprot &&
	    !preserve_purgeable &&
	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
	    /*
	     * Since the "msg_ool_size_small" threshold was increased and
	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
	     * address space limits, we revert to doing a virtual copy if the
	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
	     * of the commpage would now fail when it used to work.
	     */
	    (src_start >= vm_map_min(src_map) &&
	    src_start < vm_map_max(src_map) &&
	    src_end >= vm_map_min(src_map) &&
	    src_end < vm_map_max(src_map))) {
		return vm_map_copyin_kernel_buffer(src_map, src_addr, len,
		           src_destroy, copy_result);
	}

	/*
	 *	Allocate a header element for the list.
	 *
	 *	Use the start and end in the header to
	 *	remember the endpoints prior to rounding.
	 */

	copy = vm_map_copy_allocate();
	copy->type = VM_MAP_COPY_ENTRY_LIST;
	copy->cpy_hdr.entries_pageable = TRUE;
	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);

	vm_map_store_init( &(copy->cpy_hdr));

	copy->offset = src_addr;
	copy->size = len;

	new_entry = vm_map_copy_entry_create(copy);

	/*
	 * Error-exit macro: unlocks/releases the current (sub)map, disposes
	 * of any pending copy entry, discards the partial copy object and
	 * unwinds the submap_map_t stack before returning "x".
	 */
#define RETURN(x)                                               \
	MACRO_BEGIN                                             \
	vm_map_unlock(src_map);                                 \
	if(src_map != base_map)                                 \
	        vm_map_deallocate(src_map);                     \
	if (new_entry != VM_MAP_ENTRY_NULL)                     \
	        vm_map_copy_entry_dispose(new_entry);           \
	vm_map_copy_discard(copy);                              \
	{                                                       \
	        submap_map_t	*_ptr;                          \
                                                                \
	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
	                parent_maps=parent_maps->next;          \
	                if (_ptr->parent_map != base_map)       \
	                        vm_map_deallocate(_ptr->parent_map);    \
	                kfree_type(submap_map_t, _ptr);         \
	        }                                               \
	}                                                       \
	MACRO_RETURN(x);                                        \
	MACRO_END

	/*
	 *	Find the beginning of the region.
	 */

	vm_map_lock(src_map);

	/*
	 * Lookup the original "src_addr" rather than the truncated
	 * "src_start", in case "src_start" falls in a non-map-aligned
	 * map entry *before* the map entry that contains "src_addr"...
	 */
	if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) {
		RETURN(KERN_INVALID_ADDRESS);
	}
	if (!tmp_entry->is_sub_map) {
		/*
		 * ... but clip to the map-rounded "src_start" rather than
		 * "src_addr" to preserve map-alignment.  We'll adjust the
		 * first copy entry at the end, if needed.
		 */
		vm_map_clip_start(src_map, tmp_entry, src_start);
	}
	if (src_start < tmp_entry->vme_start) {
		/*
		 * Move "src_start" up to the start of the
		 * first map entry to copy.
		 */
		src_start = tmp_entry->vme_start;
	}
	/* set for later submap fix-up */
	copy_addr = src_start;

	/*
	 *	Go through entries until we get to the end.
	 */

	while (TRUE) {
		vm_map_entry_t  src_entry = tmp_entry;  /* Top-level entry */
		vm_map_size_t   src_size;               /* Size of source
		                                         * map entry (in both
		                                         * maps)
		                                         */

		vm_object_t             src_object;     /* Object to copy */
		vm_object_offset_t      src_offset;

		vm_object_t             new_copy_object;/* vm_object_copy_* result */

		boolean_t       src_needs_copy;         /* Should source map
		                                         * be made read-only
		                                         * for copy-on-write?
		                                         */

		boolean_t       new_entry_needs_copy;   /* Will new entry be COW? */

		boolean_t       was_wired;              /* Was source wired? */
		boolean_t       saved_used_for_jit;     /* Saved used_for_jit. */
		vm_map_version_t version;               /* Version before locks
		                                         * dropped to make copy
		                                         */
		kern_return_t   result;                 /* Return value from
		                                         * copy_strategically.
		                                         */
		/*
		 * Descend through nested submaps until we reach a real
		 * VM-object-backed entry, pushing one submap_map_t per level.
		 */
		while (tmp_entry->is_sub_map) {
			vm_map_size_t submap_len;
			submap_map_t *ptr;

			ptr = kalloc_type(submap_map_t, Z_WAITOK);
			ptr->next = parent_maps;
			parent_maps = ptr;
			ptr->parent_map = src_map;
			ptr->base_start = src_start;
			ptr->base_end = src_end;
			submap_len = tmp_entry->vme_end - src_start;
			if (submap_len > (src_end - src_start)) {
				submap_len = src_end - src_start;
			}
			ptr->base_len = submap_len;

			/* translate the range into the submap's address space */
			src_start -= tmp_entry->vme_start;
			src_start += VME_OFFSET(tmp_entry);
			src_end = src_start + submap_len;
			src_map = VME_SUBMAP(tmp_entry);
			vm_map_lock(src_map);
			/* keep an outstanding reference for all maps in */
			/* the parents tree except the base map */
			vm_map_reference(src_map);
			vm_map_unlock(ptr->parent_map);
			if (!vm_map_lookup_entry(
				    src_map, src_start, &tmp_entry)) {
				RETURN(KERN_INVALID_ADDRESS);
			}
			map_share = TRUE;
			if (!tmp_entry->is_sub_map) {
				vm_map_clip_start(src_map, tmp_entry, src_start);
			}
			src_entry = tmp_entry;
		}
		/* we are now in the lowest level submap... */

		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
			/* This is not supported for now.  In the future */
			/* we will need to detect the phys_contig   */
			/* condition and then upgrade copy_slowly   */
			/* to do physical copy from the device mem  */
			/* based object. We can piggy-back off of   */
			/* the was wired boolean to set-up the      */
			/* proper handling */
			RETURN(KERN_PROTECTION_FAILURE);
		}
		/*
		 *	Create a new address map entry to hold the result.
		 *	Fill in the fields from the appropriate source entries.
		 *	We must unlock the source map to do this if we need
		 *	to allocate a map entry.
		 */
		if (new_entry == VM_MAP_ENTRY_NULL) {
			version.main_timestamp = src_map->timestamp;
			vm_map_unlock(src_map);

			new_entry = vm_map_copy_entry_create(copy);

			vm_map_lock(src_map);
			/* map changed while unlocked: re-lookup our position */
			if ((version.main_timestamp + 1) != src_map->timestamp) {
				if (!vm_map_lookup_entry(src_map, src_start,
				    &tmp_entry)) {
					RETURN(KERN_INVALID_ADDRESS);
				}
				if (!tmp_entry->is_sub_map) {
					vm_map_clip_start(src_map, tmp_entry, src_start);
				}
				continue; /* restart w/ new tmp_entry */
			}
		}

		/*
		 *	Verify that the region can be read.
		 */
		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
		    !use_maxprot) ||
		    (src_entry->max_protection & VM_PROT_READ) == 0) {
			RETURN(KERN_PROTECTION_FAILURE);
		}

		/*
		 *	Clip against the endpoints of the entire region.
		 */

		vm_map_clip_end(src_map, src_entry, src_end);

		src_size = src_entry->vme_end - src_start;
		src_object = VME_OBJECT(src_entry);
		src_offset = VME_OFFSET(src_entry);
		was_wired = (src_entry->wired_count != 0);

		vm_map_entry_copy(src_map, new_entry, src_entry);
		if (new_entry->is_sub_map) {
			/* clr address space specifics */
			new_entry->use_pmap = FALSE;
		} else {
			/*
			 * We're dealing with a copy-on-write operation,
			 * so the resulting mapping should not inherit the
			 * original mapping's accounting settings.
			 * "iokit_acct" should have been cleared in
			 * vm_map_entry_copy().
			 * "use_pmap" should be reset to its default (TRUE)
			 * so that the new mapping gets accounted for in
			 * the task's memory footprint.
			 */
			assert(!new_entry->iokit_acct);
			new_entry->use_pmap = TRUE;
		}

		/*
		 *	Attempt non-blocking copy-on-write optimizations.
		 */

		/*
		 * If we are destroying the source, and the object
		 * is internal, we could move the object reference
		 * from the source to the copy.  The copy is
		 * copy-on-write only if the source is.
		 * We make another reference to the object, because
		 * destroying the source entry will deallocate it.
		 *
		 * This memory transfer has to be atomic, (to prevent
		 * the VM object from being shared or copied while
		 * it's being moved here), so we could only do this
		 * if we won't have to unlock the VM map until the
		 * original mapping has been fully removed.
		 */

RestartCopy:
		if ((src_object == VM_OBJECT_NULL ||
		    (!was_wired && !map_share && !tmp_entry->is_shared
		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
		    vm_object_copy_quickly(
			    VME_OBJECT(new_entry),
			    src_offset,
			    src_size,
			    &src_needs_copy,
			    &new_entry_needs_copy)) {
			new_entry->needs_copy = new_entry_needs_copy;

			/*
			 *	Handle copy-on-write obligations
			 */

			if (src_needs_copy && !tmp_entry->needs_copy) {
				vm_prot_t prot;

				prot = src_entry->protection & ~VM_PROT_WRITE;

				if (override_nx(src_map, VME_ALIAS(src_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				/* write-protect the source so future writes fault */
				vm_object_pmap_protect(
					src_object,
					src_offset,
					src_size,
					(src_entry->is_shared ?
					PMAP_NULL
					: src_map->pmap),
					VM_MAP_PAGE_SIZE(src_map),
					src_entry->vme_start,
					prot);

				assert(tmp_entry->wired_count == 0);
				tmp_entry->needs_copy = TRUE;
			}

			/*
			 *	The map has never been unlocked, so it's safe
			 *	to move to the next entry rather than doing
			 *	another lookup.
			 */

			goto CopySuccessful;
		}

		entry_was_shared = tmp_entry->is_shared;

		/*
		 *	Take an object reference, so that we may
		 *	release the map lock(s).
		 */

		assert(src_object != VM_OBJECT_NULL);
		vm_object_reference(src_object);

		/*
		 *	Record the timestamp for later verification.
		 *	Unlock the map.
		 */

		version.main_timestamp = src_map->timestamp;
		vm_map_unlock(src_map); /* Increments timestamp once! */
		saved_src_entry = src_entry;
		tmp_entry = VM_MAP_ENTRY_NULL;
		src_entry = VM_MAP_ENTRY_NULL;

		/*
		 *	Perform the copy
		 */

		if (was_wired ||
		    (debug4k_no_cow_copyin &&
		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
CopySlowly:
			/* physical copy: wired pages can't be made COW */
			vm_object_lock(src_object);
			result = vm_object_copy_slowly(
				src_object,
				src_offset,
				src_size,
				THREAD_UNINT,
				&new_copy_object);
			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
			saved_used_for_jit = new_entry->used_for_jit;
			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
			new_entry->used_for_jit = saved_used_for_jit;
			VME_OFFSET_SET(new_entry,
			    src_offset - vm_object_trunc_page(src_offset));
			new_entry->needs_copy = FALSE;
		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
		    (entry_was_shared || map_share)) {
			vm_object_t new_object;

			vm_object_lock_shared(src_object);
			new_object = vm_object_copy_delayed(
				src_object,
				src_offset,
				src_size,
				TRUE);
			if (new_object == VM_OBJECT_NULL) {
				goto CopySlowly;
			}

			VME_OBJECT_SET(new_entry, new_object, false, 0);
			assert(new_entry->wired_count == 0);
			new_entry->needs_copy = TRUE;
			assert(!new_entry->iokit_acct);
			assert(new_object->purgable == VM_PURGABLE_DENY);
			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
			result = KERN_SUCCESS;
		} else {
			vm_object_offset_t new_offset;
			new_offset = VME_OFFSET(new_entry);
			result = vm_object_copy_strategically(src_object,
			    src_offset,
			    src_size,
			    &new_copy_object,
			    &new_offset,
			    &new_entry_needs_copy);
			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
			saved_used_for_jit = new_entry->used_for_jit;
			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
			new_entry->used_for_jit = saved_used_for_jit;
			if (new_offset != VME_OFFSET(new_entry)) {
				VME_OFFSET_SET(new_entry, new_offset);
			}

			new_entry->needs_copy = new_entry_needs_copy;
		}

		if (result == KERN_SUCCESS &&
		    ((preserve_purgeable &&
		    src_object->purgable != VM_PURGABLE_DENY) ||
		    new_entry->used_for_jit)) {
			/*
			 * Purgeable objects should be COPY_NONE, true share;
			 * this should be propagated to the copy.
			 *
			 * Also force mappings the pmap specially protects to
			 * be COPY_NONE; trying to COW these mappings would
			 * change the effective protections, which could have
			 * side effects if the pmap layer relies on the
			 * specified protections.
			 */

			vm_object_t     new_object;

			new_object = VME_OBJECT(new_entry);
			assert(new_object != src_object);
			vm_object_lock(new_object);
			assert(new_object->ref_count == 1);
			assert(new_object->shadow == VM_OBJECT_NULL);
			assert(new_object->copy == VM_OBJECT_NULL);
			assert(new_object->vo_owner == NULL);

			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

			if (preserve_purgeable &&
			    src_object->purgable != VM_PURGABLE_DENY) {
				new_object->true_share = TRUE;

				/* start as non-volatile with no owner... */
				new_object->purgable = VM_PURGABLE_NONVOLATILE;
				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
				/* ... and move to src_object's purgeable state */
				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
					int state;
					state = src_object->purgable;
					vm_object_purgable_control(
						new_object,
						VM_PURGABLE_SET_STATE_FROM_KERNEL,
						&state);
				}
				/* no pmap accounting for purgeable objects */
				new_entry->use_pmap = FALSE;
			}

			vm_object_unlock(new_object);
			new_object = VM_OBJECT_NULL;
		}

		if (result != KERN_SUCCESS &&
		    result != KERN_MEMORY_RESTART_COPY) {
			vm_map_lock(src_map);
			RETURN(result);
		}

		/*
		 *	Throw away the extra reference
		 */

		vm_object_deallocate(src_object);

		/*
		 *	Verify that the map has not substantially
		 *	changed while the copy was being made.
		 */

		vm_map_lock(src_map);

		if ((version.main_timestamp + 1) == src_map->timestamp) {
			/* src_map hasn't changed: src_entry is still valid */
			src_entry = saved_src_entry;
			goto VerificationSuccessful;
		}

		/*
		 *	Simple version comparison failed.
		 *
		 *	Retry the lookup and verify that the
		 *	same object/offset are still present.
		 *
		 *	[Note: a memory manager that colludes with
		 *	the calling task can detect that we have
		 *	cheated.  While the map was unlocked, the
		 *	mapping could have been changed and restored.]
		 */

		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
			if (result != KERN_MEMORY_RESTART_COPY) {
				vm_object_deallocate(VME_OBJECT(new_entry));
				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
				/* reset accounting state */
				new_entry->iokit_acct = FALSE;
				new_entry->use_pmap = TRUE;
			}
			RETURN(KERN_INVALID_ADDRESS);
		}

		src_entry = tmp_entry;
		vm_map_clip_start(src_map, src_entry, src_start);

		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
		    !use_maxprot) ||
		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
			goto VerificationFailed;
		}

		if (src_entry->vme_end < new_entry->vme_end) {
			/*
			 * This entry might have been shortened
			 * (vm_map_clip_end) or been replaced with
			 * an entry that ends closer to "src_start"
			 * than before.
			 * Adjust "new_entry" accordingly; copying
			 * less memory would be correct but we also
			 * redo the copy (see below) if the new entry
			 * no longer points at the same object/offset.
			 */
			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
			    VM_MAP_COPY_PAGE_MASK(copy)));
			new_entry->vme_end = src_entry->vme_end;
			src_size = new_entry->vme_end - src_start;
		} else if (src_entry->vme_end > new_entry->vme_end) {
			/*
			 * This entry might have been extended
			 * (vm_map_entry_simplify() or coalesce)
			 * or been replaced with an entry that ends farther
			 * from "src_start" than before.
			 *
			 * We've called vm_object_copy_*() only on
			 * the previous <start:end> range, so we can't
			 * just extend new_entry.  We have to re-do
			 * the copy based on the new entry as if it was
			 * pointing at a different object/offset (see
			 * "Verification failed" below).
			 */
		}

		if ((VME_OBJECT(src_entry) != src_object) ||
		    (VME_OFFSET(src_entry) != src_offset) ||
		    (src_entry->vme_end > new_entry->vme_end)) {
			/*
			 *	Verification failed.
			 *
			 *	Start over with this top-level entry.
			 */

VerificationFailed:     ;

			vm_object_deallocate(VME_OBJECT(new_entry));
			tmp_entry = src_entry;
			continue;
		}

		/*
		 *	Verification succeeded.
		 */

VerificationSuccessful:;

		if (result == KERN_MEMORY_RESTART_COPY) {
			goto RestartCopy;
		}

		/*
		 *	Copy succeeded.
		 */

CopySuccessful: ;

		/*
		 *	Link in the new copy entry.
		 */

		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
		    new_entry);

		/*
		 *	Determine whether the entire region
		 *	has been copied.
		 */
		src_base = src_start;
		src_start = new_entry->vme_end;
		new_entry = VM_MAP_ENTRY_NULL;
		/* pop back out of any submaps we've finished with */
		while ((src_start >= src_end) && (src_end != 0)) {
			submap_map_t    *ptr;

			if (src_map == base_map) {
				/* back to the top */
				break;
			}

			ptr = parent_maps;
			assert(ptr != NULL);
			parent_maps = parent_maps->next;

			/* fix up the damage we did in that submap */
			vm_map_simplify_range(src_map,
			    src_base,
			    src_end);

			vm_map_unlock(src_map);
			vm_map_deallocate(src_map);
			vm_map_lock(ptr->parent_map);
			src_map = ptr->parent_map;
			src_base = ptr->base_start;
			src_start = ptr->base_start + ptr->base_len;
			src_end = ptr->base_end;
			if (!vm_map_lookup_entry(src_map,
			    src_start,
			    &tmp_entry) &&
			    (src_end > src_start)) {
				RETURN(KERN_INVALID_ADDRESS);
			}
			kfree_type(submap_map_t, ptr);
			if (parent_maps == NULL) {
				map_share = FALSE;
			}
			src_entry = tmp_entry->vme_prev;
		}

		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
		    (src_start >= src_addr + len) &&
		    (src_addr + len != 0)) {
			/*
			 * Stop copying now, even though we haven't reached
			 * "src_end".  We'll adjust the end of the last copy
			 * entry at the end, if needed.
			 *
			 * If src_map's alignment is different from the
			 * system's page-alignment, there could be
			 * extra non-map-aligned map entries between
			 * the original (non-rounded) "src_addr + len"
			 * and the rounded "src_end".
			 * We do not want to copy those map entries since
			 * they're not part of the copied range.
			 */
			break;
		}

		if ((src_start >= src_end) && (src_end != 0)) {
			break;
		}

		/*
		 *	Verify that there are no gaps in the region
		 */

		tmp_entry = src_entry->vme_next;
		if ((tmp_entry->vme_start != src_start) ||
		    (tmp_entry == vm_map_to_entry(src_map))) {
			RETURN(KERN_INVALID_ADDRESS);
		}
	}

	/*
	 * If the source should be destroyed, do it now, since the
	 * copy was successful.
	 */
	if (src_destroy) {
		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;

		if (src_map == kernel_map) {
			remove_flags |= VM_MAP_REMOVE_KUNWIRE;
		}
		(void)vm_map_remove_and_unlock(src_map,
		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
		    src_end,
		    remove_flags,
		    KMEM_GUARD_NONE);
	} else {
		/* fix up the damage we did in the base map */
		vm_map_simplify_range(
			src_map,
			vm_map_trunc_page(src_addr,
			VM_MAP_PAGE_MASK(src_map)),
			vm_map_round_page(src_end,
			VM_MAP_PAGE_MASK(src_map)));
		vm_map_unlock(src_map);
	}

	tmp_entry = VM_MAP_ENTRY_NULL;

	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
		vm_map_offset_t original_start, original_offset, original_end;

		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);

		/* adjust alignment of first copy_entry's "vme_start" */
		tmp_entry = vm_map_copy_first_entry(copy);
		if (tmp_entry != vm_map_copy_to_entry(copy)) {
			vm_map_offset_t adjustment;

			original_start = tmp_entry->vme_start;
			original_offset = VME_OFFSET(tmp_entry);

			/* map-align the start of the first copy entry... */
			adjustment = (tmp_entry->vme_start -
			    vm_map_trunc_page(
				    tmp_entry->vme_start,
				    VM_MAP_PAGE_MASK(src_map)));
			tmp_entry->vme_start -= adjustment;
			VME_OFFSET_SET(tmp_entry,
			    VME_OFFSET(tmp_entry) - adjustment);
			copy_addr -= adjustment;
			assert(tmp_entry->vme_start < tmp_entry->vme_end);
			/* ... adjust for mis-aligned start of copy range */
			adjustment =
			    (vm_map_trunc_page(copy->offset,
			    PAGE_MASK) -
			    vm_map_trunc_page(copy->offset,
			    VM_MAP_PAGE_MASK(src_map)));
			if (adjustment) {
				assert(page_aligned(adjustment));
				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
				tmp_entry->vme_start += adjustment;
				VME_OFFSET_SET(tmp_entry,
				    (VME_OFFSET(tmp_entry) +
				    adjustment));
				copy_addr += adjustment;
				assert(tmp_entry->vme_start < tmp_entry->vme_end);
			}

			/*
			 * Assert that the adjustments haven't exposed
			 * more than was originally copied...
			 */
			assert(tmp_entry->vme_start >= original_start);
			assert(VME_OFFSET(tmp_entry) >= original_offset);
			/*
			 * ... and that it did not adjust outside of a
			 * single 16K page.
			 */
			assert(vm_map_trunc_page(tmp_entry->vme_start,
			    VM_MAP_PAGE_MASK(src_map)) ==
			    vm_map_trunc_page(original_start,
			    VM_MAP_PAGE_MASK(src_map)));
		}

		/* adjust alignment of last copy_entry's "vme_end" */
		tmp_entry = vm_map_copy_last_entry(copy);
		if (tmp_entry != vm_map_copy_to_entry(copy)) {
			vm_map_offset_t adjustment;

			original_end = tmp_entry->vme_end;

			/* map-align the end of the last copy entry... */
			tmp_entry->vme_end =
			    vm_map_round_page(tmp_entry->vme_end,
			    VM_MAP_PAGE_MASK(src_map));
			/* ... adjust for mis-aligned end of copy range */
			adjustment =
			    (vm_map_round_page((copy->offset +
			    copy->size),
			    VM_MAP_PAGE_MASK(src_map)) -
			    vm_map_round_page((copy->offset +
			    copy->size),
			    PAGE_MASK));
			if (adjustment) {
				assert(page_aligned(adjustment));
				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
				tmp_entry->vme_end -= adjustment;
				assert(tmp_entry->vme_start < tmp_entry->vme_end);
			}

			/*
			 * Assert that the adjustments haven't exposed
			 * more than was originally copied...
			 */
			assert(tmp_entry->vme_end <= original_end);
			/*
			 * ... and that it did not adjust outside of a
			 * single 16K page.
			 */
			assert(vm_map_round_page(tmp_entry->vme_end,
			    VM_MAP_PAGE_MASK(src_map)) ==
			    vm_map_round_page(original_end,
			    VM_MAP_PAGE_MASK(src_map)));
		}
	}

	/* Fix-up start and end points in copy.  This is necessary */
	/* when the various entries in the copy object were picked */
	/* up from different sub-maps */

	tmp_entry = vm_map_copy_first_entry(copy);
	copy_size = 0; /* compute actual size */
	while (tmp_entry != vm_map_copy_to_entry(copy)) {
		assert(VM_MAP_PAGE_ALIGNED(
			    copy_addr + (tmp_entry->vme_end -
			    tmp_entry->vme_start),
			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
		assert(VM_MAP_PAGE_ALIGNED(
			    copy_addr,
			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));

		/*
		 * The copy_entries will be injected directly into the
		 * destination map and might not be "map aligned" there...
		 */
		tmp_entry->map_aligned = FALSE;

		tmp_entry->vme_end = copy_addr +
		    (tmp_entry->vme_end - tmp_entry->vme_start);
		tmp_entry->vme_start = copy_addr;
		assert(tmp_entry->vme_start < tmp_entry->vme_end);
		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
	}

	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
	    copy_size < copy->size) {
		/*
		 * The actual size of the VM map copy is smaller than what
		 * was requested by the caller.  This must be because some
		 * PAGE_SIZE-sized pages are missing at the end of the last
		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
		 * The caller might not have been aware of those missing
		 * pages and might not want to be aware of it, which is
		 * fine as long as they don't try to access (and crash on)
		 * those missing pages.
		 * Let's adjust the size of the "copy", to avoid failing
		 * in vm_map_copyout() or vm_map_copy_overwrite().
		 */
		assert(vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(src_map)) ==
		    vm_map_round_page(copy->size,
		    VM_MAP_PAGE_MASK(src_map)));
		copy->size = copy_size;
	}

	*copy_result = copy;
	return KERN_SUCCESS;

#undef  RETURN
}
12485 
12486 kern_return_t
vm_map_copy_extract(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t do_copy,vm_map_copy_t * copy_result,vm_prot_t * cur_prot,vm_prot_t * max_prot,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)12487 vm_map_copy_extract(
12488 	vm_map_t                src_map,
12489 	vm_map_address_t        src_addr,
12490 	vm_map_size_t           len,
12491 	boolean_t               do_copy,
12492 	vm_map_copy_t           *copy_result,   /* OUT */
12493 	vm_prot_t               *cur_prot,      /* IN/OUT */
12494 	vm_prot_t               *max_prot,      /* IN/OUT */
12495 	vm_inherit_t            inheritance,
12496 	vm_map_kernel_flags_t   vmk_flags)
12497 {
12498 	vm_map_copy_t   copy;
12499 	kern_return_t   kr;
12500 	vm_prot_t required_cur_prot, required_max_prot;
12501 
12502 	/*
12503 	 *	Check for copies of zero bytes.
12504 	 */
12505 
12506 	if (len == 0) {
12507 		*copy_result = VM_MAP_COPY_NULL;
12508 		return KERN_SUCCESS;
12509 	}
12510 
12511 	/*
12512 	 *	Check that the end address doesn't overflow
12513 	 */
12514 	if (src_addr + len < src_addr) {
12515 		return KERN_INVALID_ADDRESS;
12516 	}
12517 
12518 	if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12519 		DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12520 	}
12521 
12522 	required_cur_prot = *cur_prot;
12523 	required_max_prot = *max_prot;
12524 
12525 	/*
12526 	 *	Allocate a header element for the list.
12527 	 *
12528 	 *	Use the start and end in the header to
12529 	 *	remember the endpoints prior to rounding.
12530 	 */
12531 
12532 	copy = vm_map_copy_allocate();
12533 	copy->type = VM_MAP_COPY_ENTRY_LIST;
12534 	copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12535 
12536 	vm_map_store_init(&copy->cpy_hdr);
12537 
12538 	copy->offset = 0;
12539 	copy->size = len;
12540 
12541 	kr = vm_map_remap_extract(src_map,
12542 	    src_addr,
12543 	    len,
12544 	    do_copy,             /* copy */
12545 	    &copy->cpy_hdr,
12546 	    cur_prot,            /* IN/OUT */
12547 	    max_prot,            /* IN/OUT */
12548 	    inheritance,
12549 	    vmk_flags);
12550 	if (kr != KERN_SUCCESS) {
12551 		vm_map_copy_discard(copy);
12552 		return kr;
12553 	}
12554 	if (required_cur_prot != VM_PROT_NONE) {
12555 		assert((*cur_prot & required_cur_prot) == required_cur_prot);
12556 		assert((*max_prot & required_max_prot) == required_max_prot);
12557 	}
12558 
12559 	*copy_result = copy;
12560 	return KERN_SUCCESS;
12561 }
12562 
12563 /*
12564  *	vm_map_copyin_object:
12565  *
12566  *	Create a copy object from an object.
12567  *	Our caller donates an object reference.
12568  */
12569 
12570 kern_return_t
vm_map_copyin_object(vm_object_t object,vm_object_offset_t offset,vm_object_size_t size,vm_map_copy_t * copy_result)12571 vm_map_copyin_object(
12572 	vm_object_t             object,
12573 	vm_object_offset_t      offset, /* offset of region in object */
12574 	vm_object_size_t        size,   /* size of region in object */
12575 	vm_map_copy_t   *copy_result)   /* OUT */
12576 {
12577 	vm_map_copy_t   copy;           /* Resulting copy */
12578 
12579 	/*
12580 	 *	We drop the object into a special copy object
12581 	 *	that contains the object directly.
12582 	 */
12583 
12584 	copy = vm_map_copy_allocate();
12585 	copy->type = VM_MAP_COPY_OBJECT;
12586 	copy->cpy_object = object;
12587 	copy->offset = offset;
12588 	copy->size = size;
12589 
12590 	*copy_result = copy;
12591 	return KERN_SUCCESS;
12592 }
12593 
/*
 *	vm_map_fork_share:
 *
 *	Handle a VM_INHERIT_SHARE entry during vm_map_fork(): clone
 *	"old_entry" from "old_map" into "new_map" so that parent and
 *	child share the same backing object (or submap), creating a
 *	shadow object first when required to preserve copy-on-write
 *	semantics.  Both entries end up marked "is_shared" and the new
 *	entry is linked at the end of "new_map".
 */
static void
vm_map_fork_share(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_object_t     object;
	vm_map_entry_t  new_entry;

	/*
	 *	New sharing code.  New map entry
	 *	references original object.  Internal
	 *	objects use asynchronous copy algorithm for
	 *	future copies.  First make sure we have
	 *	the right object.  If we need a shadow,
	 *	or someone else already has one, then
	 *	make a new shadow and share it.
	 */

	/* "object" is only set (and only used below) for non-submap entries */
	if (!old_entry->is_sub_map) {
		object = VME_OBJECT(old_entry);
	}

	if (old_entry->is_sub_map) {
		assert(old_entry->wired_count == 0);
#ifndef NO_NESTED_PMAP
#if !PMAP_FORK_NEST
		if (old_entry->use_pmap) {
			kern_return_t   result;

			/* share the nested submap's page tables with the child */
			result = pmap_nest(new_map->pmap,
			    (VME_SUBMAP(old_entry))->pmap,
			    (addr64_t)old_entry->vme_start,
			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
			if (result) {
				panic("vm_map_fork_share: pmap_nest failed!");
			}
		}
#endif /* !PMAP_FORK_NEST */
#endif  /* NO_NESTED_PMAP */
	} else if (object == VM_OBJECT_NULL) {
		/* no backing object yet: create one covering the whole entry */
		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
		    old_entry->vme_start));
		VME_OFFSET_SET(old_entry, 0);
		VME_OBJECT_SET(old_entry, object, false, 0);
		old_entry->use_pmap = TRUE;
//		assert(!old_entry->needs_copy);
	} else if (object->copy_strategy !=
	    MEMORY_OBJECT_COPY_SYMMETRIC) {
		/*
		 *	We are already using an asymmetric
		 *	copy, and therefore we already have
		 *	the right object.
		 */

		assert(!old_entry->needs_copy);
	} else if (old_entry->needs_copy ||       /* case 1 */
	    object->shadowed ||                 /* case 2 */
	    (!object->true_share &&             /* case 3 */
	    !old_entry->is_shared &&
	    (object->vo_size >
	    (vm_map_size_t)(old_entry->vme_end -
	    old_entry->vme_start)))) {
		/*
		 *	We need to create a shadow.
		 *	There are three cases here.
		 *	In the first case, we need to
		 *	complete a deferred symmetrical
		 *	copy that we participated in.
		 *	In the second and third cases,
		 *	we need to create the shadow so
		 *	that changes that we make to the
		 *	object do not interfere with
		 *	any symmetrical copies which
		 *	have occured (case 2) or which
		 *	might occur (case 3).
		 *
		 *	The first case is when we had
		 *	deferred shadow object creation
		 *	via the entry->needs_copy mechanism.
		 *	This mechanism only works when
		 *	only one entry points to the source
		 *	object, and we are about to create
		 *	a second entry pointing to the
		 *	same object. The problem is that
		 *	there is no way of mapping from
		 *	an object to the entries pointing
		 *	to it. (Deferred shadow creation
		 *	works with one entry because occurs
		 *	at fault time, and we walk from the
		 *	entry to the object when handling
		 *	the fault.)
		 *
		 *	The second case is when the object
		 *	to be shared has already been copied
		 *	with a symmetric copy, but we point
		 *	directly to the object without
		 *	needs_copy set in our entry. (This
		 *	can happen because different ranges
		 *	of an object can be pointed to by
		 *	different entries. In particular,
		 *	a single entry pointing to an object
		 *	can be split by a call to vm_inherit,
		 *	which, combined with task_create, can
		 *	result in the different entries
		 *	having different needs_copy values.)
		 *	The shadowed flag in the object allows
		 *	us to detect this case. The problem
		 *	with this case is that if this object
		 *	has or will have shadows, then we
		 *	must not perform an asymmetric copy
		 *	of this object, since such a copy
		 *	allows the object to be changed, which
		 *	will break the previous symmetrical
		 *	copies (which rely upon the object
		 *	not changing). In a sense, the shadowed
		 *	flag says "don't change this object".
		 *	We fix this by creating a shadow
		 *	object for this object, and sharing
		 *	that. This works because we are free
		 *	to change the shadow object (and thus
		 *	to use an asymmetric copy strategy);
		 *	this is also semantically correct,
		 *	since this object is temporary, and
		 *	therefore a copy of the object is
		 *	as good as the object itself. (This
		 *	is not true for permanent objects,
		 *	since the pager needs to see changes,
		 *	which won't happen if the changes
		 *	are made to a copy.)
		 *
		 *	The third case is when the object
		 *	to be shared has parts sticking
		 *	outside of the entry we're working
		 *	with, and thus may in the future
		 *	be subject to a symmetrical copy.
		 *	(This is a preemptive version of
		 *	case 2.)
		 */
		VME_OBJECT_SHADOW(old_entry,
		    (vm_map_size_t) (old_entry->vme_end -
		    old_entry->vme_start),
		    vm_map_always_shadow(old_map));

		/*
		 *	If we're making a shadow for other than
		 *	copy on write reasons, then we have
		 *	to remove write permission.
		 */

		if (!old_entry->needs_copy &&
		    (old_entry->protection & VM_PROT_WRITE)) {
			vm_prot_t prot;

			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));

			prot = old_entry->protection & ~VM_PROT_WRITE;

			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));

			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}


			/*
			 * If the map is used by other pmaps we cannot reach
			 * them all through old_map->pmap, so protect at the
			 * object level instead.
			 */
			if (old_map->mapped_in_other_pmaps) {
				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					PMAP_NULL,
					PAGE_SIZE,
					old_entry->vme_start,
					prot);
			} else {
				pmap_protect(old_map->pmap,
				    old_entry->vme_start,
				    old_entry->vme_end,
				    prot);
			}
		}

		old_entry->needs_copy = FALSE;
		object = VME_OBJECT(old_entry);
	}


	/*
	 *	If object was using a symmetric copy strategy,
	 *	change its copy strategy to the default
	 *	asymmetric copy strategy, which is copy_delay
	 *	in the non-norma case and copy_call in the
	 *	norma case. Bump the reference count for the
	 *	new entry.
	 */

	if (old_entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(old_entry));
	} else {
		vm_object_lock(object);
		vm_object_reference_locked(object);
		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
		vm_object_unlock(object);
	}

	/*
	 *	Clone the entry, using object ref from above.
	 *	Mark both entries as shared.
	 */

	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
	vm_map_entry_copy(old_map, new_entry, old_entry);
	old_entry->is_shared = TRUE;
	new_entry->is_shared = TRUE;

	/*
	 * We're dealing with a shared mapping, so the resulting mapping
	 * should inherit some of the original mapping's accounting settings.
	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
	 * "use_pmap" should stay the same as before (if it hasn't been reset
	 * to TRUE when we cleared "iokit_acct").
	 */
	assert(!new_entry->iokit_acct);

	/*
	 *	If old entry's inheritence is VM_INHERIT_NONE,
	 *	the new entry is for corpse fork, remove the
	 *	write permission from the new entry.
	 */
	if (old_entry->inheritance == VM_INHERIT_NONE) {
		new_entry->protection &= ~VM_PROT_WRITE;
		new_entry->max_protection &= ~VM_PROT_WRITE;
	}

	/*
	 *	Insert the entry into the new map -- we
	 *	know we're inserting at the end of the new
	 *	map.
	 */

	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);

	/*
	 *	Update the physical map
	 */

	if (old_entry->is_sub_map) {
		/* Bill Angell pmap support goes here */
	} else {
		/* pre-populate the child's pmap with the parent's translations */
		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
		    old_entry->vme_end - old_entry->vme_start,
		    old_entry->vme_start);
	}
}
12852 
/*
 *	vm_map_fork_copy:
 *
 *	Slow-path handling of a VM_INHERIT_COPY entry during vm_map_fork():
 *	copy the entry's range out of "old_map" with vm_map_copyin_internal()
 *	and insert the resulting entries at the end of "new_map".
 *
 *	Called with "old_map" locked; the lock is dropped while copying and
 *	reacquired before returning.  On return, *old_entry_p points at the
 *	next entry of "old_map" to process.  Returns TRUE if the range was
 *	copied (so the caller can account for its size), FALSE if it could
 *	not be copied and was skipped.
 */
static boolean_t
vm_map_fork_copy(
	vm_map_t        old_map,
	vm_map_entry_t  *old_entry_p,
	vm_map_t        new_map,
	int             vm_map_copyin_flags)
{
	vm_map_entry_t old_entry = *old_entry_p;
	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
	vm_map_offset_t start = old_entry->vme_start;
	vm_map_copy_t copy;
	vm_map_entry_t last = vm_map_last_entry(new_map);

	vm_map_unlock(old_map);
	/*
	 *	Use maxprot version of copyin because we
	 *	care about whether this memory can ever
	 *	be accessed, not just whether it's accessible
	 *	right now.
	 */
	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
	if (vm_map_copyin_internal(old_map, start, entry_size,
	    vm_map_copyin_flags, &copy)
	    != KERN_SUCCESS) {
		/*
		 *	The map might have changed while it
		 *	was unlocked, check it again.  Skip
		 *	any blank space or permanently
		 *	unreadable region.
		 */
		vm_map_lock(old_map);
		if (!vm_map_lookup_entry(old_map, start, &last) ||
		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
			last = last->vme_next;
		}
		*old_entry_p = last;

		/*
		 * XXX	For some error returns, want to
		 * XXX	skip to the next element.  Note
		 *	that INVALID_ADDRESS and
		 *	PROTECTION_FAILURE are handled above.
		 */

		return FALSE;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 *	Insert the copy into the new map
	 */
	vm_map_copy_insert(new_map, last, copy);

	/*
	 *	Pick up the traversal at the end of
	 *	the copied region.
	 */

	vm_map_lock(old_map);
	start += entry_size;
	if (!vm_map_lookup_entry(old_map, start, &last)) {
		/* no entry at "start": resume at the next one */
		last = last->vme_next;
	} else {
		if (last->vme_start == start) {
			/*
			 * No need to clip here and we don't
			 * want to cause any unnecessary
			 * unnesting...
			 */
		} else {
			vm_map_clip_start(old_map, last, start);
		}
	}
	*old_entry_p = last;

	return TRUE;
}
12935 
12936 #if PMAP_FORK_NEST
12937 #define PMAP_FORK_NEST_DEBUG 0
12938 static inline void
vm_map_fork_unnest(pmap_t new_pmap,vm_map_offset_t pre_nested_start,vm_map_offset_t pre_nested_end,vm_map_offset_t start,vm_map_offset_t end)12939 vm_map_fork_unnest(
12940 	pmap_t new_pmap,
12941 	vm_map_offset_t pre_nested_start,
12942 	vm_map_offset_t pre_nested_end,
12943 	vm_map_offset_t start,
12944 	vm_map_offset_t end)
12945 {
12946 	kern_return_t kr;
12947 	vm_map_offset_t nesting_mask, start_unnest, end_unnest;
12948 
12949 	assertf(pre_nested_start <= pre_nested_end,
12950 	    "pre_nested start 0x%llx end 0x%llx",
12951 	    (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
12952 	assertf(start <= end,
12953 	    "start 0x%llx end 0x%llx",
12954 	    (uint64_t) start, (uint64_t)end);
12955 
12956 	if (pre_nested_start == pre_nested_end) {
12957 		/* nothing was pre-nested: done */
12958 		return;
12959 	}
12960 	if (end <= pre_nested_start) {
12961 		/* fully before pre-nested range: done */
12962 		return;
12963 	}
12964 	if (start >= pre_nested_end) {
12965 		/* fully after pre-nested range: done */
12966 		return;
12967 	}
12968 	/* ignore parts of range outside of pre_nested range */
12969 	if (start < pre_nested_start) {
12970 		start = pre_nested_start;
12971 	}
12972 	if (end > pre_nested_end) {
12973 		end = pre_nested_end;
12974 	}
12975 	nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
12976 	start_unnest = start & ~nesting_mask;
12977 	end_unnest = (end + nesting_mask) & ~nesting_mask;
12978 	kr = pmap_unnest(new_pmap,
12979 	    (addr64_t)start_unnest,
12980 	    (uint64_t)(end_unnest - start_unnest));
12981 #if PMAP_FORK_NEST_DEBUG
12982 	printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
12983 #endif /* PMAP_FORK_NEST_DEBUG */
12984 	assertf(kr == KERN_SUCCESS,
12985 	    "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
12986 	    (uint64_t)start, (uint64_t)end, new_pmap,
12987 	    (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
12988 	    kr);
12989 }
12990 #endif /* PMAP_FORK_NEST */
12991 
12992 /*
12993  *	vm_map_fork:
12994  *
12995  *	Create and return a new map based on the old
12996  *	map, according to the inheritance values on the
12997  *	regions in that map and the options.
12998  *
12999  *	The source map must not be locked.
13000  */
vm_map_t
vm_map_fork(
	ledger_t        ledger,         /* ledger to charge the new pmap against */
	vm_map_t        old_map,        /* parent map; must be unlocked */
	int             options)        /* VM_MAP_FORK_* option bits */
{
	pmap_t          new_pmap;
	vm_map_t        new_map;
	vm_map_entry_t  old_entry;
	vm_map_size_t   new_size = 0, entry_size;
	vm_map_entry_t  new_entry;
	boolean_t       src_needs_copy;
	boolean_t       new_entry_needs_copy;
	boolean_t       pmap_is64bit;
	int             vm_map_copyin_flags;
	vm_inherit_t    old_entry_inheritance;
	int             map_create_options;
	kern_return_t   footprint_collect_kr;

	/* reject any option bits we don't understand */
	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
	    VM_MAP_FORK_PRESERVE_PURGEABLE |
	    VM_MAP_FORK_CORPSE_FOOTPRINT)) {
		/* unsupported option */
		return VM_MAP_NULL;
	}

	/* the child's pointer size matches the parent's */
	pmap_is64bit =
#if defined(__i386__) || defined(__x86_64__)
	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
#elif defined(__arm64__)
	    old_map->pmap->is_64bit;
#else
#error Unknown architecture.
#endif

	/* inherit JOP / Rosetta / 4K-page characteristics from the parent pmap */
	unsigned int pmap_flags = 0;
	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
#if defined(HAS_APPLE_PAC)
	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
#endif
#if CONFIG_ROSETTA
	pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
#endif
#if PMAP_CREATE_FORCE_4K_PAGES
	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
	    PAGE_SIZE != FOURK_PAGE_SIZE) {
		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
	}
#endif /* PMAP_CREATE_FORCE_4K_PAGES */
	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
	if (new_pmap == NULL) {
		return VM_MAP_NULL;
	}

	/* hold a reference on the parent map while we walk it locked */
	vm_map_reference(old_map);
	vm_map_lock(old_map);

	map_create_options = 0;
	if (old_map->hdr.entries_pageable) {
		map_create_options |= VM_MAP_CREATE_PAGEABLE;
	}
	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
		/* only meaningful (and only read) when this option is set */
		footprint_collect_kr = KERN_SUCCESS;
	}
	new_map = vm_map_create_options(new_pmap,
	    old_map->min_offset,
	    old_map->max_offset,
	    map_create_options);
	/* inherit cs_enforcement */
	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
	vm_map_lock(new_map);
	vm_commit_pagezero_status(new_map);
	/* inherit the parent map's page size */
	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));

#if CONFIG_MAP_RANGES
	/* inherit the parent map's VM ranges */
	vm_map_range_fork(new_map, old_map);
#endif
	/* ensure PMAP_CS structures are prepared for the fork */
	pmap_cs_fork_prepare(old_map->pmap, new_pmap);

#if PMAP_FORK_NEST
	/*
	 * Pre-nest the shared region's pmap.
	 */
	vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
	pmap_fork_nest(old_map->pmap, new_pmap,
	    &pre_nested_start, &pre_nested_end);
#if PMAP_FORK_NEST_DEBUG
	printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
#endif /* PMAP_FORK_NEST_DEBUG */
#endif /* PMAP_FORK_NEST */

	/*
	 * Walk the parent's entries, handling each according to its
	 * (possibly overridden) inheritance value.
	 */
	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
		/*
		 * Abort any corpse collection if the system is shutting down.
		 */
		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    get_system_inshutdown()) {
#if PMAP_FORK_NEST
			new_entry = vm_map_last_entry(new_map);
			if (new_entry == vm_map_to_entry(new_map)) {
				/* unnest all that was pre-nested */
				vm_map_fork_unnest(new_pmap,
				    pre_nested_start, pre_nested_end,
				    vm_map_min(new_map), vm_map_max(new_map));
			} else if (new_entry->vme_end < vm_map_max(new_map)) {
				/* unnest hole at the end, if pre-nested */
				vm_map_fork_unnest(new_pmap,
				    pre_nested_start, pre_nested_end,
				    new_entry->vme_end, vm_map_max(new_map));
			}
#endif /* PMAP_FORK_NEST */
			vm_map_corpse_footprint_collect_done(new_map);
			vm_map_unlock(new_map);
			vm_map_unlock(old_map);
			vm_map_deallocate(new_map);
			vm_map_deallocate(old_map);
			printf("Aborting corpse map due to system shutdown\n");
			return VM_MAP_NULL;
		}

		entry_size = old_entry->vme_end - old_entry->vme_start;

#if PMAP_FORK_NEST
		/*
		 * Undo any unnecessary pre-nesting.
		 */
		vm_map_offset_t prev_end;
		if (old_entry == vm_map_first_entry(old_map)) {
			prev_end = vm_map_min(old_map);
		} else {
			prev_end = old_entry->vme_prev->vme_end;
		}
		if (prev_end < old_entry->vme_start) {
			/* unnest hole before this entry, if pre-nested */
			vm_map_fork_unnest(new_pmap,
			    pre_nested_start, pre_nested_end,
			    prev_end, old_entry->vme_start);
		}
		if (old_entry->is_sub_map && old_entry->use_pmap) {
			/* keep this entry nested in the child */
#if PMAP_FORK_NEST_DEBUG
			printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
#endif /* PMAP_FORK_NEST_DEBUG */
		} else {
			/* undo nesting for this entry, if pre-nested */
			vm_map_fork_unnest(new_pmap,
			    pre_nested_start, pre_nested_end,
			    old_entry->vme_start, old_entry->vme_end);
		}
#endif /* PMAP_FORK_NEST */

		old_entry_inheritance = old_entry->inheritance;
		/*
		 * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option
		 * share VM_INHERIT_NONE entries that are not backed by a
		 * device pager.
		 */
		if (old_entry_inheritance == VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
		    (old_entry->protection & VM_PROT_READ) &&
		    !(!old_entry->is_sub_map &&
		    VME_OBJECT(old_entry) != NULL &&
		    VME_OBJECT(old_entry)->pager != NULL &&
		    is_device_pager_ops(
			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
			old_entry_inheritance = VM_INHERIT_SHARE;
		}

		if (old_entry_inheritance != VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    footprint_collect_kr == KERN_SUCCESS) {
			/*
			 * The corpse won't have old_map->pmap to query
			 * footprint information, so collect that data now
			 * and store it in new_map->vmmap_corpse_footprint
			 * for later autopsy.
			 */
			footprint_collect_kr =
			    vm_map_corpse_footprint_collect(old_map,
			    old_entry,
			    new_map);
		}

		switch (old_entry_inheritance) {
		case VM_INHERIT_NONE:
			/* not inherited by the child: nothing to do */
			break;

		case VM_INHERIT_SHARE:
			vm_map_fork_share(old_map, old_entry, new_map);
			new_size += entry_size;
			break;

		case VM_INHERIT_COPY:

			/*
			 *	Inline the copy_quickly case;
			 *	upon failure, fall back on call
			 *	to vm_map_fork_copy.
			 */

			if (old_entry->is_sub_map) {
				break;
			}
			if ((old_entry->wired_count != 0) ||
			    ((VME_OBJECT(old_entry) != NULL) &&
			    (VME_OBJECT(old_entry)->true_share))) {
				goto slow_vm_map_fork_copy;
			}

			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
			vm_map_entry_copy(old_map, new_entry, old_entry);
			if (old_entry->vme_permanent) {
				/* inherit "permanent" on fork() */
				new_entry->vme_permanent = TRUE;
			}

			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
				new_map->jit_entry_exists = TRUE;
			}

			if (new_entry->is_sub_map) {
				/* clear address space specifics */
				new_entry->use_pmap = FALSE;
			} else {
				/*
				 * We're dealing with a copy-on-write operation,
				 * so the resulting mapping should not inherit
				 * the original mapping's accounting settings.
				 * "iokit_acct" should have been cleared in
				 * vm_map_entry_copy().
				 * "use_pmap" should be reset to its default
				 * (TRUE) so that the new mapping gets
				 * accounted for in the task's memory footprint.
				 */
				assert(!new_entry->iokit_acct);
				new_entry->use_pmap = TRUE;
			}

			if (!vm_object_copy_quickly(
				    VME_OBJECT(new_entry),
				    VME_OFFSET(old_entry),
				    (old_entry->vme_end -
				    old_entry->vme_start),
				    &src_needs_copy,
				    &new_entry_needs_copy)) {
				vm_map_entry_dispose(new_entry);
				goto slow_vm_map_fork_copy;
			}

			/*
			 *	Handle copy-on-write obligations
			 */

			if (src_needs_copy && !old_entry->needs_copy) {
				vm_prot_t prot;

				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));

				/* write-protect the parent's mapping to force CoW faults */
				prot = old_entry->protection & ~VM_PROT_WRITE;

				if (override_nx(old_map, VME_ALIAS(old_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));

				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					((old_entry->is_shared
					|| old_map->mapped_in_other_pmaps)
					? PMAP_NULL :
					old_map->pmap),
					VM_MAP_PAGE_SIZE(old_map),
					old_entry->vme_start,
					prot);

				assert(old_entry->wired_count == 0);
				old_entry->needs_copy = TRUE;
			}
			new_entry->needs_copy = new_entry_needs_copy;

			/*
			 *	Insert the entry at the end
			 *	of the map.
			 */

			vm_map_store_entry_link(new_map,
			    vm_map_last_entry(new_map),
			    new_entry,
			    VM_MAP_KERNEL_FLAGS_NONE);
			new_size += entry_size;
			break;

slow_vm_map_fork_copy:
			vm_map_copyin_flags = 0;
			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
				vm_map_copyin_flags |=
				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
			}
			/* vm_map_fork_copy() advances old_entry itself */
			if (vm_map_fork_copy(old_map,
			    &old_entry,
			    new_map,
			    vm_map_copyin_flags)) {
				new_size += entry_size;
			}
			continue;
		}
		old_entry = old_entry->vme_next;
	}

#if PMAP_FORK_NEST
	/* undo pre-nesting beyond the child's last entry */
	new_entry = vm_map_last_entry(new_map);
	if (new_entry == vm_map_to_entry(new_map)) {
		/* unnest all that was pre-nested */
		vm_map_fork_unnest(new_pmap,
		    pre_nested_start, pre_nested_end,
		    vm_map_min(new_map), vm_map_max(new_map));
	} else if (new_entry->vme_end < vm_map_max(new_map)) {
		/* unnest hole at the end, if pre-nested */
		vm_map_fork_unnest(new_pmap,
		    pre_nested_start, pre_nested_end,
		    new_entry->vme_end, vm_map_max(new_map));
	}
#endif /* PMAP_FORK_NEST */

#if defined(__arm64__)
	pmap_insert_sharedpage(new_map->pmap);
#endif /* __arm64__ */

	new_map->size = new_size;

	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		vm_map_corpse_footprint_collect_done(new_map);
	}

	/* Propagate JIT entitlement for the pmap layer. */
	if (pmap_get_jit_entitled(old_map->pmap)) {
		/* Tell the pmap that it supports JIT. */
		pmap_set_jit_entitled(new_map->pmap);
	}

	vm_map_unlock(new_map);
	vm_map_unlock(old_map);
	vm_map_deallocate(old_map);

	return new_map;
}
13356 
13357 /*
13358  * vm_map_exec:
13359  *
13360  *      Setup the "new_map" with the proper execution environment according
13361  *	to the type of executable (platform, 64bit, chroot environment).
13362  *	Map the comm page and shared region, etc...
13363  */
13364 kern_return_t
vm_map_exec(vm_map_t new_map,task_t task,boolean_t is64bit,void * fsroot,cpu_type_t cpu,cpu_subtype_t cpu_subtype,boolean_t reslide,boolean_t is_driverkit,uint32_t rsr_version)13365 vm_map_exec(
13366 	vm_map_t        new_map,
13367 	task_t          task,
13368 	boolean_t       is64bit,
13369 	void            *fsroot,
13370 	cpu_type_t      cpu,
13371 	cpu_subtype_t   cpu_subtype,
13372 	boolean_t       reslide,
13373 	boolean_t       is_driverkit,
13374 	uint32_t        rsr_version)
13375 {
13376 	SHARED_REGION_TRACE_DEBUG(
13377 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13378 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13379 		(void *)VM_KERNEL_ADDRPERM(new_map),
13380 		(void *)VM_KERNEL_ADDRPERM(task),
13381 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13382 		cpu,
13383 		cpu_subtype));
13384 	(void) vm_commpage_enter(new_map, task, is64bit);
13385 
13386 	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13387 
13388 	SHARED_REGION_TRACE_DEBUG(
13389 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13390 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13391 		(void *)VM_KERNEL_ADDRPERM(new_map),
13392 		(void *)VM_KERNEL_ADDRPERM(task),
13393 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13394 		cpu,
13395 		cpu_subtype));
13396 
13397 	/*
13398 	 * Some devices have region(s) of memory that shouldn't get allocated by
13399 	 * user processes. The following code creates dummy vm_map_entry_t's for each
13400 	 * of the regions that needs to be reserved to prevent any allocations in
13401 	 * those regions.
13402 	 */
13403 	kern_return_t kr = KERN_FAILURE;
13404 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
13405 	vmk_flags.vmkf_permanent = TRUE;
13406 	vmk_flags.vmkf_beyond_max = TRUE;
13407 
13408 	struct vm_reserved_region *regions = NULL;
13409 	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13410 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13411 
13412 	for (size_t i = 0; i < num_regions; ++i) {
13413 		kr = vm_map_enter(
13414 			new_map,
13415 			&regions[i].vmrr_addr,
13416 			regions[i].vmrr_size,
13417 			(vm_map_offset_t)0,
13418 			VM_FLAGS_FIXED,
13419 			vmk_flags,
13420 			VM_KERN_MEMORY_NONE,
13421 			VM_OBJECT_NULL,
13422 			(vm_object_offset_t)0,
13423 			FALSE,
13424 			VM_PROT_NONE,
13425 			VM_PROT_NONE,
13426 			VM_INHERIT_COPY);
13427 
13428 		if (kr != KERN_SUCCESS) {
13429 			panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13430 		}
13431 	}
13432 
13433 	new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13434 
13435 	return KERN_SUCCESS;
13436 }
13437 
/*
 * Telemetry counters for the copy-on-write submap handling in
 * vm_map_lookup_and_lock_object() below.  Three copy paths are tracked:
 * "copy_slowly" (wired submap entry, copied via vm_object_copy_slowly),
 * "copy_strategically" (non-symmetric copy strategy, copied via
 * vm_object_copy_strategically) and "shadow" (symmetric copy strategy,
 * resolved by shadowing the submap object).  For each path:
 *   *_count   - number of times the path completed successfully
 *   *_size    - cumulative size (bytes) of the submap entries handled
 *   *_max     - largest single submap entry handled on that path
 *   *_restart - copies abandoned and retried because the map changed
 *   *_error   - copies that failed outright
 * (The shadow path cannot fail or restart, so it has no error/restart
 * counters.)
 */
uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13450 uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13451 /*
13452  *	vm_map_lookup_and_lock_object:
13453  *
13454  *	Finds the VM object, offset, and
13455  *	protection for a given virtual address in the
13456  *	specified map, assuming a page fault of the
13457  *	type specified.
13458  *
13459  *	Returns the (object, offset, protection) for
13460  *	this address, whether it is wired down, and whether
13461  *	this map has the only reference to the data in question.
13462  *	In order to later verify this lookup, a "version"
13463  *	is returned.
13464  *	If contended != NULL, *contended will be set to
13465  *	true iff the thread had to spin or block to acquire
13466  *	an exclusive lock.
13467  *
13468  *	The map MUST be locked by the caller and WILL be
13469  *	locked on exit.  In order to guarantee the
13470  *	existence of the returned object, it is returned
13471  *	locked.
13472  *
13473  *	If a lookup is requested with "write protection"
13474  *	specified, the map may be changed to perform virtual
13475  *	copying operations, although the data referenced will
13476  *	remain the same.
13477  */
13478 kern_return_t
vm_map_lookup_and_lock_object(vm_map_t * var_map,vm_map_offset_t vaddr,vm_prot_t fault_type,int object_lock_type,vm_map_version_t * out_version,vm_object_t * object,vm_object_offset_t * offset,vm_prot_t * out_prot,boolean_t * wired,vm_object_fault_info_t fault_info,vm_map_t * real_map,bool * contended)13479 vm_map_lookup_and_lock_object(
13480 	vm_map_t                *var_map,       /* IN/OUT */
13481 	vm_map_offset_t         vaddr,
13482 	vm_prot_t               fault_type,
13483 	int                     object_lock_type,
13484 	vm_map_version_t        *out_version,   /* OUT */
13485 	vm_object_t             *object,        /* OUT */
13486 	vm_object_offset_t      *offset,        /* OUT */
13487 	vm_prot_t               *out_prot,      /* OUT */
13488 	boolean_t               *wired,         /* OUT */
13489 	vm_object_fault_info_t  fault_info,     /* OUT */
13490 	vm_map_t                *real_map,      /* OUT */
13491 	bool                    *contended)     /* OUT */
13492 {
13493 	vm_map_entry_t                  entry;
13494 	vm_map_t                        map = *var_map;
13495 	vm_map_t                        old_map = *var_map;
13496 	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
13497 	vm_map_offset_t                 cow_parent_vaddr = 0;
13498 	vm_map_offset_t                 old_start = 0;
13499 	vm_map_offset_t                 old_end = 0;
13500 	vm_prot_t                       prot;
13501 	boolean_t                       mask_protections;
13502 	boolean_t                       force_copy;
13503 	boolean_t                       no_force_copy_if_executable;
13504 	boolean_t                       submap_needed_copy;
13505 	vm_prot_t                       original_fault_type;
13506 	vm_map_size_t                   fault_page_mask;
13507 
13508 	/*
13509 	 * VM_PROT_MASK means that the caller wants us to use "fault_type"
13510 	 * as a mask against the mapping's actual protections, not as an
13511 	 * absolute value.
13512 	 */
13513 	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13514 	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13515 	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13516 	fault_type &= VM_PROT_ALL;
13517 	original_fault_type = fault_type;
13518 	if (contended) {
13519 		*contended = false;
13520 	}
13521 
13522 	*real_map = map;
13523 
13524 	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13525 	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13526 
13527 RetryLookup:
13528 	fault_type = original_fault_type;
13529 
13530 	/*
13531 	 *	If the map has an interesting hint, try it before calling
13532 	 *	full blown lookup routine.
13533 	 */
13534 	entry = map->hint;
13535 
13536 	if ((entry == vm_map_to_entry(map)) ||
13537 	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13538 		vm_map_entry_t  tmp_entry;
13539 
13540 		/*
13541 		 *	Entry was either not a valid hint, or the vaddr
13542 		 *	was not contained in the entry, so do a full lookup.
13543 		 */
13544 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13545 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13546 				vm_map_unlock(cow_sub_map_parent);
13547 			}
13548 			if ((*real_map != map)
13549 			    && (*real_map != cow_sub_map_parent)) {
13550 				vm_map_unlock(*real_map);
13551 			}
13552 			return KERN_INVALID_ADDRESS;
13553 		}
13554 
13555 		entry = tmp_entry;
13556 	}
13557 	if (map == old_map) {
13558 		old_start = entry->vme_start;
13559 		old_end = entry->vme_end;
13560 	}
13561 
13562 	/*
13563 	 *	Handle submaps.  Drop lock on upper map, submap is
13564 	 *	returned locked.
13565 	 */
13566 
13567 	submap_needed_copy = FALSE;
13568 submap_recurse:
13569 	if (entry->is_sub_map) {
13570 		vm_map_offset_t         local_vaddr;
13571 		vm_map_offset_t         end_delta;
13572 		vm_map_offset_t         start_delta;
13573 		vm_map_entry_t          submap_entry, saved_submap_entry;
13574 		vm_object_offset_t      submap_entry_offset;
13575 		vm_object_size_t        submap_entry_size;
13576 		vm_prot_t               subentry_protection;
13577 		vm_prot_t               subentry_max_protection;
13578 		boolean_t               subentry_no_copy_on_read;
13579 		boolean_t               subentry_permanent;
13580 		boolean_t               subentry_pmap_cs_associated;
13581 		boolean_t               mapped_needs_copy = FALSE;
13582 		vm_map_version_t        version;
13583 
13584 		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13585 		    "map %p (%d) entry %p submap %p (%d)\n",
13586 		    map, VM_MAP_PAGE_SHIFT(map), entry,
13587 		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
13588 
13589 		local_vaddr = vaddr;
13590 
13591 		if ((entry->use_pmap &&
13592 		    !((fault_type & VM_PROT_WRITE) ||
13593 		    force_copy))) {
13594 			/* if real_map equals map we unlock below */
13595 			if ((*real_map != map) &&
13596 			    (*real_map != cow_sub_map_parent)) {
13597 				vm_map_unlock(*real_map);
13598 			}
13599 			*real_map = VME_SUBMAP(entry);
13600 		}
13601 
13602 		if (entry->needs_copy &&
13603 		    ((fault_type & VM_PROT_WRITE) ||
13604 		    force_copy)) {
13605 			if (!mapped_needs_copy) {
13606 				if (vm_map_lock_read_to_write(map)) {
13607 					vm_map_lock_read(map);
13608 					*real_map = map;
13609 					goto RetryLookup;
13610 				}
13611 				vm_map_lock_read(VME_SUBMAP(entry));
13612 				*var_map = VME_SUBMAP(entry);
13613 				cow_sub_map_parent = map;
13614 				/* reset base to map before cow object */
13615 				/* this is the map which will accept   */
13616 				/* the new cow object */
13617 				old_start = entry->vme_start;
13618 				old_end = entry->vme_end;
13619 				cow_parent_vaddr = vaddr;
13620 				mapped_needs_copy = TRUE;
13621 			} else {
13622 				vm_map_lock_read(VME_SUBMAP(entry));
13623 				*var_map = VME_SUBMAP(entry);
13624 				if ((cow_sub_map_parent != map) &&
13625 				    (*real_map != map)) {
13626 					vm_map_unlock(map);
13627 				}
13628 			}
13629 		} else {
13630 			if (entry->needs_copy) {
13631 				submap_needed_copy = TRUE;
13632 			}
13633 			vm_map_lock_read(VME_SUBMAP(entry));
13634 			*var_map = VME_SUBMAP(entry);
13635 			/* leave map locked if it is a target */
13636 			/* cow sub_map above otherwise, just  */
13637 			/* follow the maps down to the object */
13638 			/* here we unlock knowing we are not  */
13639 			/* revisiting the map.  */
13640 			if ((*real_map != map) && (map != cow_sub_map_parent)) {
13641 				vm_map_unlock_read(map);
13642 			}
13643 		}
13644 
13645 		map = *var_map;
13646 
13647 		/* calculate the offset in the submap for vaddr */
13648 		local_vaddr = (local_vaddr - entry->vme_start) + VME_OFFSET(entry);
13649 		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
13650 		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
13651 		    (uint64_t)local_vaddr, (uint64_t)entry->vme_start, (uint64_t)fault_page_mask);
13652 
13653 RetrySubMap:
13654 		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
13655 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13656 				vm_map_unlock(cow_sub_map_parent);
13657 			}
13658 			if ((*real_map != map)
13659 			    && (*real_map != cow_sub_map_parent)) {
13660 				vm_map_unlock(*real_map);
13661 			}
13662 			*real_map = map;
13663 			return KERN_INVALID_ADDRESS;
13664 		}
13665 
13666 		/* find the attenuated shadow of the underlying object */
13667 		/* on our target map */
13668 
13669 		/* in english the submap object may extend beyond the     */
13670 		/* region mapped by the entry or, may only fill a portion */
13671 		/* of it.  For our purposes, we only care if the object   */
13672 		/* doesn't fill.  In this case the area which will        */
13673 		/* ultimately be clipped in the top map will only need    */
13674 		/* to be as big as the portion of the underlying entry    */
13675 		/* which is mapped */
13676 		start_delta = submap_entry->vme_start > VME_OFFSET(entry) ?
13677 		    submap_entry->vme_start - VME_OFFSET(entry) : 0;
13678 
13679 		end_delta =
13680 		    (VME_OFFSET(entry) + start_delta + (old_end - old_start)) <=
13681 		    submap_entry->vme_end ?
13682 		    0 : (VME_OFFSET(entry) +
13683 		    (old_end - old_start))
13684 		    - submap_entry->vme_end;
13685 
13686 		old_start += start_delta;
13687 		old_end -= end_delta;
13688 
13689 		if (submap_entry->is_sub_map) {
13690 			entry = submap_entry;
13691 			vaddr = local_vaddr;
13692 			goto submap_recurse;
13693 		}
13694 
13695 		if (((fault_type & VM_PROT_WRITE) ||
13696 		    force_copy)
13697 		    && cow_sub_map_parent) {
13698 			vm_object_t     sub_object, copy_object;
13699 			vm_object_offset_t copy_offset;
13700 			vm_map_offset_t local_start;
13701 			vm_map_offset_t local_end;
13702 			boolean_t       object_copied = FALSE;
13703 			vm_object_offset_t object_copied_offset = 0;
13704 			boolean_t       object_copied_needs_copy = FALSE;
13705 			kern_return_t   kr = KERN_SUCCESS;
13706 
13707 			if (vm_map_lock_read_to_write(map)) {
13708 				vm_map_lock_read(map);
13709 				old_start -= start_delta;
13710 				old_end += end_delta;
13711 				goto RetrySubMap;
13712 			}
13713 
13714 
13715 			sub_object = VME_OBJECT(submap_entry);
13716 			if (sub_object == VM_OBJECT_NULL) {
13717 				sub_object =
13718 				    vm_object_allocate(
13719 					(vm_map_size_t)
13720 					(submap_entry->vme_end -
13721 					submap_entry->vme_start));
13722 				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
13723 				VME_OFFSET_SET(submap_entry, 0);
13724 				assert(!submap_entry->is_sub_map);
13725 				assert(submap_entry->use_pmap);
13726 			}
13727 			local_start =  local_vaddr -
13728 			    (cow_parent_vaddr - old_start);
13729 			local_end = local_vaddr +
13730 			    (old_end - cow_parent_vaddr);
13731 			vm_map_clip_start(map, submap_entry, local_start);
13732 			vm_map_clip_end(map, submap_entry, local_end);
13733 			if (submap_entry->is_sub_map) {
13734 				/* unnesting was done when clipping */
13735 				assert(!submap_entry->use_pmap);
13736 			}
13737 
13738 			/* This is the COW case, lets connect */
13739 			/* an entry in our space to the underlying */
13740 			/* object in the submap, bypassing the  */
13741 			/* submap. */
13742 			submap_entry_offset = VME_OFFSET(submap_entry);
13743 			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
13744 
13745 			if ((submap_entry->wired_count != 0 ||
13746 			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
13747 			    (submap_entry->protection & VM_PROT_EXECUTE) &&
13748 			    no_force_copy_if_executable) {
13749 //				printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
13750 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13751 					vm_map_unlock(cow_sub_map_parent);
13752 				}
13753 				if ((*real_map != map)
13754 				    && (*real_map != cow_sub_map_parent)) {
13755 					vm_map_unlock(*real_map);
13756 				}
13757 				*real_map = map;
13758 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
13759 				vm_map_lock_write_to_read(map);
13760 				kr = KERN_PROTECTION_FAILURE;
13761 				DTRACE_VM4(submap_no_copy_executable,
13762 				    vm_map_t, map,
13763 				    vm_object_offset_t, submap_entry_offset,
13764 				    vm_object_size_t, submap_entry_size,
13765 				    int, kr);
13766 				return kr;
13767 			}
13768 
13769 			if (submap_entry->wired_count != 0) {
13770 				vm_object_reference(sub_object);
13771 
13772 				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
13773 				    "submap_entry %p offset 0x%llx\n",
13774 				    submap_entry, VME_OFFSET(submap_entry));
13775 
13776 				DTRACE_VM6(submap_copy_slowly,
13777 				    vm_map_t, cow_sub_map_parent,
13778 				    vm_map_offset_t, vaddr,
13779 				    vm_map_t, map,
13780 				    vm_object_size_t, submap_entry_size,
13781 				    int, submap_entry->wired_count,
13782 				    int, sub_object->copy_strategy);
13783 
13784 				saved_submap_entry = submap_entry;
13785 				version.main_timestamp = map->timestamp;
13786 				vm_map_unlock(map); /* Increments timestamp by 1 */
13787 				submap_entry = VM_MAP_ENTRY_NULL;
13788 
13789 				vm_object_lock(sub_object);
13790 				kr = vm_object_copy_slowly(sub_object,
13791 				    submap_entry_offset,
13792 				    submap_entry_size,
13793 				    FALSE,
13794 				    &copy_object);
13795 				object_copied = TRUE;
13796 				object_copied_offset = 0;
13797 				/* 4k: account for extra offset in physical page */
13798 				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
13799 				object_copied_needs_copy = FALSE;
13800 				vm_object_deallocate(sub_object);
13801 
13802 				vm_map_lock(map);
13803 
13804 				if (kr != KERN_SUCCESS &&
13805 				    kr != KERN_MEMORY_RESTART_COPY) {
13806 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13807 						vm_map_unlock(cow_sub_map_parent);
13808 					}
13809 					if ((*real_map != map)
13810 					    && (*real_map != cow_sub_map_parent)) {
13811 						vm_map_unlock(*real_map);
13812 					}
13813 					*real_map = map;
13814 					vm_object_deallocate(copy_object);
13815 					copy_object = VM_OBJECT_NULL;
13816 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
13817 					vm_map_lock_write_to_read(map);
13818 					DTRACE_VM4(submap_copy_error_slowly,
13819 					    vm_object_t, sub_object,
13820 					    vm_object_offset_t, submap_entry_offset,
13821 					    vm_object_size_t, submap_entry_size,
13822 					    int, kr);
13823 					vm_map_lookup_and_lock_object_copy_slowly_error++;
13824 					return kr;
13825 				}
13826 
13827 				if ((kr == KERN_SUCCESS) &&
13828 				    (version.main_timestamp + 1) == map->timestamp) {
13829 					submap_entry = saved_submap_entry;
13830 				} else {
13831 					saved_submap_entry = NULL;
13832 					old_start -= start_delta;
13833 					old_end += end_delta;
13834 					vm_object_deallocate(copy_object);
13835 					copy_object = VM_OBJECT_NULL;
13836 					vm_map_lock_write_to_read(map);
13837 					vm_map_lookup_and_lock_object_copy_slowly_restart++;
13838 					goto RetrySubMap;
13839 				}
13840 				vm_map_lookup_and_lock_object_copy_slowly_count++;
13841 				vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
13842 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
13843 					vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
13844 				}
13845 			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
13846 				submap_entry_offset = VME_OFFSET(submap_entry);
13847 				copy_object = VM_OBJECT_NULL;
13848 				object_copied_offset = submap_entry_offset;
13849 				object_copied_needs_copy = FALSE;
13850 				DTRACE_VM6(submap_copy_strategically,
13851 				    vm_map_t, cow_sub_map_parent,
13852 				    vm_map_offset_t, vaddr,
13853 				    vm_map_t, map,
13854 				    vm_object_size_t, submap_entry_size,
13855 				    int, submap_entry->wired_count,
13856 				    int, sub_object->copy_strategy);
13857 				kr = vm_object_copy_strategically(
13858 					sub_object,
13859 					submap_entry_offset,
13860 					submap_entry->vme_end - submap_entry->vme_start,
13861 					&copy_object,
13862 					&object_copied_offset,
13863 					&object_copied_needs_copy);
13864 				if (kr == KERN_MEMORY_RESTART_COPY) {
13865 					old_start -= start_delta;
13866 					old_end += end_delta;
13867 					vm_object_deallocate(copy_object);
13868 					copy_object = VM_OBJECT_NULL;
13869 					vm_map_lock_write_to_read(map);
13870 					vm_map_lookup_and_lock_object_copy_strategically_restart++;
13871 					goto RetrySubMap;
13872 				}
13873 				if (kr != KERN_SUCCESS) {
13874 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13875 						vm_map_unlock(cow_sub_map_parent);
13876 					}
13877 					if ((*real_map != map)
13878 					    && (*real_map != cow_sub_map_parent)) {
13879 						vm_map_unlock(*real_map);
13880 					}
13881 					*real_map = map;
13882 					vm_object_deallocate(copy_object);
13883 					copy_object = VM_OBJECT_NULL;
13884 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
13885 					vm_map_lock_write_to_read(map);
13886 					DTRACE_VM4(submap_copy_error_strategically,
13887 					    vm_object_t, sub_object,
13888 					    vm_object_offset_t, submap_entry_offset,
13889 					    vm_object_size_t, submap_entry_size,
13890 					    int, kr);
13891 					vm_map_lookup_and_lock_object_copy_strategically_error++;
13892 					return kr;
13893 				}
13894 				assert(copy_object != VM_OBJECT_NULL);
13895 				assert(copy_object != sub_object);
13896 				object_copied = TRUE;
13897 				vm_map_lookup_and_lock_object_copy_strategically_count++;
13898 				vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
13899 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
13900 					vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
13901 				}
13902 			} else {
13903 				/* set up shadow object */
13904 				object_copied = FALSE;
13905 				copy_object = sub_object;
13906 				vm_object_lock(sub_object);
13907 				vm_object_reference_locked(sub_object);
13908 				sub_object->shadowed = TRUE;
13909 				vm_object_unlock(sub_object);
13910 
13911 				assert(submap_entry->wired_count == 0);
13912 				submap_entry->needs_copy = TRUE;
13913 
13914 				prot = submap_entry->protection;
13915 				assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
13916 				prot = prot & ~VM_PROT_WRITE;
13917 				assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
13918 
13919 				if (override_nx(old_map,
13920 				    VME_ALIAS(submap_entry))
13921 				    && prot) {
13922 					prot |= VM_PROT_EXECUTE;
13923 				}
13924 
13925 				vm_object_pmap_protect(
13926 					sub_object,
13927 					VME_OFFSET(submap_entry),
13928 					submap_entry->vme_end -
13929 					submap_entry->vme_start,
13930 					(submap_entry->is_shared
13931 					|| map->mapped_in_other_pmaps) ?
13932 					PMAP_NULL : map->pmap,
13933 					VM_MAP_PAGE_SIZE(map),
13934 					submap_entry->vme_start,
13935 					prot);
13936 				vm_map_lookup_and_lock_object_copy_shadow_count++;
13937 				vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
13938 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
13939 					vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
13940 				}
13941 			}
13942 
13943 			/*
13944 			 * Adjust the fault offset to the submap entry.
13945 			 */
13946 			copy_offset = (local_vaddr -
13947 			    submap_entry->vme_start +
13948 			    VME_OFFSET(submap_entry));
13949 
			/* This works differently than */
13951 			/* normal submap case. We go back  */
13952 			/* to the parent of the cow map and*/
13953 			/* clip out the target portion of  */
13954 			/* the sub_map, substituting the   */
13955 			/* new copy object,                */
13956 
13957 			subentry_protection = submap_entry->protection;
13958 			subentry_max_protection = submap_entry->max_protection;
13959 			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
13960 			subentry_permanent = submap_entry->vme_permanent;
13961 			subentry_pmap_cs_associated = submap_entry->pmap_cs_associated;
13962 
13963 			vm_map_unlock(map);
13964 			submap_entry = NULL; /* not valid after map unlock */
13965 
13966 			local_start = old_start;
13967 			local_end = old_end;
13968 			map = cow_sub_map_parent;
13969 			*var_map = cow_sub_map_parent;
13970 			vaddr = cow_parent_vaddr;
13971 			cow_sub_map_parent = NULL;
13972 
13973 			if (!vm_map_lookup_entry(map,
13974 			    vaddr, &entry)) {
13975 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13976 					vm_map_unlock(cow_sub_map_parent);
13977 				}
13978 				if ((*real_map != map)
13979 				    && (*real_map != cow_sub_map_parent)) {
13980 					vm_map_unlock(*real_map);
13981 				}
13982 				*real_map = map;
13983 				vm_object_deallocate(
13984 					copy_object);
13985 				copy_object = VM_OBJECT_NULL;
13986 				vm_map_lock_write_to_read(map);
13987 				DTRACE_VM4(submap_lookup_post_unlock,
13988 				    uint64_t, (uint64_t)entry->vme_start,
13989 				    uint64_t, (uint64_t)entry->vme_end,
13990 				    vm_map_offset_t, vaddr,
13991 				    int, object_copied);
13992 				return KERN_INVALID_ADDRESS;
13993 			}
13994 
13995 			/* clip out the portion of space */
13996 			/* mapped by the sub map which   */
13997 			/* corresponds to the underlying */
13998 			/* object */
13999 
14000 			/*
14001 			 * Clip (and unnest) the smallest nested chunk
14002 			 * possible around the faulting address...
14003 			 */
14004 			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14005 			local_end = local_start + pmap_shared_region_size_min(map->pmap);
14006 			/*
14007 			 * ... but don't go beyond the "old_start" to "old_end"
14008 			 * range, to avoid spanning over another VM region
14009 			 * with a possibly different VM object and/or offset.
14010 			 */
14011 			if (local_start < old_start) {
14012 				local_start = old_start;
14013 			}
14014 			if (local_end > old_end) {
14015 				local_end = old_end;
14016 			}
14017 			/*
14018 			 * Adjust copy_offset to the start of the range.
14019 			 */
14020 			copy_offset -= (vaddr - local_start);
14021 
14022 			vm_map_clip_start(map, entry, local_start);
14023 			vm_map_clip_end(map, entry, local_end);
14024 			if (entry->is_sub_map) {
14025 				/* unnesting was done when clipping */
14026 				assert(!entry->use_pmap);
14027 			}
14028 
14029 			/* substitute copy object for */
14030 			/* shared map entry           */
14031 			vm_map_deallocate(VME_SUBMAP(entry));
14032 			assert(!entry->iokit_acct);
14033 			entry->use_pmap = TRUE;
14034 			VME_OBJECT_SET(entry, copy_object, false, 0);
14035 
14036 			/* propagate the submap entry's protections */
14037 			if (entry->protection != VM_PROT_READ) {
14038 				/*
14039 				 * Someone has already altered the top entry's
14040 				 * protections via vm_protect(VM_PROT_COPY).
14041 				 * Respect these new values and ignore the
14042 				 * submap entry's protections.
14043 				 */
14044 			} else {
14045 				/*
14046 				 * Regular copy-on-write: propagate the submap
14047 				 * entry's protections to the top map entry.
14048 				 */
14049 				entry->protection |= subentry_protection;
14050 			}
14051 			entry->max_protection |= subentry_max_protection;
14052 			/* propagate some attributes from subentry */
14053 			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14054 			entry->vme_permanent = subentry_permanent;
14055 			entry->pmap_cs_associated = subentry_pmap_cs_associated;
14056 
14057 			if ((entry->protection & VM_PROT_WRITE) &&
14058 			    (entry->protection & VM_PROT_EXECUTE) &&
14059 #if XNU_TARGET_OS_OSX
14060 			    map->pmap != kernel_pmap &&
14061 			    (vm_map_cs_enforcement(map)
14062 #if __arm64__
14063 			    || !VM_MAP_IS_EXOTIC(map)
14064 #endif /* __arm64__ */
14065 			    ) &&
14066 #endif /* XNU_TARGET_OS_OSX */
14067 			    !(entry->used_for_jit) &&
14068 			    VM_MAP_POLICY_WX_STRIP_X(map)) {
14069 				DTRACE_VM3(cs_wx,
14070 				    uint64_t, (uint64_t)entry->vme_start,
14071 				    uint64_t, (uint64_t)entry->vme_end,
14072 				    vm_prot_t, entry->protection);
14073 				printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
14074 				    proc_selfpid(),
14075 				    (get_bsdtask_info(current_task())
14076 				    ? proc_name_address(get_bsdtask_info(current_task()))
14077 				    : "?"),
14078 				    __FUNCTION__);
14079 				entry->protection &= ~VM_PROT_EXECUTE;
14080 			}
14081 
14082 			if (object_copied) {
14083 				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14084 				entry->needs_copy = object_copied_needs_copy;
14085 				entry->is_shared = FALSE;
14086 			} else {
14087 				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14088 				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14089 				assert(entry->wired_count == 0);
14090 				VME_OFFSET_SET(entry, copy_offset);
14091 				entry->needs_copy = TRUE;
14092 				if (map != old_map) {
14093 					entry->is_shared = TRUE;
14094 				}
14095 			}
14096 			if (entry->inheritance == VM_INHERIT_SHARE) {
14097 				entry->inheritance = VM_INHERIT_COPY;
14098 			}
14099 
14100 			vm_map_lock_write_to_read(map);
14101 		} else {
14102 			if ((cow_sub_map_parent)
14103 			    && (cow_sub_map_parent != *real_map)
14104 			    && (cow_sub_map_parent != map)) {
14105 				vm_map_unlock(cow_sub_map_parent);
14106 			}
14107 			entry = submap_entry;
14108 			vaddr = local_vaddr;
14109 		}
14110 	}
14111 
14112 	/*
14113 	 *	Check whether this task is allowed to have
14114 	 *	this page.
14115 	 */
14116 
14117 	prot = entry->protection;
14118 
14119 	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14120 		/*
14121 		 * HACK -- if not a stack, then allow execution
14122 		 */
14123 		prot |= VM_PROT_EXECUTE;
14124 	}
14125 
14126 	if (mask_protections) {
14127 		fault_type &= prot;
14128 		if (fault_type == VM_PROT_NONE) {
14129 			goto protection_failure;
14130 		}
14131 	}
14132 	if (((fault_type & prot) != fault_type)
14133 #if __arm64__
14134 	    /* prefetch abort in execute-only page */
14135 	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14136 #elif defined(__x86_64__)
14137 	    /* Consider the UEXEC bit when handling an EXECUTE fault */
14138 	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14139 #endif
14140 	    ) {
14141 protection_failure:
14142 		if (*real_map != map) {
14143 			vm_map_unlock(*real_map);
14144 		}
14145 		*real_map = map;
14146 
14147 		if ((fault_type & VM_PROT_EXECUTE) && prot) {
14148 			log_stack_execution_failure((addr64_t)vaddr, prot);
14149 		}
14150 
14151 		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14152 		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14153 		/*
14154 		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14155 		 *
14156 		 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14157 		 */
14158 		return KERN_PROTECTION_FAILURE;
14159 	}
14160 
14161 	/*
14162 	 *	If this page is not pageable, we have to get
14163 	 *	it for all possible accesses.
14164 	 */
14165 
14166 	*wired = (entry->wired_count != 0);
14167 	if (*wired) {
14168 		fault_type = prot;
14169 	}
14170 
14171 	/*
14172 	 *	If the entry was copy-on-write, we either ...
14173 	 */
14174 
14175 	if (entry->needs_copy) {
14176 		/*
14177 		 *	If we want to write the page, we may as well
14178 		 *	handle that now since we've got the map locked.
14179 		 *
14180 		 *	If we don't need to write the page, we just
14181 		 *	demote the permissions allowed.
14182 		 */
14183 
14184 		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14185 			/*
14186 			 *	Make a new object, and place it in the
14187 			 *	object chain.  Note that no new references
14188 			 *	have appeared -- one just moved from the
14189 			 *	map to the new object.
14190 			 */
14191 
14192 			if (vm_map_lock_read_to_write(map)) {
14193 				vm_map_lock_read(map);
14194 				goto RetryLookup;
14195 			}
14196 
14197 			if (VME_OBJECT(entry)->shadowed == FALSE) {
14198 				vm_object_lock(VME_OBJECT(entry));
14199 				VME_OBJECT(entry)->shadowed = TRUE;
14200 				vm_object_unlock(VME_OBJECT(entry));
14201 			}
14202 			VME_OBJECT_SHADOW(entry,
14203 			    (vm_map_size_t) (entry->vme_end -
14204 			    entry->vme_start),
14205 			    vm_map_always_shadow(map));
14206 			entry->needs_copy = FALSE;
14207 
14208 			vm_map_lock_write_to_read(map);
14209 		}
14210 		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14211 			/*
14212 			 *	We're attempting to read a copy-on-write
14213 			 *	page -- don't allow writes.
14214 			 */
14215 
14216 			prot &= (~VM_PROT_WRITE);
14217 		}
14218 	}
14219 
14220 	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14221 		/*
14222 		 * We went through a "needs_copy" submap without triggering
14223 		 * a copy, so granting write access to the page would bypass
14224 		 * that submap's "needs_copy".
14225 		 */
14226 		assert(!(fault_type & VM_PROT_WRITE));
14227 		assert(!*wired);
14228 		assert(!force_copy);
14229 		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14230 		prot &= ~VM_PROT_WRITE;
14231 	}
14232 
14233 	/*
14234 	 *	Create an object if necessary.
14235 	 */
14236 	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14237 		if (vm_map_lock_read_to_write(map)) {
14238 			vm_map_lock_read(map);
14239 			goto RetryLookup;
14240 		}
14241 
14242 		VME_OBJECT_SET(entry,
14243 		    vm_object_allocate(
14244 			    (vm_map_size_t)(entry->vme_end -
14245 			    entry->vme_start)), false, 0);
14246 		VME_OFFSET_SET(entry, 0);
14247 		assert(entry->use_pmap);
14248 		vm_map_lock_write_to_read(map);
14249 	}
14250 
14251 	/*
14252 	 *	Return the object/offset from this entry.  If the entry
14253 	 *	was copy-on-write or empty, it has been fixed up.  Also
14254 	 *	return the protection.
14255 	 */
14256 
14257 	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14258 	*object = VME_OBJECT(entry);
14259 	*out_prot = prot;
14260 	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14261 
14262 	if (fault_info) {
14263 		fault_info->interruptible = THREAD_UNINT; /* for now... */
14264 		/* ... the caller will change "interruptible" if needed */
14265 		fault_info->cluster_size = 0;
14266 		fault_info->user_tag = VME_ALIAS(entry);
14267 		fault_info->pmap_options = 0;
14268 		if (entry->iokit_acct ||
14269 		    (!entry->is_sub_map && !entry->use_pmap)) {
14270 			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14271 		}
14272 		fault_info->behavior = entry->behavior;
14273 		fault_info->lo_offset = VME_OFFSET(entry);
14274 		fault_info->hi_offset =
14275 		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14276 		fault_info->no_cache  = entry->no_cache;
14277 		fault_info->stealth = FALSE;
14278 		fault_info->io_sync = FALSE;
14279 		if (entry->used_for_jit ||
14280 		    entry->vme_resilient_codesign) {
14281 			fault_info->cs_bypass = TRUE;
14282 		} else {
14283 			fault_info->cs_bypass = FALSE;
14284 		}
14285 		fault_info->pmap_cs_associated = FALSE;
14286 #if CONFIG_PMAP_CS
14287 		if (entry->pmap_cs_associated) {
14288 			/*
14289 			 * The pmap layer will validate this page
14290 			 * before allowing it to be executed from.
14291 			 */
14292 			fault_info->pmap_cs_associated = TRUE;
14293 		}
14294 #endif /* CONFIG_PMAP_CS */
14295 		fault_info->mark_zf_absent = FALSE;
14296 		fault_info->batch_pmap_op = FALSE;
14297 		fault_info->resilient_media = entry->vme_resilient_media;
14298 		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14299 		if (entry->translated_allow_execute) {
14300 			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14301 		}
14302 	}
14303 
14304 	/*
14305 	 *	Lock the object to prevent it from disappearing
14306 	 */
14307 	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14308 		if (contended == NULL) {
14309 			vm_object_lock(*object);
14310 		} else {
14311 			*contended = vm_object_lock_check_contended(*object);
14312 		}
14313 	} else {
14314 		vm_object_lock_shared(*object);
14315 	}
14316 
14317 	/*
14318 	 *	Save the version number
14319 	 */
14320 
14321 	out_version->main_timestamp = map->timestamp;
14322 
14323 	return KERN_SUCCESS;
14324 }
14325 
14326 
14327 /*
14328  *	vm_map_verify:
14329  *
14330  *	Verifies that the map in question has not changed
14331  *	since the given version. The map has to be locked
14332  *	("shared" mode is fine) before calling this function
14333  *	and it will be returned locked too.
14334  */
14335 boolean_t
vm_map_verify(vm_map_t map,vm_map_version_t * version)14336 vm_map_verify(
14337 	vm_map_t                map,
14338 	vm_map_version_t        *version)       /* REF */
14339 {
14340 	boolean_t       result;
14341 
14342 	vm_map_lock_assert_held(map);
14343 	result = (map->timestamp == version->main_timestamp);
14344 
14345 	return result;
14346 }
14347 
14348 /*
14349  *	TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14350  *	Goes away after regular vm_region_recurse function migrates to
14351  *	64 bits
14352  *	vm_region_recurse: A form of vm_region which follows the
14353  *	submaps in a target map
14354  *
14355  */
14356 
/*
 * Gather information about the VM region containing (or, if unmapped,
 * immediately following) "*address", descending through nested submaps
 * down to at most "*nesting_depth" levels.  Depending on "*count", either
 * the full "submap_info" or the short form (same buffer, reinterpreted)
 * is filled in.  On success, "*address"/"*size" describe the region found
 * and "*nesting_depth" the submap depth at which it was found.
 */
kern_return_t
vm_map_region_recurse_64(
	vm_map_t                 map,
	vm_map_offset_t *address,               /* IN/OUT */
	vm_map_size_t           *size,                  /* OUT */
	natural_t               *nesting_depth, /* IN/OUT */
	vm_region_submap_info_64_t      submap_info,    /* IN/OUT */
	mach_msg_type_number_t  *count) /* IN/OUT */
{
	mach_msg_type_number_t  original_count;
	vm_region_extended_info_data_t  extended;
	vm_map_entry_t                  tmp_entry;
	vm_map_offset_t                 user_address;
	unsigned int                    user_max_depth;

	/*
	 * "curr_entry" is the VM map entry preceding or including the
	 * address we're looking for.
	 * "curr_map" is the map or sub-map containing "curr_entry".
	 * "curr_address" is the equivalent of the top map's "user_address"
	 * in the current map.
	 * "curr_offset" is the cumulated offset of "curr_map" in the
	 * target task's address space.
	 * "curr_depth" is the depth of "curr_map" in the chain of
	 * sub-maps.
	 *
	 * "curr_max_below" and "curr_max_above" limit the range (around
	 * "curr_address") we should take into account in the current (sub)map.
	 * They limit the range to what's visible through the map entries
	 * we've traversed from the top map to the current map.
	 *
	 */
	vm_map_entry_t                  curr_entry;
	vm_map_address_t                curr_address;
	vm_map_offset_t                 curr_offset;
	vm_map_t                        curr_map;
	unsigned int                    curr_depth;
	vm_map_offset_t                 curr_max_below, curr_max_above;
	vm_map_offset_t                 curr_skip;

	/*
	 * "next_" is the same as "curr_" but for the VM region immediately
	 * after the address we're looking for.  We need to keep track of this
	 * too because we want to return info about that region if the
	 * address we're looking for is not mapped.
	 */
	vm_map_entry_t                  next_entry;
	vm_map_offset_t                 next_offset;
	vm_map_offset_t                 next_address;
	vm_map_t                        next_map;
	unsigned int                    next_depth;
	vm_map_offset_t                 next_max_below, next_max_above;
	vm_map_offset_t                 next_skip;

	boolean_t                       look_for_pages;
	vm_region_submap_short_info_64_t short_info;
	boolean_t                       do_region_footprint;
	int                             effective_page_size, effective_page_shift;
	boolean_t                       submap_needed_copy;

	if (map == VM_MAP_NULL) {
		/* no address space to work on */
		return KERN_INVALID_ARGUMENT;
	}

	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);

	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
		/*
		 * "info" structure is not big enough and
		 * would overflow
		 */
		return KERN_INVALID_ARGUMENT;
	}

	do_region_footprint = task_self_region_footprint();
	original_count = *count;

	/*
	 * Pick the output format the caller's buffer can hold: the short
	 * form, or the full form at the highest version that fits.
	 */
	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
		look_for_pages = FALSE;
		short_info = (vm_region_submap_short_info_64_t) submap_info;
		submap_info = NULL;
	} else {
		look_for_pages = TRUE;
		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
		short_info = NULL;

		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
		}
		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
		}
	}

	user_address = *address;
	user_max_depth = *nesting_depth;
	submap_needed_copy = FALSE;

	if (not_in_kdp) {
		vm_map_lock_read(map);
	}

	/* (re)start the traversal from the top map at "user_address" */
recurse_again:
	curr_entry = NULL;
	curr_map = map;
	curr_address = user_address;
	curr_offset = 0;
	curr_skip = 0;
	curr_depth = 0;
	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
	curr_max_below = curr_address;

	next_entry = NULL;
	next_map = NULL;
	next_address = 0;
	next_offset = 0;
	next_skip = 0;
	next_depth = 0;
	next_max_above = (vm_map_offset_t) -1;
	next_max_below = (vm_map_offset_t) -1;

	/*
	 * Walk down the submap chain, hand-over-hand locking each level
	 * (read locks), until we reach a leaf entry or "user_max_depth".
	 */
	for (;;) {
		if (vm_map_lookup_entry(curr_map,
		    curr_address,
		    &tmp_entry)) {
			/* tmp_entry contains the address we're looking for */
			curr_entry = tmp_entry;
		} else {
			vm_map_offset_t skip;
			/*
			 * The address is not mapped.  "tmp_entry" is the
			 * map entry preceding the address.  We want the next
			 * one, if it exists.
			 */
			curr_entry = tmp_entry->vme_next;

			if (curr_entry == vm_map_to_entry(curr_map) ||
			    (curr_entry->vme_start >=
			    curr_address + curr_max_above)) {
				/* no next entry at this level: stop looking */
				if (not_in_kdp) {
					vm_map_unlock_read(curr_map);
				}
				curr_entry = NULL;
				curr_map = NULL;
				curr_skip = 0;
				curr_offset = 0;
				curr_depth = 0;
				curr_max_above = 0;
				curr_max_below = 0;
				break;
			}

			/* adjust current address and offset */
			skip = curr_entry->vme_start - curr_address;
			curr_address = curr_entry->vme_start;
			curr_skip += skip;
			curr_offset += skip;
			curr_max_above -= skip;
			curr_max_below = 0;
		}

		/*
		 * Is the next entry at this level closer to the address (or
		 * deeper in the submap chain) than the one we had
		 * so far ?
		 */
		tmp_entry = curr_entry->vme_next;
		if (tmp_entry == vm_map_to_entry(curr_map)) {
			/* no next entry at this level */
		} else if (tmp_entry->vme_start >=
		    curr_address + curr_max_above) {
			/*
			 * tmp_entry is beyond the scope of what we mapped of
			 * this submap in the upper level: ignore it.
			 */
		} else if ((next_entry == NULL) ||
		    (tmp_entry->vme_start + curr_offset <=
		    next_entry->vme_start + next_offset)) {
			/*
			 * We didn't have a "next_entry" or this one is
			 * closer to the address we're looking for:
			 * use this "tmp_entry" as the new "next_entry".
			 */
			if (next_entry != NULL) {
				/* unlock the last "next_map" */
				if (next_map != curr_map && not_in_kdp) {
					vm_map_unlock_read(next_map);
				}
			}
			next_entry = tmp_entry;
			next_map = curr_map;
			next_depth = curr_depth;
			next_address = next_entry->vme_start;
			next_skip = curr_skip;
			next_skip += (next_address - curr_address);
			next_offset = curr_offset;
			next_offset += (next_address - curr_address);
			next_max_above = MIN(next_max_above, curr_max_above);
			next_max_above = MIN(next_max_above,
			    next_entry->vme_end - next_address);
			next_max_below = MIN(next_max_below, curr_max_below);
			next_max_below = MIN(next_max_below,
			    next_address - next_entry->vme_start);
		}

		/*
		 * "curr_max_{above,below}" allow us to keep track of the
		 * portion of the submap that is actually mapped at this level:
		 * the rest of that submap is irrelevant to us, since it's not
		 * mapped here.
		 * The relevant portion of the map starts at
		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
		 */
		curr_max_above = MIN(curr_max_above,
		    curr_entry->vme_end - curr_address);
		curr_max_below = MIN(curr_max_below,
		    curr_address - curr_entry->vme_start);

		if (!curr_entry->is_sub_map ||
		    curr_depth >= user_max_depth) {
			/*
			 * We hit a leaf map or we reached the maximum depth
			 * we could, so stop looking.  Keep the current map
			 * locked.
			 */
			break;
		}

		/*
		 * Get down to the next submap level.
		 */

		if (curr_entry->needs_copy) {
			/* everything below this is effectively copy-on-write */
			submap_needed_copy = TRUE;
		}

		/*
		 * Lock the next level and unlock the current level,
		 * unless we need to keep it locked to access the "next_entry"
		 * later.
		 */
		if (not_in_kdp) {
			vm_map_lock_read(VME_SUBMAP(curr_entry));
		}
		if (curr_map == next_map) {
			/* keep "next_map" locked in case we need it */
		} else {
			/* release this map */
			if (not_in_kdp) {
				vm_map_unlock_read(curr_map);
			}
		}

		/*
		 * Adjust the offset.  "curr_entry" maps the submap
		 * at relative address "curr_entry->vme_start" in the
		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
		 * bytes of the submap.
		 * "curr_offset" always represents the offset of a virtual
		 * address in the curr_map relative to the absolute address
		 * space (i.e. the top-level VM map).
		 */
		curr_offset +=
		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
		curr_address = user_address + curr_offset;
		/* switch to the submap */
		curr_map = VME_SUBMAP(curr_entry);
		curr_depth++;
		curr_entry = NULL;
	}

// LP64todo: all the current tools are 32bit, obviously never worked for 64b
// so probably should be a real 32b ID vs. ptr.
// Current users just check for equality

	if (curr_entry == NULL) {
		/* no VM region contains the address... */

		if (do_region_footprint && /* we want footprint numbers */
		    next_entry == NULL && /* & there are no more regions */
		    /* & we haven't already provided our fake region: */
		    user_address <= vm_map_last_entry(map)->vme_end) {
			ledger_amount_t ledger_resident, ledger_compressed;

			/*
			 * Add a fake memory region to account for
			 * purgeable and/or ledger-tagged memory that
			 * counts towards this task's memory footprint,
			 * i.e. the resident/compressed pages of non-volatile
			 * objects owned by that task.
			 */
			task_ledgers_footprint(map->pmap->ledger,
			    &ledger_resident,
			    &ledger_compressed);
			if (ledger_resident + ledger_compressed == 0) {
				/* no purgeable memory usage to report */
				return KERN_INVALID_ADDRESS;
			}
			/* fake region to show nonvolatile footprint */
			if (look_for_pages) {
				submap_info->protection = VM_PROT_DEFAULT;
				submap_info->max_protection = VM_PROT_DEFAULT;
				submap_info->inheritance = VM_INHERIT_DEFAULT;
				submap_info->offset = 0;
				submap_info->user_tag = -1;
				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
				submap_info->pages_shared_now_private = 0;
				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
				submap_info->pages_dirtied = submap_info->pages_resident;
				submap_info->ref_count = 1;
				submap_info->shadow_depth = 0;
				submap_info->external_pager = 0;
				submap_info->share_mode = SM_PRIVATE;
				if (submap_needed_copy) {
					submap_info->share_mode = SM_COW;
				}
				submap_info->is_submap = 0;
				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				submap_info->user_wired_count = 0;
				submap_info->pages_reusable = 0;
			} else {
				short_info->user_tag = -1;
				short_info->offset = 0;
				short_info->protection = VM_PROT_DEFAULT;
				short_info->inheritance = VM_INHERIT_DEFAULT;
				short_info->max_protection = VM_PROT_DEFAULT;
				short_info->behavior = VM_BEHAVIOR_DEFAULT;
				short_info->user_wired_count = 0;
				short_info->is_submap = 0;
				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				short_info->external_pager = 0;
				short_info->shadow_depth = 0;
				short_info->share_mode = SM_PRIVATE;
				if (submap_needed_copy) {
					short_info->share_mode = SM_COW;
				}
				short_info->ref_count = 1;
			}
			*nesting_depth = 0;
			*size = (vm_map_size_t) (ledger_resident + ledger_compressed);
//			*address = user_address;
			*address = vm_map_last_entry(map)->vme_end;
			return KERN_SUCCESS;
		}

		if (next_entry == NULL) {
			/* ... and no VM region follows it either */
			return KERN_INVALID_ADDRESS;
		}
		/* ... gather info about the next VM region */
		curr_entry = next_entry;
		curr_map = next_map;    /* still locked ... */
		curr_address = next_address;
		curr_skip = next_skip;
		curr_offset = next_offset;
		curr_depth = next_depth;
		curr_max_above = next_max_above;
		curr_max_below = next_max_below;
	} else {
		/* we won't need "next_entry" after all */
		if (next_entry != NULL) {
			/* release "next_map" */
			if (next_map != curr_map && not_in_kdp) {
				vm_map_unlock_read(next_map);
			}
		}
	}
	next_entry = NULL;
	next_map = NULL;
	next_offset = 0;
	next_skip = 0;
	next_depth = 0;
	next_max_below = -1;
	next_max_above = -1;

	if (curr_entry->is_sub_map &&
	    curr_depth < user_max_depth) {
		/*
		 * We're not as deep as we could be:  we must have
		 * gone back up after not finding anything mapped
		 * below the original top-level map entry's.
		 * Let's move "curr_address" forward and recurse again.
		 */
		user_address = curr_address;
		goto recurse_again;
	}

	/* report the region we settled on; "curr_map" is still read-locked */
	*nesting_depth = curr_depth;
	*size = curr_max_above + curr_max_below;
	*address = user_address + curr_skip - curr_max_below;

	if (look_for_pages) {
		submap_info->user_tag = VME_ALIAS(curr_entry);
		submap_info->offset = VME_OFFSET(curr_entry);
		submap_info->protection = curr_entry->protection;
		submap_info->inheritance = curr_entry->inheritance;
		submap_info->max_protection = curr_entry->max_protection;
		submap_info->behavior = curr_entry->behavior;
		submap_info->user_wired_count = curr_entry->user_wired_count;
		submap_info->is_submap = curr_entry->is_sub_map;
		if (curr_entry->is_sub_map) {
			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
		} else {
			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
		}
	} else {
		short_info->user_tag = VME_ALIAS(curr_entry);
		short_info->offset = VME_OFFSET(curr_entry);
		short_info->protection = curr_entry->protection;
		short_info->inheritance = curr_entry->inheritance;
		short_info->max_protection = curr_entry->max_protection;
		short_info->behavior = curr_entry->behavior;
		short_info->user_wired_count = curr_entry->user_wired_count;
		short_info->is_submap = curr_entry->is_sub_map;
		if (curr_entry->is_sub_map) {
			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
		} else {
			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
		}
	}

	extended.pages_resident = 0;
	extended.pages_swapped_out = 0;
	extended.pages_shared_now_private = 0;
	extended.pages_dirtied = 0;
	extended.pages_reusable = 0;
	extended.external_pager = 0;
	extended.shadow_depth = 0;
	extended.share_mode = SM_EMPTY;
	extended.ref_count = 0;

	if (not_in_kdp) {
		if (!curr_entry->is_sub_map) {
			vm_map_offset_t range_start, range_end;
			range_start = MAX((curr_address - curr_max_below),
			    curr_entry->vme_start);
			range_end = MIN((curr_address + curr_max_above),
			    curr_entry->vme_end);
			vm_map_region_walk(curr_map,
			    range_start,
			    curr_entry,
			    (VME_OFFSET(curr_entry) +
			    (range_start -
			    curr_entry->vme_start)),
			    range_end - range_start,
			    &extended,
			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
			if (extended.external_pager &&
			    extended.ref_count == 2 &&
			    extended.share_mode == SM_SHARED) {
				extended.share_mode = SM_PRIVATE;
			}
			if (submap_needed_copy) {
				extended.share_mode = SM_COW;
			}
		} else {
			if (curr_entry->use_pmap) {
				extended.share_mode = SM_TRUESHARED;
			} else {
				extended.share_mode = SM_PRIVATE;
			}
			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
		}
	}

	if (look_for_pages) {
		submap_info->pages_resident = extended.pages_resident;
		submap_info->pages_swapped_out = extended.pages_swapped_out;
		submap_info->pages_shared_now_private =
		    extended.pages_shared_now_private;
		submap_info->pages_dirtied = extended.pages_dirtied;
		submap_info->external_pager = extended.external_pager;
		submap_info->shadow_depth = extended.shadow_depth;
		submap_info->share_mode = extended.share_mode;
		submap_info->ref_count = extended.ref_count;

		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
			submap_info->pages_reusable = extended.pages_reusable;
		}
		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
			if (curr_entry->is_sub_map) {
				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_SUBMAP(curr_entry));
			} else if (VME_OBJECT(curr_entry)) {
				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_OBJECT(curr_entry));
			} else {
				submap_info->object_id_full = 0ull;
			}
		}
	} else {
		short_info->external_pager = extended.external_pager;
		short_info->shadow_depth = extended.shadow_depth;
		short_info->share_mode = extended.share_mode;
		short_info->ref_count = extended.ref_count;
	}

	if (not_in_kdp) {
		vm_map_unlock_read(curr_map);
	}

	return KERN_SUCCESS;
}
14864 
14865 /*
14866  *	vm_region:
14867  *
14868  *	User call to obtain information about a region in
14869  *	a task's address map. Currently, only one flavor is
14870  *	supported.
14871  *
14872  *	XXX The reserved and behavior fields cannot be filled
14873  *	    in until the vm merge from the IK is completed, and
14874  *	    vm_reserve is implemented.
14875  */
14876 
14877 kern_return_t
vm_map_region(vm_map_t map,vm_map_offset_t * address,vm_map_size_t * size,vm_region_flavor_t flavor,vm_region_info_t info,mach_msg_type_number_t * count,mach_port_t * object_name)14878 vm_map_region(
14879 	vm_map_t                 map,
14880 	vm_map_offset_t *address,               /* IN/OUT */
14881 	vm_map_size_t           *size,                  /* OUT */
14882 	vm_region_flavor_t       flavor,                /* IN */
14883 	vm_region_info_t         info,                  /* OUT */
14884 	mach_msg_type_number_t  *count, /* IN/OUT */
14885 	mach_port_t             *object_name)           /* OUT */
14886 {
14887 	vm_map_entry_t          tmp_entry;
14888 	vm_map_entry_t          entry;
14889 	vm_map_offset_t         start;
14890 
14891 	if (map == VM_MAP_NULL) {
14892 		return KERN_INVALID_ARGUMENT;
14893 	}
14894 
14895 	switch (flavor) {
14896 	case VM_REGION_BASIC_INFO:
14897 		/* legacy for old 32-bit objects info */
14898 	{
14899 		vm_region_basic_info_t  basic;
14900 
14901 		if (*count < VM_REGION_BASIC_INFO_COUNT) {
14902 			return KERN_INVALID_ARGUMENT;
14903 		}
14904 
14905 		basic = (vm_region_basic_info_t) info;
14906 		*count = VM_REGION_BASIC_INFO_COUNT;
14907 
14908 		vm_map_lock_read(map);
14909 
14910 		start = *address;
14911 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
14912 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
14913 				vm_map_unlock_read(map);
14914 				return KERN_INVALID_ADDRESS;
14915 			}
14916 		} else {
14917 			entry = tmp_entry;
14918 		}
14919 
14920 		start = entry->vme_start;
14921 
14922 		basic->offset = (uint32_t)VME_OFFSET(entry);
14923 		basic->protection = entry->protection;
14924 		basic->inheritance = entry->inheritance;
14925 		basic->max_protection = entry->max_protection;
14926 		basic->behavior = entry->behavior;
14927 		basic->user_wired_count = entry->user_wired_count;
14928 		basic->reserved = entry->is_sub_map;
14929 		*address = start;
14930 		*size = (entry->vme_end - start);
14931 
14932 		if (object_name) {
14933 			*object_name = IP_NULL;
14934 		}
14935 		if (entry->is_sub_map) {
14936 			basic->shared = FALSE;
14937 		} else {
14938 			basic->shared = entry->is_shared;
14939 		}
14940 
14941 		vm_map_unlock_read(map);
14942 		return KERN_SUCCESS;
14943 	}
14944 
14945 	case VM_REGION_BASIC_INFO_64:
14946 	{
14947 		vm_region_basic_info_64_t       basic;
14948 
14949 		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
14950 			return KERN_INVALID_ARGUMENT;
14951 		}
14952 
14953 		basic = (vm_region_basic_info_64_t) info;
14954 		*count = VM_REGION_BASIC_INFO_COUNT_64;
14955 
14956 		vm_map_lock_read(map);
14957 
14958 		start = *address;
14959 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
14960 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
14961 				vm_map_unlock_read(map);
14962 				return KERN_INVALID_ADDRESS;
14963 			}
14964 		} else {
14965 			entry = tmp_entry;
14966 		}
14967 
14968 		start = entry->vme_start;
14969 
14970 		basic->offset = VME_OFFSET(entry);
14971 		basic->protection = entry->protection;
14972 		basic->inheritance = entry->inheritance;
14973 		basic->max_protection = entry->max_protection;
14974 		basic->behavior = entry->behavior;
14975 		basic->user_wired_count = entry->user_wired_count;
14976 		basic->reserved = entry->is_sub_map;
14977 		*address = start;
14978 		*size = (entry->vme_end - start);
14979 
14980 		if (object_name) {
14981 			*object_name = IP_NULL;
14982 		}
14983 		if (entry->is_sub_map) {
14984 			basic->shared = FALSE;
14985 		} else {
14986 			basic->shared = entry->is_shared;
14987 		}
14988 
14989 		vm_map_unlock_read(map);
14990 		return KERN_SUCCESS;
14991 	}
14992 	case VM_REGION_EXTENDED_INFO:
14993 		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
14994 			return KERN_INVALID_ARGUMENT;
14995 		}
14996 		OS_FALLTHROUGH;
14997 	case VM_REGION_EXTENDED_INFO__legacy:
14998 		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
14999 			return KERN_INVALID_ARGUMENT;
15000 		}
15001 
15002 		{
15003 			vm_region_extended_info_t       extended;
15004 			mach_msg_type_number_t original_count;
15005 			int effective_page_size, effective_page_shift;
15006 
15007 			extended = (vm_region_extended_info_t) info;
15008 
15009 			effective_page_shift = vm_self_region_page_shift(map);
15010 			effective_page_size = (1 << effective_page_shift);
15011 
15012 			vm_map_lock_read(map);
15013 
15014 			start = *address;
15015 			if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15016 				if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15017 					vm_map_unlock_read(map);
15018 					return KERN_INVALID_ADDRESS;
15019 				}
15020 			} else {
15021 				entry = tmp_entry;
15022 			}
15023 			start = entry->vme_start;
15024 
15025 			extended->protection = entry->protection;
15026 			extended->user_tag = VME_ALIAS(entry);
15027 			extended->pages_resident = 0;
15028 			extended->pages_swapped_out = 0;
15029 			extended->pages_shared_now_private = 0;
15030 			extended->pages_dirtied = 0;
15031 			extended->external_pager = 0;
15032 			extended->shadow_depth = 0;
15033 
15034 			original_count = *count;
15035 			if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15036 				*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15037 			} else {
15038 				extended->pages_reusable = 0;
15039 				*count = VM_REGION_EXTENDED_INFO_COUNT;
15040 			}
15041 
15042 			vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15043 
15044 			if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
15045 				extended->share_mode = SM_PRIVATE;
15046 			}
15047 
15048 			if (object_name) {
15049 				*object_name = IP_NULL;
15050 			}
15051 			*address = start;
15052 			*size = (entry->vme_end - start);
15053 
15054 			vm_map_unlock_read(map);
15055 			return KERN_SUCCESS;
15056 		}
15057 	case VM_REGION_TOP_INFO:
15058 	{
15059 		vm_region_top_info_t    top;
15060 
15061 		if (*count < VM_REGION_TOP_INFO_COUNT) {
15062 			return KERN_INVALID_ARGUMENT;
15063 		}
15064 
15065 		top = (vm_region_top_info_t) info;
15066 		*count = VM_REGION_TOP_INFO_COUNT;
15067 
15068 		vm_map_lock_read(map);
15069 
15070 		start = *address;
15071 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15072 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15073 				vm_map_unlock_read(map);
15074 				return KERN_INVALID_ADDRESS;
15075 			}
15076 		} else {
15077 			entry = tmp_entry;
15078 		}
15079 		start = entry->vme_start;
15080 
15081 		top->private_pages_resident = 0;
15082 		top->shared_pages_resident = 0;
15083 
15084 		vm_map_region_top_walk(entry, top);
15085 
15086 		if (object_name) {
15087 			*object_name = IP_NULL;
15088 		}
15089 		*address = start;
15090 		*size = (entry->vme_end - start);
15091 
15092 		vm_map_unlock_read(map);
15093 		return KERN_SUCCESS;
15094 	}
15095 	default:
15096 		return KERN_INVALID_ARGUMENT;
15097 	}
15098 }
15099 
/*
 * Resident-page count of "obj" to attribute to a mapping of
 * "entry_size" pages: if the object is marked all_reusable, only its
 * wired pages are counted; otherwise its resident pages minus the
 * reusable ones.  The result is capped at "entry_size".
 */
#define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
	MIN((entry_size),                                               \
	    ((obj)->all_reusable ?                                      \
	     (obj)->wired_page_count :                                  \
	     (obj)->resident_page_count - (obj)->reusable_page_count))
15105 
/*
 *	Routine:	vm_map_region_top_walk
 *
 *	Fill in a vm_region_top_info structure for the given map entry:
 *	classify its sharing mode and attribute its resident pages as
 *	either private or shared, walking down the entry's shadow chain
 *	when one exists.  Object locks are taken hand-over-hand (child
 *	locked before parent is unlocked) down the chain.
 */
void
vm_map_region_top_walk(vm_map_entry_t entry,vm_region_top_info_t top)15107 vm_map_region_top_walk(
	vm_map_entry_t             entry,
	vm_region_top_info_t       top)
{
	/* submap or unbacked entry: report it as empty */
	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
		top->share_mode = SM_EMPTY;
		top->ref_count = 0;
		top->obj_id = 0;
		return;
	}

	{
		struct  vm_object *obj, *tmp_obj;
		int             ref_count;
		uint32_t        entry_size;

		/* size of the mapping, in pages */
		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);

		obj = VME_OBJECT(entry);

		vm_object_lock(obj);

		/* discount one reference while paging is in progress */
		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}

		assert(obj->reusable_page_count <= obj->resident_page_count);
		if (obj->shadow) {
			/*
			 * The object shadows another, so this is a COW
			 * mapping: top-object pages are private when we hold
			 * the only reference, shared otherwise.
			 */
			if (ref_count == 1) {
				top->private_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			}
			top->ref_count  = ref_count;
			top->share_mode = SM_COW;

			/*
			 * Accumulate resident counts down the shadow chain,
			 * locking each shadow before unlocking its parent.
			 */
			while ((tmp_obj = obj->shadow)) {
				vm_object_lock(tmp_obj);
				vm_object_unlock(obj);
				obj = tmp_obj;

				if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
					ref_count--;
				}

				assert(obj->reusable_page_count <= obj->resident_page_count);
				top->shared_pages_resident +=
				    OBJ_RESIDENT_COUNT(obj, entry_size);
				/* don't count the parent's reference on this shadow */
				top->ref_count += ref_count - 1;
			}
		} else {
			if (entry->superpage_size) {
				top->share_mode = SM_LARGE_PAGE;
				top->shared_pages_resident = 0;
				top->private_pages_resident = entry_size;
			} else if (entry->needs_copy) {
				/* copy still pending: classified as COW */
				top->share_mode = SM_COW;
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				/*
				 * One reference (or two when the object is
				 * "named") means the mapping is private.
				 */
				if (ref_count == 1 ||
				    (ref_count == 2 && obj->named)) {
					top->share_mode = SM_PRIVATE;
					top->private_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				} else {
					top->share_mode = SM_SHARED;
					top->shared_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				}
			}
			top->ref_count = ref_count;
		}
		/* XXX K64: obj_id will be truncated */
		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRPERM(obj);

		vm_object_unlock(obj);
	}
}
15190 
/*
 *	Routine:	vm_map_region_walk
 *
 *	Fill in a vm_region_extended_info structure for "range" bytes of
 *	the given map entry starting at object offset "offset" (user
 *	address "va"): resident/dirty/swapped/reusable page counts,
 *	shadow-chain depth, reference counts and share mode.  When
 *	"look_for_pages" is FALSE, only object/shadow-chain information
 *	is collected.  "count" distinguishes legacy callers whose info
 *	structure lacks the pages_reusable field.
 */
void
vm_map_region_walk(vm_map_t map,vm_map_offset_t va,vm_map_entry_t entry,vm_object_offset_t offset,vm_object_size_t range,vm_region_extended_info_t extended,boolean_t look_for_pages,mach_msg_type_number_t count)15192 vm_map_region_walk(
	vm_map_t                        map,
	vm_map_offset_t                 va,
	vm_map_entry_t                  entry,
	vm_object_offset_t              offset,
	vm_object_size_t                range,
	vm_region_extended_info_t       extended,
	boolean_t                       look_for_pages,
	mach_msg_type_number_t count)
{
	struct vm_object *obj, *tmp_obj;
	vm_map_offset_t       last_offset;
	int               i;
	int               ref_count;
	struct vm_object        *shadow_object;
	unsigned short          shadow_depth;
	boolean_t         do_region_footprint;
	int                     effective_page_size, effective_page_shift;
	vm_map_offset_t         effective_page_mask;

	do_region_footprint = task_self_region_footprint();

	/*
	 * Submaps, unbacked entries and physically-contiguous objects
	 * (other than superpages) are reported as empty.
	 */
	if ((entry->is_sub_map) ||
	    (VME_OBJECT(entry) == 0) ||
	    (VME_OBJECT(entry)->phys_contiguous &&
	    !entry->superpage_size)) {
		extended->share_mode = SM_EMPTY;
		extended->ref_count = 0;
		return;
	}

	/* superpage mappings: everything is considered resident */
	if (entry->superpage_size) {
		extended->shadow_depth = 0;
		extended->share_mode = SM_LARGE_PAGE;
		extended->ref_count = 1;
		extended->external_pager = 0;

		/* TODO4K: Superpage in 4k mode? */
		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
		extended->shadow_depth = 0;
		return;
	}

	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);
	effective_page_mask = effective_page_size - 1;

	offset = vm_map_trunc_page(offset, effective_page_mask);

	obj = VME_OBJECT(entry);

	vm_object_lock(obj);

	/* discount one reference while paging is in progress */
	if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
		ref_count--;
	}

	if (look_for_pages) {
		/* examine each page of the range individually */
		for (last_offset = offset + range;
		    offset < last_offset;
		    offset += effective_page_size, va += effective_page_size) {
			if (do_region_footprint) {
				int disp;

				disp = 0;
				if (map->has_corpse_footprint) {
					/*
					 * Query the page info data we saved
					 * while forking the corpse.
					 */
					vm_map_corpse_footprint_query_page_info(
						map,
						va,
						&disp);
				} else {
					/*
					 * Query the pmap.
					 */
					vm_map_footprint_query_page_info(
						map,
						entry,
						va,
						&disp);
				}
				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
					extended->pages_resident++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
					extended->pages_reusable++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
					extended->pages_dirtied++;
				}
				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
					extended->pages_swapped_out++;
				}
				continue;
			}

			vm_map_region_look_for_page(map, va, obj,
			    vm_object_trunc_page(offset), ref_count,
			    0, extended, count);
		}

		if (do_region_footprint) {
			/*
			 * NOTE: jumps into the "else" block below to also
			 * gather the shadow-chain / pager information.
			 */
			goto collect_object_info;
		}
	} else {
collect_object_info:
		/* walk the shadow chain to measure its depth */
		shadow_object = obj->shadow;
		shadow_depth = 0;

		if (!(obj->internal)) {
			extended->external_pager = 1;
		}

		if (shadow_object != VM_OBJECT_NULL) {
			vm_object_lock(shadow_object);
			for (;
			    shadow_object != VM_OBJECT_NULL;
			    shadow_depth++) {
				vm_object_t     next_shadow;

				if (!(shadow_object->internal)) {
					extended->external_pager = 1;
				}

				/* lock the next shadow before dropping this one */
				next_shadow = shadow_object->shadow;
				if (next_shadow) {
					vm_object_lock(next_shadow);
				}
				vm_object_unlock(shadow_object);
				shadow_object = next_shadow;
			}
		}
		extended->shadow_depth = shadow_depth;
	}

	/* classify the share mode from depth / needs_copy / ref counts */
	if (extended->shadow_depth || entry->needs_copy) {
		extended->share_mode = SM_COW;
	} else {
		if (ref_count == 1) {
			extended->share_mode = SM_PRIVATE;
		} else {
			if (obj->true_share) {
				extended->share_mode = SM_TRUESHARED;
			} else {
				extended->share_mode = SM_SHARED;
			}
		}
	}
	extended->ref_count = ref_count - extended->shadow_depth;

	/* accumulate the references held by the shadow chain */
	for (i = 0; i < extended->shadow_depth; i++) {
		if ((tmp_obj = obj->shadow) == 0) {
			break;
		}
		vm_object_lock(tmp_obj);
		vm_object_unlock(obj);

		if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
			ref_count--;
		}

		extended->ref_count += ref_count;
		obj = tmp_obj;
	}
	vm_object_unlock(obj);

	if (extended->share_mode == SM_SHARED) {
		vm_map_entry_t       cur;
		vm_map_entry_t       last;
		int      my_refs;

		/*
		 * Refine "shared": if all the object's references come from
		 * this map, the sharing is only aliasing within the map.
		 */
		obj = VME_OBJECT(entry);
		last = vm_map_to_entry(map);
		my_refs = 0;

		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}
		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
			my_refs += vm_map_region_count_obj_refs(cur, obj);
		}

		if (my_refs == ref_count) {
			extended->share_mode = SM_PRIVATE_ALIASED;
		} else if (my_refs > 1) {
			extended->share_mode = SM_SHARED_ALIASED;
		}
	}
}
15384 
15385 
15386 /* "object" is locked on entry and still locked on return; any shadow objects visited below it are locked/unlocked hand-over-hand internally */
15387 
15388 
/*
 *	Routine:	vm_map_region_look_for_page
 *
 *	Search for the page at "offset", starting at "object" and
 *	following its shadow chain, updating the "extended" counters
 *	(resident, dirtied, reusable, swapped-out, shared-now-private,
 *	shadow depth) for whatever is found.  "max_refcnt" carries the
 *	largest (paging-adjusted) reference count seen so far along the
 *	chain; "count" gates fields absent from legacy info structures.
 */
static void
vm_map_region_look_for_page(__unused vm_map_t map,__unused vm_map_offset_t va,vm_object_t object,vm_object_offset_t offset,int max_refcnt,unsigned short depth,vm_region_extended_info_t extended,mach_msg_type_number_t count)15390 vm_map_region_look_for_page(
	__unused vm_map_t               map,
	__unused vm_map_offset_t        va,
	vm_object_t                     object,
	vm_object_offset_t              offset,
	int                             max_refcnt,
	unsigned short                  depth,
	vm_region_extended_info_t       extended,
	mach_msg_type_number_t count)
{
	vm_page_t       p;
	vm_object_t     shadow;
	int             ref_count;
	vm_object_t     caller_object;

	shadow = object->shadow;
	caller_object = object;


	while (TRUE) {
		if (!(object->internal)) {
			extended->external_pager = 1;
		}

		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
			/*
			 * Page found in a shadowed object with a single
			 * reference: it would become private on write.
			 */
			if (shadow && (max_refcnt == 1)) {
				extended->pages_shared_now_private++;
			}

			if (!p->vmp_fictitious &&
			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
				extended->pages_dirtied++;
			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
				/* legacy callers have no pages_reusable field */
				if (p->vmp_reusable || object->all_reusable) {
					extended->pages_reusable++;
				}
			}

			extended->pages_resident++;

			/* keep the caller's object locked; drop any other */
			if (object != caller_object) {
				vm_object_unlock(object);
			}

			return;
		}
		if (object->internal &&
		    object->alive &&
		    !object->terminating &&
		    object->pager_ready) {
			if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
			    == VM_EXTERNAL_STATE_EXISTS) {
				/* the pager has that page */
				extended->pages_swapped_out++;
				if (object != caller_object) {
					vm_object_unlock(object);
				}
				return;
			}
		}

		if (shadow) {
			/* descend: lock the shadow before dropping this object */
			vm_object_lock(shadow);

			/* discount one reference while paging is in progress */
			if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
				ref_count--;
			}

			/* record the deepest chain seen across all pages */
			if (++depth > extended->shadow_depth) {
				extended->shadow_depth = depth;
			}

			if (ref_count > max_refcnt) {
				max_refcnt = ref_count;
			}

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			/* translate the offset into the shadow's space */
			offset = offset + object->vo_shadow_offset;
			object = shadow;
			shadow = object->shadow;
			continue;
		}
		if (object != caller_object) {
			vm_object_unlock(object);
		}
		break;
	}
}
15481 
15482 static int
vm_map_region_count_obj_refs(vm_map_entry_t entry,vm_object_t object)15483 vm_map_region_count_obj_refs(
15484 	vm_map_entry_t    entry,
15485 	vm_object_t       object)
15486 {
15487 	int ref_count;
15488 	vm_object_t chk_obj;
15489 	vm_object_t tmp_obj;
15490 
15491 	if (entry->is_sub_map || VME_OBJECT(entry) == VM_OBJECT_NULL) {
15492 		return 0;
15493 	}
15494 
15495 	ref_count = 0;
15496 	chk_obj = VME_OBJECT(entry);
15497 	vm_object_lock(chk_obj);
15498 
15499 	while (chk_obj) {
15500 		if (chk_obj == object) {
15501 			ref_count++;
15502 		}
15503 		tmp_obj = chk_obj->shadow;
15504 		if (tmp_obj) {
15505 			vm_object_lock(tmp_obj);
15506 		}
15507 		vm_object_unlock(chk_obj);
15508 
15509 		chk_obj = tmp_obj;
15510 	}
15511 
15512 	return ref_count;
15513 }
15514 
15515 
15516 /*
15517  *	Routine:	vm_map_simplify
15518  *
15519  *	Description:
15520  *		Attempt to simplify the map representation in
15521  *		the vicinity of the given starting address.
15522  *	Note:
15523  *		This routine is intended primarily to keep the
15524  *		kernel maps more compact -- they generally don't
15525  *		benefit from the "expand a map entry" technology
15526  *		at allocation time because the adjacent entry
15527  *		is often wired down.
15528  */
/*
 *	Routine:	vm_map_simplify_entry
 *
 *	Coalesce "this_entry" with its immediately preceding entry when
 *	the two are adjacent and identical in every attribute that
 *	affects behavior (same object and offset continuation, same
 *	protections, inheritance, wiring, flags, ...).  On success the
 *	previous entry is unlinked, its object/submap reference dropped,
 *	and this_entry is extended downward to cover both ranges.
 *	The caller must hold the map write lock.
 */
void
vm_map_simplify_entry(vm_map_t map,vm_map_entry_t this_entry)15530 vm_map_simplify_entry(
	vm_map_t        map,
	vm_map_entry_t  this_entry)
{
	vm_map_entry_t  prev_entry;

	prev_entry = this_entry->vme_prev;

	/* both entries must be real (not the map header)... */
	if ((this_entry != vm_map_to_entry(map)) &&
	    (prev_entry != vm_map_to_entry(map)) &&

	/* ...contiguous in address space... */
	    (prev_entry->vme_end == this_entry->vme_start) &&

	/* ...backed by the same object at a contiguous offset... */
	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
	    prev_entry->vme_start))
	    == VME_OFFSET(this_entry)) &&

	/* ...and identical in every behavior-affecting attribute */
	    (prev_entry->behavior == this_entry->behavior) &&
	    (prev_entry->needs_copy == this_entry->needs_copy) &&
	    (prev_entry->protection == this_entry->protection) &&
	    (prev_entry->max_protection == this_entry->max_protection) &&
	    (prev_entry->inheritance == this_entry->inheritance) &&
	    (prev_entry->use_pmap == this_entry->use_pmap) &&
	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
	    (prev_entry->no_cache == this_entry->no_cache) &&
	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
	    (prev_entry->map_aligned == this_entry->map_aligned) &&
	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
	    (prev_entry->pmap_cs_associated == this_entry->pmap_cs_associated) &&
	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
	    (prev_entry->vme_resilient_codesign ==
	    this_entry->vme_resilient_codesign) &&
	    (prev_entry->vme_resilient_media ==
	    this_entry->vme_resilient_media) &&
	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&

	    (prev_entry->wired_count == this_entry->wired_count) &&
	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&

	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
	    (prev_entry->in_transition == FALSE) &&
	    (this_entry->in_transition == FALSE) &&
	    (prev_entry->needs_wakeup == FALSE) &&
	    (this_entry->needs_wakeup == FALSE) &&
	    (prev_entry->is_shared == this_entry->is_shared) &&
	    (prev_entry->superpage_size == FALSE) &&
	    (this_entry->superpage_size == FALSE)
	    ) {
		/*
		 * Clear "permanent" on the entry being discarded so its
		 * unlink/dispose below is allowed; this_entry keeps the
		 * permanent attribute for the merged range.
		 */
		if (prev_entry->vme_permanent) {
			assert(this_entry->vme_permanent);
			prev_entry->vme_permanent = false;
		}
		vm_map_store_entry_unlink(map, prev_entry, true);
		assert(prev_entry->vme_start < this_entry->vme_end);
		if (prev_entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
			    VM_MAP_PAGE_MASK(map)));
		}
		/* grow this_entry downward over the removed entry's range */
		this_entry->vme_start = prev_entry->vme_start;
		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));

		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, this_entry, TRUE);
		}

		/* drop the removed entry's reference on its backing store */
		if (prev_entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(prev_entry));
		} else {
			vm_object_deallocate(VME_OBJECT(prev_entry));
		}
		vm_map_entry_dispose(prev_entry);
		SAVE_HINT_MAP_WRITE(map, this_entry);
	}
}
15608 
15609 void
vm_map_simplify(vm_map_t map,vm_map_offset_t start)15610 vm_map_simplify(
15611 	vm_map_t        map,
15612 	vm_map_offset_t start)
15613 {
15614 	vm_map_entry_t  this_entry;
15615 
15616 	vm_map_lock(map);
15617 	if (vm_map_lookup_entry(map, start, &this_entry)) {
15618 		vm_map_simplify_entry(map, this_entry);
15619 		vm_map_simplify_entry(map, this_entry->vme_next);
15620 	}
15621 	vm_map_unlock(map);
15622 }
15623 
15624 static void
vm_map_simplify_range(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15625 vm_map_simplify_range(
15626 	vm_map_t        map,
15627 	vm_map_offset_t start,
15628 	vm_map_offset_t end)
15629 {
15630 	vm_map_entry_t  entry;
15631 
15632 	/*
15633 	 * The map should be locked (for "write") by the caller.
15634 	 */
15635 
15636 	if (start >= end) {
15637 		/* invalid address range */
15638 		return;
15639 	}
15640 
15641 	start = vm_map_trunc_page(start,
15642 	    VM_MAP_PAGE_MASK(map));
15643 	end = vm_map_round_page(end,
15644 	    VM_MAP_PAGE_MASK(map));
15645 
15646 	if (!vm_map_lookup_entry(map, start, &entry)) {
15647 		/* "start" is not mapped and "entry" ends before "start" */
15648 		if (entry == vm_map_to_entry(map)) {
15649 			/* start with first entry in the map */
15650 			entry = vm_map_first_entry(map);
15651 		} else {
15652 			/* start with next entry */
15653 			entry = entry->vme_next;
15654 		}
15655 	}
15656 
15657 	while (entry != vm_map_to_entry(map) &&
15658 	    entry->vme_start <= end) {
15659 		/* try and coalesce "entry" with its previous entry */
15660 		vm_map_simplify_entry(map, entry);
15661 		entry = entry->vme_next;
15662 	}
15663 }
15664 
15665 
15666 /*
15667  *	Routine:	vm_map_machine_attribute
15668  *	Purpose:
15669  *		Provide machine-specific attributes to mappings,
15670  *		such as cachability etc. for machines that provide
15671  *		them.  NUMA architectures and machines with big/strange
15672  *		caches will use this.
15673  *	Note:
15674  *		Responsibilities for locking and checking are handled here,
15675  *		everything else in the pmap module. If any non-volatile
15676  *		information must be kept, the pmap module should handle
15677  *		it itself. [This assumes that attributes do not
15678  *		need to be inherited, which seems ok to me]
15679  */
/*
 *	Routine:	vm_map_machine_attribute (see header comment above)
 *
 *	For MATTR_CACHE the range is walked entry by entry and each
 *	resident page is synced via pmap_attribute_cache_sync(); other
 *	attributes are handed to pmap_attribute() in one call.  The map
 *	write lock is held across the whole operation.
 */
kern_return_t
vm_map_machine_attribute(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_machine_attribute_t attribute,vm_machine_attribute_val_t * value)15681 vm_map_machine_attribute(
	vm_map_t                        map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_machine_attribute_t  attribute,
	vm_machine_attribute_val_t* value)              /* IN/OUT */
{
	kern_return_t   ret;
	vm_map_size_t sync_size;
	vm_map_entry_t entry;

	if (start < vm_map_min(map) || end > vm_map_max(map)) {
		return KERN_INVALID_ADDRESS;
	}

	/* Figure how much memory we need to flush (in page increments) */
	sync_size = end - start;

	vm_map_lock(map);

	if (attribute != MATTR_CACHE) {
		/* If we don't have to find physical addresses, we */
		/* don't have to do an explicit traversal here.    */
		ret = pmap_attribute(map->pmap, start, end - start,
		    attribute, value);
		vm_map_unlock(map);
		return ret;
	}

	ret = KERN_SUCCESS;                                                                             /* Assume it all worked */

	while (sync_size) {
		if (vm_map_lookup_entry(map, start, &entry)) {
			vm_map_size_t   sub_size;
			/* clip this iteration's work to the current entry */
			if ((entry->vme_end - start) > sync_size) {
				sub_size = sync_size;
				sync_size = 0;
			} else {
				sub_size = entry->vme_end - start;
				sync_size -= sub_size;
			}
			if (entry->is_sub_map) {
				vm_map_offset_t sub_start;
				vm_map_offset_t sub_end;

				/*
				 * Recurse into the submap for this portion.
				 * NOTE(review): the recursive call's return
				 * value is discarded — failures in submaps
				 * do not affect "ret"; confirm intentional.
				 */
				sub_start = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				sub_end = sub_start + sub_size;
				vm_map_machine_attribute(
					VME_SUBMAP(entry),
					sub_start,
					sub_end,
					attribute, value);
			} else if (VME_OBJECT(entry)) {
				vm_page_t               m;
				vm_object_t             object;
				vm_object_t             base_object;
				vm_object_t             last_object;
				vm_object_offset_t      offset;
				vm_object_offset_t      base_offset;
				vm_map_size_t           range;
				range = sub_size;
				offset = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				offset = vm_object_trunc_page(offset);
				base_offset = offset;
				object = VME_OBJECT(entry);
				base_object = object;
				last_object = NULL;

				vm_object_lock(object);

				while (range) {
					m = vm_page_lookup(
						object, offset);

					if (m && !m->vmp_fictitious) {
						/* resident page: sync its cache lines */
						ret =
						    pmap_attribute_cache_sync(
							VM_PAGE_GET_PHYS_PAGE(m),
							PAGE_SIZE,
							attribute, value);
					} else if (object->shadow) {
						/*
						 * Not resident here: descend one
						 * level down the shadow chain,
						 * locking the shadow before
						 * dropping the current object.
						 */
						offset = offset + object->vo_shadow_offset;
						last_object = object;
						object = object->shadow;
						vm_object_lock(last_object->shadow);
						vm_object_unlock(last_object);
						continue;
					}
					if (range < PAGE_SIZE) {
						range = 0;
					} else {
						range -= PAGE_SIZE;
					}

					/* pop back up to the entry's top object */
					if (base_object != object) {
						vm_object_unlock(object);
						vm_object_lock(base_object);
						object = base_object;
					}
					/* Bump to the next page */
					base_offset += PAGE_SIZE;
					offset = base_offset;
				}
				vm_object_unlock(object);
			}
			start += sub_size;
		} else {
			/* hole in the range: give up */
			vm_map_unlock(map);
			return KERN_FAILURE;
		}
	}

	vm_map_unlock(map);

	return ret;
}
15799 
15800 /*
15801  *	vm_map_behavior_set:
15802  *
15803  *	Sets the paging reference behavior of the specified address
15804  *	range in the target map.  Paging reference behavior affects
15805  *	how pagein operations resulting from faults on the map will be
15806  *	clustered.
15807  */
/*
 *	Routine:	vm_map_behavior_set (see header comment above)
 *
 *	Persistent behaviors are recorded in the affected entries
 *	(clipping them to the range); the "immediate action" behaviors
 *	are dispatched to their dedicated implementations and return
 *	whatever those return.
 */
kern_return_t
vm_map_behavior_set(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_behavior_t new_behavior)15809 vm_map_behavior_set(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_behavior_t   new_behavior)
{
	vm_map_entry_t  entry;
	vm_map_entry_t  temp_entry;

	if (start > end ||
	    start < vm_map_min(map) ||
	    end > vm_map_max(map)) {
		return KERN_NO_SPACE;
	}

	switch (new_behavior) {
	/*
	 * This first block of behaviors all set a persistent state on the specified
	 * memory range.  All we have to do here is to record the desired behavior
	 * in the vm_map_entry_t's.
	 */

	case VM_BEHAVIOR_DEFAULT:
	case VM_BEHAVIOR_RANDOM:
	case VM_BEHAVIOR_SEQUENTIAL:
	case VM_BEHAVIOR_RSEQNTL:
	case VM_BEHAVIOR_ZERO_WIRED_PAGES:
		vm_map_lock(map);

		/*
		 *	The entire address range must be valid for the map.
		 *      Note that vm_map_range_check() does a
		 *	vm_map_lookup_entry() internally and returns the
		 *	entry containing the start of the address range if
		 *	the entire range is valid.
		 */
		if (vm_map_range_check(map, start, end, &temp_entry)) {
			entry = temp_entry;
			vm_map_clip_start(map, entry, start);
		} else {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
			/* clip so the behavior applies only within [start, end) */
			vm_map_clip_end(map, entry, end);
			if (entry->is_sub_map) {
				assert(!entry->use_pmap);
			}

			if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
				/* this one is a flag, not a "behavior" value */
				entry->zero_wired_pages = TRUE;
			} else {
				entry->behavior = new_behavior;
			}
			entry = entry->vme_next;
		}

		vm_map_unlock(map);
		break;

	/*
	 * The rest of these are different from the above in that they cause
	 * an immediate action to take place as opposed to setting a behavior that
	 * affects future actions.
	 */

	case VM_BEHAVIOR_WILLNEED:
		return vm_map_willneed(map, start, end);

	case VM_BEHAVIOR_DONTNEED:
		return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);

	case VM_BEHAVIOR_FREE:
		return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);

	case VM_BEHAVIOR_REUSABLE:
		return vm_map_reusable_pages(map, start, end);

	case VM_BEHAVIOR_REUSE:
		return vm_map_reuse_pages(map, start, end);

	case VM_BEHAVIOR_CAN_REUSE:
		return vm_map_can_reuse(map, start, end);

#if MACH_ASSERT
	case VM_BEHAVIOR_PAGEOUT:
		return vm_map_pageout(map, start, end);
#endif /* MACH_ASSERT */

	default:
		return KERN_INVALID_ARGUMENT;
	}

	return KERN_SUCCESS;
}
15905 
15906 
15907 /*
15908  * Internals for madvise(MADV_WILLNEED) system call.
15909  *
15910  * The implementation is to do:-
15911  * a) read-ahead if the mapping corresponds to a mapped regular file
15912  * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping
15913  */
15914 
15915 
/*
 *	Routine:	vm_map_willneed (see header comment above)
 *
 *	Iterates over the entries covering [start, end); anonymous
 *	mappings are pre-faulted, file-backed mappings get asynchronous
 *	read-ahead through their pager.  The map read lock is dropped
 *	around the (possibly slow) I/O and re-acquired per entry, so the
 *	range is re-validated each time around the loop.
 */
static kern_return_t
vm_map_willneed(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15917 vm_map_willneed(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end
	)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	memory_object_t                 pager;
	struct vm_object_fault_info     fault_info = {};
	kern_return_t                   kr;
	vm_object_size_t                len;
	vm_object_offset_t              offset;

	fault_info.interruptible = THREAD_UNINT;        /* ignored value */
	fault_info.behavior      = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.stealth       = TRUE;

	/*
	 * The MADV_WILLNEED operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && start < end;) {
		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.  After that, the offset will always be zero to
		 * correspond to the beginning of the current vm_map_entry.
		 */
		offset = (start - entry->vme_start) + VME_OFFSET(entry);

		/*
		 * Set the length so we don't go beyond the end of the
		 * map_entry or beyond the end of the range we were given.
		 * This range could span also multiple map entries all of which
		 * map different files, so make sure we only do the right amount
		 * of I/O for each object.  Note that it's possible for there
		 * to be multiple map entries all referring to the same object
		 * but with different page permissions, but it's not worth
		 * trying to optimize that case.
		 */
		len = MIN(entry->vme_end - start, end - start);

		if ((vm_size_t) len != len) {
			/* 32-bit overflow */
			len = (vm_size_t) (0 - PAGE_SIZE);
		}
		fault_info.cluster_size = (vm_size_t) len;
		fault_info.lo_offset    = offset;
		fault_info.hi_offset    = offset + len;
		fault_info.user_tag     = VME_ALIAS(entry);
		fault_info.pmap_options = 0;
		if (entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap)) {
			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
		}

		/*
		 * If the entry is a submap OR there's no read permission
		 * to this mapping, then just skip it.
		 */
		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
			entry = entry->vme_next;
			start = entry->vme_start;
			continue;
		}

		object = VME_OBJECT(entry);

		if (object == NULL ||
		    (object && object->internal)) {
			/*
			 * Memory range backed by anonymous memory.
			 */
			vm_size_t region_size = 0, effective_page_size = 0;
			vm_map_offset_t addr = 0, effective_page_mask = 0;

			region_size = len;
			addr = start;

			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
			effective_page_size = effective_page_mask + 1;

			/* unlock before faulting; range is re-checked below */
			vm_map_unlock_read(map);

			/* fault in each page of the anonymous range */
			while (region_size) {
				vm_pre_fault(
					vm_map_trunc_page(addr, effective_page_mask),
					VM_PROT_READ | VM_PROT_WRITE);

				region_size -= effective_page_size;
				addr += effective_page_size;
			}
		} else {
			/*
			 * Find the file object backing this map entry.  If there is
			 * none, then we simply ignore the "will need" advice for this
			 * entry and go on to the next one.
			 */
			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
				entry = entry->vme_next;
				start = entry->vme_start;
				continue;
			}

			/* hold a paging reference so the pager stays valid */
			vm_object_paging_begin(object);
			pager = object->pager;
			vm_object_unlock(object);

			/*
			 * The data_request() could take a long time, so let's
			 * release the map lock to avoid blocking other threads.
			 */
			vm_map_unlock_read(map);

			/*
			 * Get the data from the object asynchronously.
			 *
			 * Note that memory_object_data_request() places limits on the
			 * amount of I/O it will do.  Regardless of the len we
			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
			 * silently truncates the len to that size.  This isn't
			 * necessarily bad since madvise shouldn't really be used to
			 * page in unlimited amounts of data.  Other Unix variants
			 * limit the willneed case as well.  If this turns out to be an
			 * issue for developers, then we can always adjust the policy
			 * here and still be backwards compatible since this is all
			 * just "advice".
			 */
			kr = memory_object_data_request(
				pager,
				vm_object_trunc_page(offset) + object->paging_offset,
				0,      /* ignored */
				VM_PROT_READ,
				(memory_object_fault_info_t)&fault_info);

			vm_object_lock(object);
			vm_object_paging_end(object);
			vm_object_unlock(object);

			/*
			 * If we couldn't do the I/O for some reason, just give up on
			 * the madvise.  We still return success to the user since
			 * madvise isn't supposed to fail when the advice can't be
			 * taken.
			 */

			if (kr != KERN_SUCCESS) {
				return KERN_SUCCESS;
			}
		}

		start += len;
		if (start >= end) {
			/* done */
			return KERN_SUCCESS;
		}

		/* look up next entry */
		vm_map_lock_read(map);
		if (!vm_map_lookup_entry(map, start, &entry)) {
			/*
			 * There's a new hole in the address range.
			 */
			vm_map_unlock_read(map);
			return KERN_INVALID_ADDRESS;
		}
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
16106 
16107 static boolean_t
vm_map_entry_is_reusable(vm_map_entry_t entry)16108 vm_map_entry_is_reusable(
16109 	vm_map_entry_t entry)
16110 {
16111 	/* Only user map entries */
16112 
16113 	vm_object_t object;
16114 
16115 	if (entry->is_sub_map) {
16116 		return FALSE;
16117 	}
16118 
16119 	switch (VME_ALIAS(entry)) {
16120 	case VM_MEMORY_MALLOC:
16121 	case VM_MEMORY_MALLOC_SMALL:
16122 	case VM_MEMORY_MALLOC_LARGE:
16123 	case VM_MEMORY_REALLOC:
16124 	case VM_MEMORY_MALLOC_TINY:
16125 	case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16126 	case VM_MEMORY_MALLOC_LARGE_REUSED:
16127 		/*
16128 		 * This is a malloc() memory region: check if it's still
16129 		 * in its original state and can be re-used for more
16130 		 * malloc() allocations.
16131 		 */
16132 		break;
16133 	default:
16134 		/*
16135 		 * Not a malloc() memory region: let the caller decide if
16136 		 * it's re-usable.
16137 		 */
16138 		return TRUE;
16139 	}
16140 
16141 	if (/*entry->is_shared ||*/
16142 		entry->is_sub_map ||
16143 		entry->in_transition ||
16144 		entry->protection != VM_PROT_DEFAULT ||
16145 		entry->max_protection != VM_PROT_ALL ||
16146 		entry->inheritance != VM_INHERIT_DEFAULT ||
16147 		entry->no_cache ||
16148 		entry->vme_permanent ||
16149 		entry->superpage_size != FALSE ||
16150 		entry->zero_wired_pages ||
16151 		entry->wired_count != 0 ||
16152 		entry->user_wired_count != 0) {
16153 		return FALSE;
16154 	}
16155 
16156 	object = VME_OBJECT(entry);
16157 	if (object == VM_OBJECT_NULL) {
16158 		return TRUE;
16159 	}
16160 	if (
16161 #if 0
16162 		/*
16163 		 * Let's proceed even if the VM object is potentially
16164 		 * shared.
16165 		 * We check for this later when processing the actual
16166 		 * VM pages, so the contents will be safe if shared.
16167 		 *
16168 		 * But we can still mark this memory region as "reusable" to
16169 		 * acknowledge that the caller did let us know that the memory
16170 		 * could be re-used and should not be penalized for holding
16171 		 * on to it.  This allows its "resident size" to not include
16172 		 * the reusable range.
16173 		 */
16174 		object->ref_count == 1 &&
16175 #endif
16176 		object->wired_page_count == 0 &&
16177 		object->copy == VM_OBJECT_NULL &&
16178 		object->shadow == VM_OBJECT_NULL &&
16179 		object->internal &&
16180 		object->purgable == VM_PURGABLE_DENY &&
16181 		object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16182 		!object->code_signed) {
16183 		return TRUE;
16184 	}
16185 	return FALSE;
16186 }
16187 
/*
 *	vm_map_reuse_pages:
 *
 *	Implements the MADV_FREE_REUSE side of the reuse protocol: for every
 *	entry overlapping [start, end), re-activate the previously "reusable"
 *	pages of the backing VM object (vm_object_reuse_pages) and flip the
 *	MALLOC_LARGE_REUSABLE alias back to MALLOC_LARGE_REUSED.
 *
 *	Returns KERN_INVALID_ADDRESS if the range contains a hole or an entry
 *	that fails vm_map_entry_is_reusable(); KERN_SUCCESS otherwise.
 *	Success/failure is also tallied in vm_page_stats_reusable.
 */
static kern_return_t
vm_map_reuse_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	vm_object_offset_t              start_offset, end_offset;

	/*
	 * The MADV_REUSE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a
		 * portion of a native page.
		 */
		return KERN_SUCCESS;
	}

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap);       /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reuse_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reuse_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
		} else {
			start_offset = 0;
		}
		/* clip to the end of the request, then rebase into object offsets */
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object != VM_OBJECT_NULL) {
			/* re-activate the "reusable" pages in this sub-range */
			vm_object_lock(object);
			vm_object_reuse_pages(object, start_offset, end_offset,
			    TRUE);
			vm_object_unlock(object);
		}

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reuse_pages_success++;
	return KERN_SUCCESS;
}
16280 
16281 
/*
 *	vm_map_reusable_pages:
 *
 *	Implements the MADV_FREE_REUSABLE side of the reuse protocol: for
 *	every entry overlapping [start, end), deactivate the backing pages
 *	(vm_object_deactivate_pages with reusable_pages == TRUE) so they can
 *	be reclaimed cheaply, and flip MALLOC_LARGE / MALLOC_LARGE_REUSED
 *	aliases to MALLOC_LARGE_REUSABLE.
 *
 *	Returns:
 *	  KERN_INVALID_ADDRESS    range has a hole or a non-reusable entry
 *	  KERN_PROTECTION_FAILURE entry is not writable (and not JIT)
 *	  KERN_SUCCESS            otherwise (also for sub-PAGE_SIZE maps,
 *	                          where the operation is skipped entirely)
 */
static kern_return_t
vm_map_reusable_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	vm_object_offset_t              start_offset, end_offset;
	vm_map_offset_t                 pmap_offset;

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a portion
		 * of a native page.
		 */
		return KERN_SUCCESS;
	}

	/*
	 * The MADV_REUSABLE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap);       /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reusable_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/* 1 = discard page contents, -1 = object is shared, skip */
		int kill_pages = 0;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit) {
			/* not writable: can't discard contents */
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_nonwritable++;
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
			pmap_offset = start;
		} else {
			start_offset = 0;
			pmap_offset = entry->vme_start;
		}
		/* clip to the end of the request, then rebase into object offsets */
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			/* nothing resident to deactivate */
			continue;
		}


		vm_object_lock(object);
		/*
		 * Only discard page contents if we effectively "own" the
		 * object: either it has a single reference, or its copy
		 * strategy guarantees no symmetric copy-on-write sharing.
		 */
		if (((object->ref_count == 1) ||
		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
		    object->copy == VM_OBJECT_NULL)) &&
		    object->shadow == VM_OBJECT_NULL &&
		    /*
		     * "iokit_acct" entries are billed for their virtual size
		     * (rather than for their resident pages only), so they
		     * wouldn't benefit from making pages reusable, and it
		     * would be hard to keep track of pages that are both
		     * "iokit_acct" and "reusable" in the pmap stats and
		     * ledgers.
		     */
		    !(entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap))) {
			if (object->ref_count != 1) {
				vm_page_stats_reusable.reusable_shared++;
			}
			kill_pages = 1;
		} else {
			kill_pages = -1;
		}
		if (kill_pages != -1) {
			vm_object_deactivate_pages(object,
			    start_offset,
			    end_offset - start_offset,
			    kill_pages,
			    TRUE /*reusable_pages*/,
			    map->pmap,
			    pmap_offset);
		} else {
			/* shared object: record the skip for diagnostics */
			vm_page_stats_reusable.reusable_pages_shared++;
			DTRACE_VM4(vm_map_reusable_pages_shared,
			    unsigned int, VME_ALIAS(entry),
			    vm_map_t, map,
			    vm_map_entry_t, entry,
			    vm_object_t, object);
		}
		vm_object_unlock(object);

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reusable_pages_success++;
	return KERN_SUCCESS;
}
16426 
16427 
16428 static kern_return_t
vm_map_can_reuse(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16429 vm_map_can_reuse(
16430 	vm_map_t        map,
16431 	vm_map_offset_t start,
16432 	vm_map_offset_t end)
16433 {
16434 	vm_map_entry_t                  entry;
16435 
16436 	/*
16437 	 * The MADV_REUSABLE operation doesn't require any changes to the
16438 	 * vm_map_entry_t's, so the read lock is sufficient.
16439 	 */
16440 
16441 	vm_map_lock_read(map);
16442 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16443 
16444 	/*
16445 	 * The madvise semantics require that the address range be fully
16446 	 * allocated with no holes.  Otherwise, we're required to return
16447 	 * an error.
16448 	 */
16449 
16450 	if (!vm_map_range_check(map, start, end, &entry)) {
16451 		vm_map_unlock_read(map);
16452 		vm_page_stats_reusable.can_reuse_failure++;
16453 		return KERN_INVALID_ADDRESS;
16454 	}
16455 
16456 	/*
16457 	 * Examine each vm_map_entry_t in the range.
16458 	 */
16459 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16460 	    entry = entry->vme_next) {
16461 		/*
16462 		 * Sanity check on the VM map entry.
16463 		 */
16464 		if (!vm_map_entry_is_reusable(entry)) {
16465 			vm_map_unlock_read(map);
16466 			vm_page_stats_reusable.can_reuse_failure++;
16467 			return KERN_INVALID_ADDRESS;
16468 		}
16469 	}
16470 
16471 	vm_map_unlock_read(map);
16472 	vm_page_stats_reusable.can_reuse_success++;
16473 	return KERN_SUCCESS;
16474 }
16475 
16476 
16477 #if MACH_ASSERT
/*
 *	vm_map_pageout:
 *
 *	(MACH_ASSERT builds only.)  For every entry overlapping
 *	[start, end), push the backing internal VM object out via
 *	vm_object_pageout().  Submaps are handled one level deep: the
 *	corresponding range in the submap is looked up and its object
 *	paged out; nested submaps inside a submap are skipped.
 *
 *	Returns KERN_INVALID_ADDRESS if the range (or the equivalent
 *	submap range) contains a hole, KERN_SUCCESS otherwise.
 */
static kern_return_t
vm_map_pageout(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;

	/*
	 * The MADV_PAGEOUT operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		vm_object_t     object;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (entry->is_sub_map) {
			vm_map_t submap;
			vm_map_offset_t submap_start;
			vm_map_offset_t submap_end;
			vm_map_entry_t submap_entry;

			/* translate this entry's range into submap coordinates */
			submap = VME_SUBMAP(entry);
			submap_start = VME_OFFSET(entry);
			submap_end = submap_start + (entry->vme_end -
			    entry->vme_start);

			vm_map_lock_read(submap);

			if (!vm_map_range_check(submap,
			    submap_start,
			    submap_end,
			    &submap_entry)) {
				/* hole inside the submap: fail the whole call */
				vm_map_unlock_read(submap);
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}

			if (submap_entry->is_sub_map) {
				/* don't recurse more than one level deep */
				vm_map_unlock_read(submap);
				continue;
			}

			object = VME_OBJECT(submap_entry);
			if (object == VM_OBJECT_NULL || !object->internal) {
				/* only internal (anonymous) objects are paged out */
				vm_map_unlock_read(submap);
				continue;
			}

			vm_object_pageout(object);

			vm_map_unlock_read(submap);
			submap = VM_MAP_NULL;
			submap_entry = VM_MAP_ENTRY_NULL;
			continue;
		}

		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL || !object->internal) {
			/* only internal (anonymous) objects are paged out */
			continue;
		}

		vm_object_pageout(object);
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
16566 #endif /* MACH_ASSERT */
16567 
16568 
16569 /*
16570  *	Routine:	vm_map_entry_insert
16571  *
16572  *	Description:	This routine inserts a new vm_entry in a locked map.
16573  */
/*
 *	vm_map_entry_insert:
 *
 *	Allocate a new vm_map_entry for [start, end), populate it from the
 *	supplied attributes, and link it into "map" after "insp_entry".
 *	The map must be locked exclusively by the caller.
 *
 *	If vmk_flags.vmkf_submap is set, "object" is actually a vm_map_t
 *	and the entry becomes a submap mapping; otherwise it maps "object"
 *	at "offset".  Returns the newly inserted entry.
 */
static vm_map_entry_t
vm_map_entry_insert(
	vm_map_t                map,
	vm_map_entry_t          insp_entry,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_map_kernel_flags_t   vmk_flags,
	boolean_t               needs_copy,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance,
	boolean_t               no_cache,
	boolean_t               permanent,
	unsigned int            superpage_size,
	boolean_t               clear_map_aligned,
	int                     alias)
{
	vm_map_entry_t  new_entry;
	boolean_t map_aligned = FALSE;

	assert(insp_entry != (vm_map_entry_t)0);
	vm_map_lock_assert_exclusive(map);

#if DEVELOPMENT || DEBUG
	/* catch size+offset wrap-around early on debug kernels */
	vm_object_offset_t      end_offset = 0;
	assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
#endif /* DEVELOPMENT || DEBUG */

	/*
	 * Decide whether the entry is aligned to the map's (possibly
	 * sub-PAGE_SIZE) page size, then sanity-check the alignment
	 * that was decided on.
	 */
	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
		map_aligned = TRUE;
	}
	if (clear_map_aligned &&
	    (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
	    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
		map_aligned = FALSE;
	}
	if (map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
		assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
	} else {
		assert(page_aligned(start));
		assert(page_aligned(end));
	}
	assert(start < end);

	new_entry = vm_map_entry_create(map);

	new_entry->vme_start = start;
	new_entry->vme_end = end;

	if (vmk_flags.vmkf_submap) {
		/* "object" is really a vm_map_t when mapping a submap */
		new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
		VME_SUBMAP_SET(new_entry, (vm_map_t)object);
	} else {
		VME_OBJECT_SET(new_entry, object, false, 0);
	}
	VME_OFFSET_SET(new_entry, offset);
	VME_ALIAS_SET(new_entry, alias);

	new_entry->map_aligned = map_aligned;
	new_entry->needs_copy = needs_copy;
	new_entry->inheritance = inheritance;
	new_entry->protection = cur_protection;
	new_entry->max_protection = max_protection;
	/*
	 * submap: "use_pmap" means "nested".
	 * default: false.
	 *
	 * object: "use_pmap" means "use pmap accounting" for footprint.
	 * default: true.
	 */
	new_entry->use_pmap = !vmk_flags.vmkf_submap;
	new_entry->no_cache = no_cache;
	new_entry->vme_permanent = permanent;
	new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
	new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
	new_entry->superpage_size = (superpage_size != 0);

	if (vmk_flags.vmkf_map_jit) {
		/* only one JIT entry per map unless policy allows multiple */
		if (!(map->jit_entry_exists) ||
		    VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
			new_entry->used_for_jit = TRUE;
			map->jit_entry_exists = TRUE;
		}
	}

	/*
	 *	Insert the new entry into the list.
	 */

	vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
	map->size += end - start;

	/*
	 *	Update the free space hint and the lookup hint.
	 */

	SAVE_HINT_MAP_WRITE(map, new_entry);
	return new_entry;
}
16676 
16677 /*
16678  *	Routine:	vm_map_remap_extract
16679  *
16680  *	Description:	This routine returns a vm_entry list from a map.
16681  */
16682 static kern_return_t
vm_map_remap_extract(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size,boolean_t copy,struct vm_map_header * map_header,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)16683 vm_map_remap_extract(
16684 	vm_map_t                map,
16685 	vm_map_offset_t         addr,
16686 	vm_map_size_t           size,
16687 	boolean_t               copy,
16688 	struct vm_map_header    *map_header,
16689 	vm_prot_t               *cur_protection,   /* IN/OUT */
16690 	vm_prot_t               *max_protection,   /* IN/OUT */
16691 	/* What, no behavior? */
16692 	vm_inherit_t            inheritance,
16693 	vm_map_kernel_flags_t   vmk_flags)
16694 {
16695 	kern_return_t           result;
16696 	vm_map_size_t           mapped_size;
16697 	vm_map_size_t           tmp_size;
16698 	vm_map_entry_t          src_entry;     /* result of last map lookup */
16699 	vm_map_entry_t          new_entry;
16700 	vm_object_offset_t      offset;
16701 	vm_map_offset_t         map_address;
16702 	vm_map_offset_t         src_start;     /* start of entry to map */
16703 	vm_map_offset_t         src_end;       /* end of region to be mapped */
16704 	vm_object_t             object;
16705 	vm_map_version_t        version;
16706 	boolean_t               src_needs_copy;
16707 	boolean_t               new_entry_needs_copy;
16708 	vm_map_entry_t          saved_src_entry;
16709 	boolean_t               src_entry_was_wired;
16710 	vm_prot_t               max_prot_for_prot_copy;
16711 	vm_map_offset_t         effective_page_mask;
16712 	boolean_t               pageable, same_map;
16713 	boolean_t               vm_remap_legacy;
16714 	vm_prot_t               required_cur_prot, required_max_prot;
16715 	vm_object_t             new_copy_object;     /* vm_object_copy_* result */
16716 	boolean_t               saved_used_for_jit;     /* Saved used_for_jit. */
16717 
16718 	pageable = vmk_flags.vmkf_copy_pageable;
16719 	same_map = vmk_flags.vmkf_copy_same_map;
16720 
16721 	effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
16722 
16723 	assert(map != VM_MAP_NULL);
16724 	assert(size != 0);
16725 	assert(size == vm_map_round_page(size, effective_page_mask));
16726 	assert(inheritance == VM_INHERIT_NONE ||
16727 	    inheritance == VM_INHERIT_COPY ||
16728 	    inheritance == VM_INHERIT_SHARE);
16729 	assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
16730 	assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
16731 	assert((*cur_protection & *max_protection) == *cur_protection);
16732 
16733 	/*
16734 	 *	Compute start and end of region.
16735 	 */
16736 	src_start = vm_map_trunc_page(addr, effective_page_mask);
16737 	src_end = vm_map_round_page(src_start + size, effective_page_mask);
16738 
16739 	/*
16740 	 *	Initialize map_header.
16741 	 */
16742 	map_header->links.next = CAST_TO_VM_MAP_ENTRY(&map_header->links);
16743 	map_header->links.prev = CAST_TO_VM_MAP_ENTRY(&map_header->links);
16744 	map_header->nentries = 0;
16745 	map_header->entries_pageable = pageable;
16746 //	map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
16747 	map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
16748 	map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
16749 
16750 	vm_map_store_init( map_header );
16751 
16752 	if (copy && vmk_flags.vmkf_remap_prot_copy) {
16753 		/*
16754 		 * Special case for vm_map_protect(VM_PROT_COPY):
16755 		 * we want to set the new mappings' max protection to the
16756 		 * specified *max_protection...
16757 		 */
16758 		max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
16759 		/* ... but we want to use the vm_remap() legacy mode */
16760 		*max_protection = VM_PROT_NONE;
16761 		*cur_protection = VM_PROT_NONE;
16762 	} else {
16763 		max_prot_for_prot_copy = VM_PROT_NONE;
16764 	}
16765 
16766 	if (*cur_protection == VM_PROT_NONE &&
16767 	    *max_protection == VM_PROT_NONE) {
16768 		/*
16769 		 * vm_remap() legacy mode:
16770 		 * Extract all memory regions in the specified range and
16771 		 * collect the strictest set of protections allowed on the
16772 		 * entire range, so the caller knows what they can do with
16773 		 * the remapped range.
16774 		 * We start with VM_PROT_ALL and we'll remove the protections
16775 		 * missing from each memory region.
16776 		 */
16777 		vm_remap_legacy = TRUE;
16778 		*cur_protection = VM_PROT_ALL;
16779 		*max_protection = VM_PROT_ALL;
16780 		required_cur_prot = VM_PROT_NONE;
16781 		required_max_prot = VM_PROT_NONE;
16782 	} else {
16783 		/*
16784 		 * vm_remap_new() mode:
16785 		 * Extract all memory regions in the specified range and
16786 		 * ensure that they have at least the protections specified
16787 		 * by the caller via *cur_protection and *max_protection.
16788 		 * The resulting mapping should have these protections.
16789 		 */
16790 		vm_remap_legacy = FALSE;
16791 		if (copy) {
16792 			required_cur_prot = VM_PROT_NONE;
16793 			required_max_prot = VM_PROT_READ;
16794 		} else {
16795 			required_cur_prot = *cur_protection;
16796 			required_max_prot = *max_protection;
16797 		}
16798 	}
16799 
16800 	map_address = 0;
16801 	mapped_size = 0;
16802 	result = KERN_SUCCESS;
16803 
16804 	/*
16805 	 *	The specified source virtual space might correspond to
16806 	 *	multiple map entries, need to loop on them.
16807 	 */
16808 	vm_map_lock(map);
16809 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16810 		/*
16811 		 * This address space uses sub-pages so the range might
16812 		 * not be re-mappable in an address space with larger
16813 		 * pages. Re-assemble any broken-up VM map entries to
16814 		 * improve our chances of making it work.
16815 		 */
16816 		vm_map_simplify_range(map, src_start, src_end);
16817 	}
16818 	while (mapped_size != size) {
16819 		vm_map_size_t   entry_size;
16820 
16821 		/*
16822 		 *	Find the beginning of the region.
16823 		 */
16824 		if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
16825 			result = KERN_INVALID_ADDRESS;
16826 			break;
16827 		}
16828 
16829 		if (src_start < src_entry->vme_start ||
16830 		    (mapped_size && src_start != src_entry->vme_start)) {
16831 			result = KERN_INVALID_ADDRESS;
16832 			break;
16833 		}
16834 
16835 		tmp_size = size - mapped_size;
16836 		if (src_end > src_entry->vme_end) {
16837 			tmp_size -= (src_end - src_entry->vme_end);
16838 		}
16839 
16840 		entry_size = (vm_map_size_t)(src_entry->vme_end -
16841 		    src_entry->vme_start);
16842 
16843 		if (src_entry->is_sub_map &&
16844 		    vmk_flags.vmkf_copy_single_object) {
16845 			vm_map_t submap;
16846 			vm_map_offset_t submap_start;
16847 			vm_map_size_t submap_size;
16848 			boolean_t submap_needs_copy;
16849 
16850 			/*
16851 			 * No check for "required protection" on "src_entry"
16852 			 * because the protections that matter are the ones
16853 			 * on the submap's VM map entry, which will be checked
16854 			 * during the call to vm_map_remap_extract() below.
16855 			 */
16856 			submap_size = src_entry->vme_end - src_start;
16857 			if (submap_size > size) {
16858 				submap_size = size;
16859 			}
16860 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
16861 			submap = VME_SUBMAP(src_entry);
16862 			if (copy) {
16863 				/*
16864 				 * The caller wants a copy-on-write re-mapping,
16865 				 * so let's extract from the submap accordingly.
16866 				 */
16867 				submap_needs_copy = TRUE;
16868 			} else if (src_entry->needs_copy) {
16869 				/*
16870 				 * The caller wants a shared re-mapping but the
16871 				 * submap is mapped with "needs_copy", so its
16872 				 * contents can't be shared as is. Extract the
16873 				 * contents of the submap as "copy-on-write".
16874 				 * The re-mapping won't be shared with the
16875 				 * original mapping but this is equivalent to
16876 				 * what happened with the original "remap from
16877 				 * submap" code.
16878 				 * The shared region is mapped "needs_copy", for
16879 				 * example.
16880 				 */
16881 				submap_needs_copy = TRUE;
16882 			} else {
16883 				/*
16884 				 * The caller wants a shared re-mapping and
16885 				 * this mapping can be shared (no "needs_copy"),
16886 				 * so let's extract from the submap accordingly.
16887 				 * Kernel submaps are mapped without
16888 				 * "needs_copy", for example.
16889 				 */
16890 				submap_needs_copy = FALSE;
16891 			}
16892 			vm_map_reference(submap);
16893 			vm_map_unlock(map);
16894 			src_entry = NULL;
16895 			if (vm_remap_legacy) {
16896 				*cur_protection = VM_PROT_NONE;
16897 				*max_protection = VM_PROT_NONE;
16898 			}
16899 
16900 			DTRACE_VM7(remap_submap_recurse,
16901 			    vm_map_t, map,
16902 			    vm_map_offset_t, addr,
16903 			    vm_map_size_t, size,
16904 			    boolean_t, copy,
16905 			    vm_map_offset_t, submap_start,
16906 			    vm_map_size_t, submap_size,
16907 			    boolean_t, submap_needs_copy);
16908 
16909 			result = vm_map_remap_extract(submap,
16910 			    submap_start,
16911 			    submap_size,
16912 			    submap_needs_copy,
16913 			    map_header,
16914 			    cur_protection,
16915 			    max_protection,
16916 			    inheritance,
16917 			    vmk_flags);
16918 			vm_map_deallocate(submap);
16919 			return result;
16920 		}
16921 
16922 		if (src_entry->is_sub_map) {
16923 			/* protections for submap mapping are irrelevant here */
16924 		} else if (((src_entry->protection & required_cur_prot) !=
16925 		    required_cur_prot) ||
16926 		    ((src_entry->max_protection & required_max_prot) !=
16927 		    required_max_prot)) {
16928 			if (vmk_flags.vmkf_copy_single_object &&
16929 			    mapped_size != 0) {
16930 				/*
16931 				 * Single object extraction.
16932 				 * We can't extract more with the required
16933 				 * protection but we've extracted some, so
16934 				 * stop there and declare success.
16935 				 * The caller should check the size of
16936 				 * the copy entry we've extracted.
16937 				 */
16938 				result = KERN_SUCCESS;
16939 			} else {
16940 				/*
16941 				 * VM range extraction.
16942 				 * Required proctection is not available
16943 				 * for this part of the range: fail.
16944 				 */
16945 				result = KERN_PROTECTION_FAILURE;
16946 			}
16947 			break;
16948 		}
16949 
16950 		if (src_entry->is_sub_map) {
16951 			vm_map_t submap;
16952 			vm_map_offset_t submap_start;
16953 			vm_map_size_t submap_size;
16954 			vm_map_copy_t submap_copy;
16955 			vm_prot_t submap_curprot, submap_maxprot;
16956 			boolean_t submap_needs_copy;
16957 
16958 			/*
16959 			 * No check for "required protection" on "src_entry"
16960 			 * because the protections that matter are the ones
16961 			 * on the submap's VM map entry, which will be checked
16962 			 * during the call to vm_map_copy_extract() below.
16963 			 */
16964 			object = VM_OBJECT_NULL;
16965 			submap_copy = VM_MAP_COPY_NULL;
16966 
16967 			/* find equivalent range in the submap */
16968 			submap = VME_SUBMAP(src_entry);
16969 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
16970 			submap_size = tmp_size;
16971 			if (copy) {
16972 				/*
16973 				 * The caller wants a copy-on-write re-mapping,
16974 				 * so let's extract from the submap accordingly.
16975 				 */
16976 				submap_needs_copy = TRUE;
16977 			} else if (src_entry->needs_copy) {
16978 				/*
16979 				 * The caller wants a shared re-mapping but the
16980 				 * submap is mapped with "needs_copy", so its
16981 				 * contents can't be shared as is. Extract the
16982 				 * contents of the submap as "copy-on-write".
16983 				 * The re-mapping won't be shared with the
16984 				 * original mapping but this is equivalent to
16985 				 * what happened with the original "remap from
16986 				 * submap" code.
16987 				 * The shared region is mapped "needs_copy", for
16988 				 * example.
16989 				 */
16990 				submap_needs_copy = TRUE;
16991 			} else {
16992 				/*
16993 				 * The caller wants a shared re-mapping and
16994 				 * this mapping can be shared (no "needs_copy"),
16995 				 * so let's extract from the submap accordingly.
16996 				 * Kernel submaps are mapped without
16997 				 * "needs_copy", for example.
16998 				 */
16999 				submap_needs_copy = FALSE;
17000 			}
17001 			/* extra ref to keep submap alive */
17002 			vm_map_reference(submap);
17003 
17004 			DTRACE_VM7(remap_submap_recurse,
17005 			    vm_map_t, map,
17006 			    vm_map_offset_t, addr,
17007 			    vm_map_size_t, size,
17008 			    boolean_t, copy,
17009 			    vm_map_offset_t, submap_start,
17010 			    vm_map_size_t, submap_size,
17011 			    boolean_t, submap_needs_copy);
17012 
17013 			/*
17014 			 * The map can be safely unlocked since we
17015 			 * already hold a reference on the submap.
17016 			 *
17017 			 * No timestamp since we don't care if the map
17018 			 * gets modified while we're down in the submap.
17019 			 * We'll resume the extraction at src_start + tmp_size
17020 			 * anyway.
17021 			 */
17022 			vm_map_unlock(map);
17023 			src_entry = NULL; /* not valid once map is unlocked */
17024 
17025 			if (vm_remap_legacy) {
17026 				submap_curprot = VM_PROT_NONE;
17027 				submap_maxprot = VM_PROT_NONE;
17028 				if (max_prot_for_prot_copy) {
17029 					submap_maxprot = max_prot_for_prot_copy;
17030 				}
17031 			} else {
17032 				assert(!max_prot_for_prot_copy);
17033 				submap_curprot = *cur_protection;
17034 				submap_maxprot = *max_protection;
17035 			}
17036 			result = vm_map_copy_extract(submap,
17037 			    submap_start,
17038 			    submap_size,
17039 			    submap_needs_copy,
17040 			    &submap_copy,
17041 			    &submap_curprot,
17042 			    &submap_maxprot,
17043 			    inheritance,
17044 			    vmk_flags);
17045 
17046 			/* release extra ref on submap */
17047 			vm_map_deallocate(submap);
17048 			submap = VM_MAP_NULL;
17049 
17050 			if (result != KERN_SUCCESS) {
17051 				vm_map_lock(map);
17052 				break;
17053 			}
17054 
17055 			/* transfer submap_copy entries to map_header */
17056 			while (vm_map_copy_first_entry(submap_copy) !=
17057 			    vm_map_copy_to_entry(submap_copy)) {
17058 				vm_map_entry_t copy_entry;
17059 				vm_map_size_t copy_entry_size;
17060 
17061 				copy_entry = vm_map_copy_first_entry(submap_copy);
17062 
17063 				/*
17064 				 * Prevent kernel_object from being exposed to
17065 				 * user space.
17066 				 */
17067 				if (__improbable(copy_entry->vme_kernel_object)) {
17068 					printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17069 					    proc_selfpid(),
17070 					    (get_bsdtask_info(current_task())
17071 					    ? proc_name_address(get_bsdtask_info(current_task()))
17072 					    : "?"));
17073 					DTRACE_VM(extract_kernel_only);
17074 					result = KERN_INVALID_RIGHT;
17075 					vm_map_copy_discard(submap_copy);
17076 					submap_copy = VM_MAP_COPY_NULL;
17077 					vm_map_lock(map);
17078 					break;
17079 				}
17080 
17081 				vm_map_copy_entry_unlink(submap_copy, copy_entry);
17082 				copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17083 				copy_entry->vme_start = map_address;
17084 				copy_entry->vme_end = map_address + copy_entry_size;
17085 				map_address += copy_entry_size;
17086 				mapped_size += copy_entry_size;
17087 				src_start += copy_entry_size;
17088 				assert(src_start <= src_end);
17089 				_vm_map_store_entry_link(map_header,
17090 				    map_header->links.prev,
17091 				    copy_entry);
17092 			}
17093 			/* done with submap_copy */
17094 			vm_map_copy_discard(submap_copy);
17095 
17096 			if (vm_remap_legacy) {
17097 				*cur_protection &= submap_curprot;
17098 				*max_protection &= submap_maxprot;
17099 			}
17100 
17101 			/* re-acquire the map lock and continue to next entry */
17102 			vm_map_lock(map);
17103 			continue;
17104 		} else {
17105 			object = VME_OBJECT(src_entry);
17106 
17107 			/*
17108 			 * Prevent kernel_object from being exposed to
17109 			 * user space.
17110 			 */
17111 			if (__improbable(object == kernel_object)) {
17112 				printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17113 				    proc_selfpid(),
17114 				    (get_bsdtask_info(current_task())
17115 				    ? proc_name_address(get_bsdtask_info(current_task()))
17116 				    : "?"));
17117 				DTRACE_VM(extract_kernel_only);
17118 				result = KERN_INVALID_RIGHT;
17119 				break;
17120 			}
17121 
17122 			if (src_entry->iokit_acct) {
17123 				/*
17124 				 * This entry uses "IOKit accounting".
17125 				 */
17126 			} else if (object != VM_OBJECT_NULL &&
17127 			    (object->purgable != VM_PURGABLE_DENY ||
17128 			    object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17129 				/*
17130 				 * Purgeable objects have their own accounting:
17131 				 * no pmap accounting for them.
17132 				 */
17133 				assertf(!src_entry->use_pmap,
17134 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17135 				    map,
17136 				    src_entry,
17137 				    (uint64_t)src_entry->vme_start,
17138 				    (uint64_t)src_entry->vme_end,
17139 				    src_entry->protection,
17140 				    src_entry->max_protection,
17141 				    VME_ALIAS(src_entry));
17142 			} else {
17143 				/*
17144 				 * Not IOKit or purgeable:
17145 				 * must be accounted by pmap stats.
17146 				 */
17147 				assertf(src_entry->use_pmap,
17148 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17149 				    map,
17150 				    src_entry,
17151 				    (uint64_t)src_entry->vme_start,
17152 				    (uint64_t)src_entry->vme_end,
17153 				    src_entry->protection,
17154 				    src_entry->max_protection,
17155 				    VME_ALIAS(src_entry));
17156 			}
17157 
17158 			if (object == VM_OBJECT_NULL) {
17159 				assert(!src_entry->needs_copy);
17160 				if (src_entry->max_protection == VM_PROT_NONE) {
17161 					assert(src_entry->protection == VM_PROT_NONE);
17162 					/*
17163 					 * No VM object and no permissions:
17164 					 * this must be a reserved range with
17165 					 * nothing to share or copy.
17166 					 * There could also be all sorts of
17167 					 * pmap shenanigans within that reserved
17168 					 * range, so let's just copy the map
17169 					 * entry as is to remap a similar
17170 					 * reserved range.
17171 					 */
17172 					offset = 0; /* no object => no offset */
17173 					goto copy_src_entry;
17174 				}
17175 				object = vm_object_allocate(entry_size);
17176 				VME_OFFSET_SET(src_entry, 0);
17177 				VME_OBJECT_SET(src_entry, object, false, 0);
17178 				assert(src_entry->use_pmap);
17179 				assert(!map->mapped_in_other_pmaps);
17180 			} else if (src_entry->wired_count ||
17181 			    object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17182 				/*
17183 				 * A wired memory region should not have
17184 				 * any pending copy-on-write and needs to
17185 				 * keep pointing at the VM object that
17186 				 * contains the wired pages.
17187 				 * If we're sharing this memory (copy=false),
17188 				 * we'll share this VM object.
17189 				 * If we're copying this memory (copy=true),
17190 				 * we'll call vm_object_copy_slowly() below
17191 				 * and use the new VM object for the remapping.
17192 				 *
17193 				 * Or, we are already using an asymmetric
17194 				 * copy, and therefore we already have
17195 				 * the right object.
17196 				 */
17197 				assert(!src_entry->needs_copy);
17198 			} else if (src_entry->needs_copy || object->shadowed ||
17199 			    (object->internal && !object->true_share &&
17200 			    !src_entry->is_shared &&
17201 			    object->vo_size > entry_size)) {
17202 				VME_OBJECT_SHADOW(src_entry, entry_size,
17203 				    vm_map_always_shadow(map));
17204 				assert(src_entry->use_pmap);
17205 
17206 				if (!src_entry->needs_copy &&
17207 				    (src_entry->protection & VM_PROT_WRITE)) {
17208 					vm_prot_t prot;
17209 
17210 					assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
17211 
17212 					prot = src_entry->protection & ~VM_PROT_WRITE;
17213 
17214 					if (override_nx(map,
17215 					    VME_ALIAS(src_entry))
17216 					    && prot) {
17217 						prot |= VM_PROT_EXECUTE;
17218 					}
17219 
17220 					assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
17221 
17222 					if (map->mapped_in_other_pmaps) {
17223 						vm_object_pmap_protect(
17224 							VME_OBJECT(src_entry),
17225 							VME_OFFSET(src_entry),
17226 							entry_size,
17227 							PMAP_NULL,
17228 							PAGE_SIZE,
17229 							src_entry->vme_start,
17230 							prot);
17231 #if MACH_ASSERT
17232 					} else if (__improbable(map->pmap == PMAP_NULL)) {
17233 						extern boolean_t vm_tests_in_progress;
17234 						assert(vm_tests_in_progress);
17235 						/*
17236 						 * Some VM tests (in vm_tests.c)
17237 						 * sometimes want to use a VM
17238 						 * map without a pmap.
17239 						 * Otherwise, this should never
17240 						 * happen.
17241 						 */
17242 #endif /* MACH_ASSERT */
17243 					} else {
17244 						pmap_protect(vm_map_pmap(map),
17245 						    src_entry->vme_start,
17246 						    src_entry->vme_end,
17247 						    prot);
17248 					}
17249 				}
17250 
17251 				object = VME_OBJECT(src_entry);
17252 				src_entry->needs_copy = FALSE;
17253 			}
17254 
17255 
17256 			vm_object_lock(object);
17257 			vm_object_reference_locked(object); /* object ref. for new entry */
17258 			assert(!src_entry->needs_copy);
17259 			if (object->copy_strategy ==
17260 			    MEMORY_OBJECT_COPY_SYMMETRIC) {
17261 				/*
17262 				 * If we want to share this object (copy==0),
17263 				 * it needs to be COPY_DELAY.
17264 				 * If we want to copy this object (copy==1),
17265 				 * we can't just set "needs_copy" on our side
17266 				 * and expect the other side to do the same
17267 				 * (symmetrically), so we can't let the object
17268 				 * stay COPY_SYMMETRIC.
17269 				 * So we always switch from COPY_SYMMETRIC to
17270 				 * COPY_DELAY.
17271 				 */
17272 				object->copy_strategy =
17273 				    MEMORY_OBJECT_COPY_DELAY;
17274 				object->true_share = TRUE;
17275 			}
17276 			vm_object_unlock(object);
17277 		}
17278 
17279 		offset = (VME_OFFSET(src_entry) +
17280 		    (src_start - src_entry->vme_start));
17281 
17282 copy_src_entry:
17283 		new_entry = _vm_map_entry_create(map_header);
17284 		vm_map_entry_copy(map, new_entry, src_entry);
17285 		if (new_entry->is_sub_map) {
17286 			/* clr address space specifics */
17287 			new_entry->use_pmap = FALSE;
17288 		} else if (copy) {
17289 			/*
17290 			 * We're dealing with a copy-on-write operation,
17291 			 * so the resulting mapping should not inherit the
17292 			 * original mapping's accounting settings.
17293 			 * "use_pmap" should be reset to its default (TRUE)
17294 			 * so that the new mapping gets accounted for in
17295 			 * the task's memory footprint.
17296 			 */
17297 			new_entry->use_pmap = TRUE;
17298 		}
17299 		/* "iokit_acct" was cleared in vm_map_entry_copy() */
17300 		assert(!new_entry->iokit_acct);
17301 
17302 		new_entry->map_aligned = FALSE;
17303 
17304 		new_entry->vme_start = map_address;
17305 		new_entry->vme_end = map_address + tmp_size;
17306 		assert(new_entry->vme_start < new_entry->vme_end);
17307 		if (copy && vmk_flags.vmkf_remap_prot_copy) {
17308 			/* security: keep "permanent" and "pmap_cs_associated" */
17309 			new_entry->vme_permanent = src_entry->vme_permanent;
17310 			new_entry->pmap_cs_associated = src_entry->pmap_cs_associated;
17311 			/*
17312 			 * Remapping for vm_map_protect(VM_PROT_COPY)
17313 			 * to convert a read-only mapping into a
17314 			 * copy-on-write version of itself but
17315 			 * with write access:
17316 			 * keep the original inheritance but let's not
17317 			 * add VM_PROT_WRITE to the max protection yet
17318 			 * since we want to do more security checks against
17319 			 * the target map.
17320 			 */
17321 			new_entry->inheritance = src_entry->inheritance;
17322 			new_entry->protection &= max_prot_for_prot_copy;
17323 		} else {
17324 			new_entry->inheritance = inheritance;
17325 			if (!vm_remap_legacy) {
17326 				new_entry->protection = *cur_protection;
17327 				new_entry->max_protection = *max_protection;
17328 			}
17329 		}
17330 		VME_OFFSET_SET(new_entry, offset);
17331 
17332 		/*
17333 		 * The new region has to be copied now if required.
17334 		 */
17335 RestartCopy:
17336 		if (!copy) {
17337 			if (src_entry->used_for_jit == TRUE) {
17338 				if (same_map) {
17339 				} else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
17340 					/*
17341 					 * Cannot allow an entry describing a JIT
17342 					 * region to be shared across address spaces.
17343 					 */
17344 					result = KERN_INVALID_ARGUMENT;
17345 					vm_object_deallocate(object);
17346 					vm_map_entry_dispose(new_entry);
17347 					new_entry = VM_MAP_ENTRY_NULL;
17348 					break;
17349 				}
17350 			}
17351 
17352 			src_entry->is_shared = TRUE;
17353 			new_entry->is_shared = TRUE;
17354 			if (!(new_entry->is_sub_map)) {
17355 				new_entry->needs_copy = FALSE;
17356 			}
17357 		} else if (src_entry->is_sub_map) {
17358 			/* make this a COW sub_map if not already */
17359 			assert(new_entry->wired_count == 0);
17360 			new_entry->needs_copy = TRUE;
17361 			object = VM_OBJECT_NULL;
17362 		} else if (src_entry->wired_count == 0 &&
17363 		    !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
17364 		    vm_object_copy_quickly(VME_OBJECT(new_entry),
17365 		    VME_OFFSET(new_entry),
17366 		    (new_entry->vme_end -
17367 		    new_entry->vme_start),
17368 		    &src_needs_copy,
17369 		    &new_entry_needs_copy)) {
17370 			new_entry->needs_copy = new_entry_needs_copy;
17371 			new_entry->is_shared = FALSE;
17372 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17373 
17374 			/*
17375 			 * Handle copy_on_write semantics.
17376 			 */
17377 			if (src_needs_copy && !src_entry->needs_copy) {
17378 				vm_prot_t prot;
17379 
17380 				assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
17381 
17382 				prot = src_entry->protection & ~VM_PROT_WRITE;
17383 
17384 				if (override_nx(map,
17385 				    VME_ALIAS(src_entry))
17386 				    && prot) {
17387 					prot |= VM_PROT_EXECUTE;
17388 				}
17389 
17390 				assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
17391 
17392 				vm_object_pmap_protect(object,
17393 				    offset,
17394 				    entry_size,
17395 				    ((src_entry->is_shared
17396 				    || map->mapped_in_other_pmaps) ?
17397 				    PMAP_NULL : map->pmap),
17398 				    VM_MAP_PAGE_SIZE(map),
17399 				    src_entry->vme_start,
17400 				    prot);
17401 
17402 				assert(src_entry->wired_count == 0);
17403 				src_entry->needs_copy = TRUE;
17404 			}
17405 			/*
17406 			 * Throw away the old object reference of the new entry.
17407 			 */
17408 			vm_object_deallocate(object);
17409 		} else {
17410 			new_entry->is_shared = FALSE;
17411 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17412 
17413 			src_entry_was_wired = (src_entry->wired_count > 0);
17414 			saved_src_entry = src_entry;
17415 			src_entry = VM_MAP_ENTRY_NULL;
17416 
17417 			/*
17418 			 * The map can be safely unlocked since we
17419 			 * already hold a reference on the object.
17420 			 *
17421 			 * Record the timestamp of the map for later
17422 			 * verification, and unlock the map.
17423 			 */
17424 			version.main_timestamp = map->timestamp;
17425 			vm_map_unlock(map);     /* Increments timestamp once! */
17426 
17427 			/*
17428 			 * Perform the copy.
17429 			 */
17430 			if (src_entry_was_wired > 0 ||
17431 			    (debug4k_no_cow_copyin &&
17432 			    VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
17433 				vm_object_lock(object);
17434 				result = vm_object_copy_slowly(
17435 					object,
17436 					offset,
17437 					(new_entry->vme_end -
17438 					new_entry->vme_start),
17439 					THREAD_UNINT,
17440 					&new_copy_object);
17441 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
17442 				saved_used_for_jit = new_entry->used_for_jit;
17443 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
17444 				new_entry->used_for_jit = saved_used_for_jit;
17445 				VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
17446 				new_entry->needs_copy = FALSE;
17447 			} else {
17448 				vm_object_offset_t new_offset;
17449 
17450 				new_offset = VME_OFFSET(new_entry);
17451 				result = vm_object_copy_strategically(
17452 					object,
17453 					offset,
17454 					(new_entry->vme_end -
17455 					new_entry->vme_start),
17456 					&new_copy_object,
17457 					&new_offset,
17458 					&new_entry_needs_copy);
17459 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
17460 				saved_used_for_jit = new_entry->used_for_jit;
17461 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
17462 				new_entry->used_for_jit = saved_used_for_jit;
17463 				if (new_offset != VME_OFFSET(new_entry)) {
17464 					VME_OFFSET_SET(new_entry, new_offset);
17465 				}
17466 
17467 				new_entry->needs_copy = new_entry_needs_copy;
17468 			}
17469 
17470 			/*
17471 			 * Throw away the old object reference of the new entry.
17472 			 */
17473 			vm_object_deallocate(object);
17474 
17475 			if (result != KERN_SUCCESS &&
17476 			    result != KERN_MEMORY_RESTART_COPY) {
17477 				vm_map_entry_dispose(new_entry);
17478 				vm_map_lock(map);
17479 				break;
17480 			}
17481 
17482 			/*
17483 			 * Verify that the map has not substantially
17484 			 * changed while the copy was being made.
17485 			 */
17486 
17487 			vm_map_lock(map);
17488 			if (version.main_timestamp + 1 != map->timestamp) {
17489 				/*
17490 				 * Simple version comparison failed.
17491 				 *
17492 				 * Retry the lookup and verify that the
17493 				 * same object/offset are still present.
17494 				 */
17495 				saved_src_entry = VM_MAP_ENTRY_NULL;
17496 				vm_object_deallocate(VME_OBJECT(new_entry));
17497 				vm_map_entry_dispose(new_entry);
17498 				if (result == KERN_MEMORY_RESTART_COPY) {
17499 					result = KERN_SUCCESS;
17500 				}
17501 				continue;
17502 			}
17503 			/* map hasn't changed: src_entry is still valid */
17504 			src_entry = saved_src_entry;
17505 			saved_src_entry = VM_MAP_ENTRY_NULL;
17506 
17507 			if (result == KERN_MEMORY_RESTART_COPY) {
17508 				vm_object_reference(object);
17509 				goto RestartCopy;
17510 			}
17511 		}
17512 
17513 		_vm_map_store_entry_link(map_header,
17514 		    map_header->links.prev, new_entry);
17515 
17516 		/* protections for submap mapping are irrelevant here */
17517 		if (vm_remap_legacy && !src_entry->is_sub_map) {
17518 			*cur_protection &= src_entry->protection;
17519 			*max_protection &= src_entry->max_protection;
17520 		}
17521 
17522 		map_address += tmp_size;
17523 		mapped_size += tmp_size;
17524 		src_start += tmp_size;
17525 
17526 		if (vmk_flags.vmkf_copy_single_object) {
17527 			if (mapped_size != size) {
17528 				DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
17529 				    map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
17530 				if (src_entry->vme_next != vm_map_to_entry(map) &&
17531 				    src_entry->vme_next->vme_object_value ==
17532 				    src_entry->vme_object_value) {
17533 					/* XXX TODO4K */
17534 					DEBUG4K_ERROR("could have extended copy to next entry...\n");
17535 				}
17536 			}
17537 			break;
17538 		}
17539 	} /* end while */
17540 
17541 	vm_map_unlock(map);
17542 	if (result != KERN_SUCCESS) {
17543 		/*
17544 		 * Free all allocated elements.
17545 		 */
17546 		for (src_entry = map_header->links.next;
17547 		    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
17548 		    src_entry = new_entry) {
17549 			new_entry = src_entry->vme_next;
17550 			_vm_map_store_entry_unlink(map_header, src_entry, false);
17551 			if (src_entry->is_sub_map) {
17552 				vm_map_deallocate(VME_SUBMAP(src_entry));
17553 			} else {
17554 				vm_object_deallocate(VME_OBJECT(src_entry));
17555 			}
17556 			vm_map_entry_dispose(src_entry);
17557 		}
17558 	}
17559 	return result;
17560 }
17561 
17562 bool
vm_map_is_exotic(vm_map_t map)17563 vm_map_is_exotic(
17564 	vm_map_t map)
17565 {
17566 	return VM_MAP_IS_EXOTIC(map);
17567 }
17568 
17569 bool
vm_map_is_alien(vm_map_t map)17570 vm_map_is_alien(
17571 	vm_map_t map)
17572 {
17573 	return VM_MAP_IS_ALIEN(map);
17574 }
17575 
17576 #if XNU_TARGET_OS_OSX
/*
 * vm_map_mark_alien:
 *	Set the map's "is_alien" flag (queried via VM_MAP_IS_ALIEN()).
 *	The flag is set under the map lock to serialize with other
 *	mutations of the map; it is never cleared here.
 */
void
vm_map_mark_alien(
	vm_map_t map)
{
	vm_map_lock(map);
	map->is_alien = true;
	vm_map_unlock(map);
}
17585 
/*
 * vm_map_single_jit:
 *	Set the map's "single_jit" flag under the map lock.
 *	NOTE(review): presumably restricts the map to a single JIT
 *	region — confirm against the consumers of "single_jit".
 */
void
vm_map_single_jit(
	vm_map_t map)
{
	vm_map_lock(map);
	map->single_jit = true;
	vm_map_unlock(map);
}
17594 #endif /* XNU_TARGET_OS_OSX */
17595 
17596 /*
17597  * Callers of this function must call vm_map_copy_require on
17598  * previously created vm_map_copy_t or pass a newly created
17599  * one to ensure that it hasn't been forged.
17600  */
17601 static kern_return_t
vm_map_copy_to_physcopy(vm_map_copy_t copy_map,vm_map_t target_map)17602 vm_map_copy_to_physcopy(
17603 	vm_map_copy_t   copy_map,
17604 	vm_map_t        target_map)
17605 {
17606 	vm_map_size_t           size;
17607 	vm_map_entry_t          entry;
17608 	vm_map_entry_t          new_entry;
17609 	vm_object_t             new_object;
17610 	unsigned int            pmap_flags;
17611 	pmap_t                  new_pmap;
17612 	vm_map_t                new_map;
17613 	vm_map_address_t        src_start, src_end, src_cur;
17614 	vm_map_address_t        dst_start, dst_end, dst_cur;
17615 	kern_return_t           kr;
17616 	void                    *kbuf;
17617 
17618 	/*
17619 	 * Perform the equivalent of vm_allocate() and memcpy().
17620 	 * Replace the mappings in "copy_map" with the newly allocated mapping.
17621 	 */
17622 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
17623 
17624 	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
17625 
17626 	/* create a new pmap to map "copy_map" */
17627 	pmap_flags = 0;
17628 	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
17629 #if PMAP_CREATE_FORCE_4K_PAGES
17630 	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
17631 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
17632 	pmap_flags |= PMAP_CREATE_64BIT;
17633 	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
17634 	if (new_pmap == NULL) {
17635 		return KERN_RESOURCE_SHORTAGE;
17636 	}
17637 
17638 	/* allocate new VM object */
17639 	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
17640 	new_object = vm_object_allocate(size);
17641 	assert(new_object);
17642 
17643 	/* allocate new VM map entry */
17644 	new_entry = vm_map_copy_entry_create(copy_map);
17645 	assert(new_entry);
17646 
17647 	/* finish initializing new VM map entry */
17648 	new_entry->protection = VM_PROT_DEFAULT;
17649 	new_entry->max_protection = VM_PROT_DEFAULT;
17650 	new_entry->use_pmap = TRUE;
17651 
17652 	/* make new VM map entry point to new VM object */
17653 	new_entry->vme_start = 0;
17654 	new_entry->vme_end = size;
17655 	VME_OBJECT_SET(new_entry, new_object, false, 0);
17656 	VME_OFFSET_SET(new_entry, 0);
17657 
17658 	/* create a new pageable VM map to map "copy_map" */
17659 	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
17660 	    VM_MAP_CREATE_PAGEABLE);
17661 	assert(new_map);
17662 	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
17663 
17664 	/* map "copy_map" in the new VM map */
17665 	src_start = 0;
17666 	kr = vm_map_copyout_internal(
17667 		new_map,
17668 		&src_start,
17669 		copy_map,
17670 		copy_map->size,
17671 		FALSE, /* consume_on_success */
17672 		VM_PROT_DEFAULT,
17673 		VM_PROT_DEFAULT,
17674 		VM_INHERIT_DEFAULT);
17675 	assert(kr == KERN_SUCCESS);
17676 	src_end = src_start + copy_map->size;
17677 
17678 	/* map "new_object" in the new VM map */
17679 	vm_object_reference(new_object);
17680 	dst_start = 0;
17681 	kr = vm_map_enter(new_map,
17682 	    &dst_start,
17683 	    size,
17684 	    0,               /* mask */
17685 	    VM_FLAGS_ANYWHERE,
17686 	    VM_MAP_KERNEL_FLAGS_NONE,
17687 	    VM_KERN_MEMORY_OSFMK,
17688 	    new_object,
17689 	    0,               /* offset */
17690 	    FALSE,               /* needs copy */
17691 	    VM_PROT_DEFAULT,
17692 	    VM_PROT_DEFAULT,
17693 	    VM_INHERIT_DEFAULT);
17694 	assert(kr == KERN_SUCCESS);
17695 	dst_end = dst_start + size;
17696 
17697 	/* get a kernel buffer */
17698 	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
17699 
17700 	/* physically copy "copy_map" mappings to new VM object */
17701 	for (src_cur = src_start, dst_cur = dst_start;
17702 	    src_cur < src_end;
17703 	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
17704 		vm_size_t bytes;
17705 
17706 		bytes = PAGE_SIZE;
17707 		if (src_cur + PAGE_SIZE > src_end) {
17708 			/* partial copy for last page */
17709 			bytes = src_end - src_cur;
17710 			assert(bytes > 0 && bytes < PAGE_SIZE);
17711 			/* rest of dst page should be zero-filled */
17712 		}
17713 		/* get bytes from src mapping */
17714 		kr = copyinmap(new_map, src_cur, kbuf, bytes);
17715 		if (kr != KERN_SUCCESS) {
17716 			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
17717 		}
17718 		/* put bytes in dst mapping */
17719 		assert(dst_cur < dst_end);
17720 		assert(dst_cur + bytes <= dst_end);
17721 		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
17722 		if (kr != KERN_SUCCESS) {
17723 			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
17724 		}
17725 	}
17726 
17727 	/* free kernel buffer */
17728 	kfree_data(kbuf, PAGE_SIZE);
17729 
17730 	/* destroy new map */
17731 	vm_map_destroy(new_map);
17732 	new_map = VM_MAP_NULL;
17733 
17734 	/* dispose of the old map entries in "copy_map" */
17735 	while (vm_map_copy_first_entry(copy_map) !=
17736 	    vm_map_copy_to_entry(copy_map)) {
17737 		entry = vm_map_copy_first_entry(copy_map);
17738 		vm_map_copy_entry_unlink(copy_map, entry);
17739 		if (entry->is_sub_map) {
17740 			vm_map_deallocate(VME_SUBMAP(entry));
17741 		} else {
17742 			vm_object_deallocate(VME_OBJECT(entry));
17743 		}
17744 		vm_map_copy_entry_dispose(entry);
17745 	}
17746 
17747 	/* change "copy_map"'s page_size to match "target_map" */
17748 	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
17749 	copy_map->offset = 0;
17750 	copy_map->size = size;
17751 
17752 	/* insert new map entry in "copy_map" */
17753 	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
17754 	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
17755 
17756 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
17757 	return KERN_SUCCESS;
17758 }
17759 
17760 void
17761 vm_map_copy_adjust_get_target_copy_map(
17762 	vm_map_copy_t   copy_map,
17763 	vm_map_copy_t   *target_copy_map_p);
17764 void
vm_map_copy_adjust_get_target_copy_map(vm_map_copy_t copy_map,vm_map_copy_t * target_copy_map_p)17765 vm_map_copy_adjust_get_target_copy_map(
17766 	vm_map_copy_t   copy_map,
17767 	vm_map_copy_t   *target_copy_map_p)
17768 {
17769 	vm_map_copy_t   target_copy_map;
17770 	vm_map_entry_t  entry, target_entry;
17771 
17772 	if (*target_copy_map_p != VM_MAP_COPY_NULL) {
17773 		/* the caller already has a "target_copy_map": use it */
17774 		return;
17775 	}
17776 
17777 	/* the caller wants us to create a new copy of "copy_map" */
17778 	target_copy_map = vm_map_copy_allocate();
17779 	target_copy_map->type = copy_map->type;
17780 	assert(target_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
17781 	target_copy_map->offset = copy_map->offset;
17782 	target_copy_map->size = copy_map->size;
17783 	target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
17784 	vm_map_store_init(&target_copy_map->cpy_hdr);
17785 	for (entry = vm_map_copy_first_entry(copy_map);
17786 	    entry != vm_map_copy_to_entry(copy_map);
17787 	    entry = entry->vme_next) {
17788 		target_entry = vm_map_copy_entry_create(target_copy_map);
17789 		vm_map_entry_copy_full(target_entry, entry);
17790 		if (target_entry->is_sub_map) {
17791 			vm_map_reference(VME_SUBMAP(target_entry));
17792 		} else {
17793 			vm_object_reference(VME_OBJECT(target_entry));
17794 		}
17795 		vm_map_copy_entry_link(
17796 			target_copy_map,
17797 			vm_map_copy_last_entry(target_copy_map),
17798 			target_entry);
17799 	}
17800 	entry = VM_MAP_ENTRY_NULL;
17801 	*target_copy_map_p = target_copy_map;
17802 }
17803 
17804 /*
17805  * Callers of this function must call vm_map_copy_require on
17806  * previously created vm_map_copy_t or pass a newly created
17807  * one to ensure that it hasn't been forged.
17808  */
17809 static void
vm_map_copy_trim(vm_map_copy_t copy_map,uint16_t new_page_shift,vm_map_offset_t trim_start,vm_map_offset_t trim_end)17810 vm_map_copy_trim(
17811 	vm_map_copy_t   copy_map,
17812 	uint16_t        new_page_shift,
17813 	vm_map_offset_t trim_start,
17814 	vm_map_offset_t trim_end)
17815 {
17816 	uint16_t        copy_page_shift;
17817 	vm_map_entry_t  entry, next_entry;
17818 
17819 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
17820 	assert(copy_map->cpy_hdr.nentries > 0);
17821 
17822 	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
17823 	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
17824 
17825 	/* use the new page_shift to do the clipping */
17826 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
17827 	copy_map->cpy_hdr.page_shift = new_page_shift;
17828 
17829 	for (entry = vm_map_copy_first_entry(copy_map);
17830 	    entry != vm_map_copy_to_entry(copy_map);
17831 	    entry = next_entry) {
17832 		next_entry = entry->vme_next;
17833 		if (entry->vme_end <= trim_start) {
17834 			/* entry fully before trim range: skip */
17835 			continue;
17836 		}
17837 		if (entry->vme_start >= trim_end) {
17838 			/* entry fully after trim range: done */
17839 			break;
17840 		}
17841 		/* clip entry if needed */
17842 		vm_map_copy_clip_start(copy_map, entry, trim_start);
17843 		vm_map_copy_clip_end(copy_map, entry, trim_end);
17844 		/* dispose of entry */
17845 		copy_map->size -= entry->vme_end - entry->vme_start;
17846 		vm_map_copy_entry_unlink(copy_map, entry);
17847 		if (entry->is_sub_map) {
17848 			vm_map_deallocate(VME_SUBMAP(entry));
17849 		} else {
17850 			vm_object_deallocate(VME_OBJECT(entry));
17851 		}
17852 		vm_map_copy_entry_dispose(entry);
17853 		entry = VM_MAP_ENTRY_NULL;
17854 	}
17855 
17856 	/* restore copy_map's original page_shift */
17857 	copy_map->cpy_hdr.page_shift = copy_page_shift;
17858 }
17859 
17860 /*
17861  * Make any necessary adjustments to "copy_map" to allow it to be
17862  * mapped into "target_map".
17863  * If no changes were necessary, "target_copy_map" points to the
17864  * untouched "copy_map".
17865  * If changes are necessary, changes will be made to "target_copy_map".
17866  * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
17867  * copy the original "copy_map" to it before applying the changes.
17868  * The caller should discard "target_copy_map" if it's not the same as
17869  * the original "copy_map".
17870  */
17871 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
/*
 * Parameters:
 *   src_copy_map       copy to adjust; left untouched unless it is also
 *                      what "*target_copy_map_p" points to
 *   offset, size       sub-range of interest within "src_copy_map"
 *   target_map         map the copy is destined for; its page size drives
 *                      the trimming / re-alignment below
 *   copy               TRUE if the caller can tolerate a physical copy;
 *                      interior misalignments are then resolved via
 *                      vm_map_copy_to_physcopy(), otherwise they are fatal
 *                      (KERN_NOT_SUPPORTED)
 *   target_copy_map_p  IN/OUT: the adjusted copy (see block comment above)
 *   overmap_start_p    OUT: extra bytes over-mapped before the start to
 *                      re-align the first entry
 *   overmap_end_p      OUT: extra bytes over-mapped past the end to
 *                      re-align the last entry
 *   trimmed_start_p    OUT: bytes trimmed from the start of the copy
 */
kern_return_t
vm_map_copy_adjust_to_target(
	vm_map_copy_t           src_copy_map,
	vm_map_offset_t         offset,
	vm_map_size_t           size,
	vm_map_t                target_map,
	boolean_t               copy,
	vm_map_copy_t           *target_copy_map_p,
	vm_map_offset_t         *overmap_start_p,
	vm_map_offset_t         *overmap_end_p,
	vm_map_offset_t         *trimmed_start_p)
{
	vm_map_copy_t           copy_map, target_copy_map;
	vm_map_size_t           target_size;
	vm_map_size_t           src_copy_map_size;
	vm_map_size_t           overmap_start, overmap_end;
	int                     misalignments;
	vm_map_entry_t          entry, target_entry;
	vm_map_offset_t         addr_adjustment;
	vm_map_offset_t         new_start, new_end;
	int                     copy_page_mask, target_page_mask;
	uint16_t                copy_page_shift, target_page_shift;
	vm_map_offset_t         trimmed_end;

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(src_copy_map);
	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);

	/*
	 * Start working with "src_copy_map" but we'll switch
	 * to "target_copy_map" as soon as we start making adjustments.
	 */
	copy_map = src_copy_map;
	src_copy_map_size = src_copy_map->size;

	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
	target_page_mask = VM_MAP_PAGE_MASK(target_map);

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p);

	target_copy_map = *target_copy_map_p;
	if (target_copy_map != VM_MAP_COPY_NULL) {
		vm_map_copy_require(target_copy_map);
	}

	/* requested sub-range must fit within the copy */
	if (offset + size > copy_map->size) {
		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size);
		return KERN_INVALID_ARGUMENT;
	}

	/* trim the end */
	trimmed_end = 0;
	new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask);
	if (new_end < copy_map->size) {
		trimmed_end = src_copy_map_size - new_end;
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    new_end, copy_map->size);
	}

	/* trim the start */
	new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask);
	if (new_start != 0) {
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    0, new_start);
	}
	*trimmed_start_p = new_start;

	/* target_size starts with what's left after trimming */
	target_size = copy_map->size;
	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);

	/* check for misalignments but don't adjust yet */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	if (copy_page_shift < target_page_shift) {
		/*
		 * Remapping from 4K to 16K: check the VM object alignments
		 * throughout the range.
		 * If the start and end of the range are mis-aligned, we can
		 * over-map to re-align, and adjust the "overmap" start/end
		 * and "target_size" of the range accordingly.
		 * If there is any mis-alignment within the range:
		 *     if "copy":
		 *         we can do immediate-copy instead of copy-on-write,
		 *     else:
		 *         no way to remap and share; fail.
		 */
		for (entry = vm_map_copy_first_entry(copy_map);
		    entry != vm_map_copy_to_entry(copy_map);
		    entry = entry->vme_next) {
			vm_object_offset_t object_offset_start, object_offset_end;

			object_offset_start = VME_OFFSET(entry);
			object_offset_end = object_offset_start;
			object_offset_end += entry->vme_end - entry->vme_start;
			if (object_offset_start & target_page_mask) {
				/* only the very first entry may be fixed by over-mapping */
				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
					overmap_start++;
				} else {
					misalignments++;
				}
			}
			if (object_offset_end & target_page_mask) {
				/* only the very last entry may be fixed by over-mapping */
				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
					overmap_end++;
				} else {
					misalignments++;
				}
			}
		}
	}
	entry = VM_MAP_ENTRY_NULL;

	/* decide how to deal with misalignments */
	assert(overmap_start <= 1);
	assert(overmap_end <= 1);
	if (!overmap_start && !overmap_end && !misalignments) {
		/* copy_map is properly aligned for target_map ... */
		if (*trimmed_start_p) {
			/* ... but we trimmed it, so still need to adjust */
		} else {
			/* ... and we didn't trim anything: we're done */
			if (target_copy_map == VM_MAP_COPY_NULL) {
				target_copy_map = copy_map;
			}
			*target_copy_map_p = target_copy_map;
			*overmap_start_p = 0;
			*overmap_end_p = 0;
			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
			return KERN_SUCCESS;
		}
	} else if (misalignments && !copy) {
		/* can't "share" if misaligned */
		DEBUG4K_ADJUST("unsupported sharing\n");
#if MACH_ASSERT
		if (debug4k_panic_on_misaligned_sharing) {
			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
		}
#endif /* MACH_ASSERT */
		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
		return KERN_NOT_SUPPORTED;
	} else {
		/* can't virtual-copy if misaligned (but can physical-copy) */
		DEBUG4K_ADJUST("mis-aligned copying\n");
	}

	/* get a "target_copy_map" if needed and switch to it */
	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
	copy_map = target_copy_map;

	if (misalignments && copy) {
		vm_map_size_t target_copy_map_size;

		/*
		 * Can't do copy-on-write with misaligned mappings.
		 * Replace the mappings with a physical copy of the original
		 * mappings' contents.
		 */
		target_copy_map_size = target_copy_map->size;
		kern_return_t kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		*target_copy_map_p = target_copy_map;
		*overmap_start_p = 0;
		/* physcopy may have grown the copy to page-aligned size */
		*overmap_end_p = target_copy_map->size - target_copy_map_size;
		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
		return KERN_SUCCESS;
	}

	/* apply the adjustments */
	/* reset the counters for the adjustment pass */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	/* remove copy_map->offset, so that everything starts at offset 0 */
	addr_adjustment = copy_map->offset;
	/* also remove whatever we trimmed from the start */
	addr_adjustment += *trimmed_start_p;
	for (target_entry = vm_map_copy_first_entry(target_copy_map);
	    target_entry != vm_map_copy_to_entry(target_copy_map);
	    target_entry = target_entry->vme_next) {
		vm_object_offset_t object_offset_start, object_offset_end;

		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
		object_offset_start = VME_OFFSET(target_entry);
		if (object_offset_start & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
				/*
				 * start of 1st entry is mis-aligned:
				 * re-adjust by over-mapping.
				 */
				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				assert(copy);
			}
		}

		/*
		 * The first entry's start stays put (the overmap grows the
		 * total size instead); all later entries shift up by
		 * "overmap_start".
		 */
		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
			target_size += overmap_start;
		} else {
			target_entry->vme_start += overmap_start;
		}
		target_entry->vme_end += overmap_start;

		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
		if (object_offset_end & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
				/*
				 * end of last entry is mis-aligned: re-adjust by over-mapping.
				 */
				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
				target_entry->vme_end += overmap_end;
				target_size += overmap_end;
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				assert(copy);
			}
		}
		/* re-base the entry so the copy starts at address 0 */
		target_entry->vme_start -= addr_adjustment;
		target_entry->vme_end -= addr_adjustment;
		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
	}

	target_copy_map->size = target_size;
	target_copy_map->offset += overmap_start;
	target_copy_map->offset -= addr_adjustment;
	/* the copy now uses the target map's page size */
	target_copy_map->cpy_hdr.page_shift = target_page_shift;

//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));

	*target_copy_map_p = target_copy_map;
	*overmap_start_p = overmap_start;
	*overmap_end_p = overmap_end;

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
	return KERN_SUCCESS;
}
18139 
/*
 *	Routine:	vm_map_range_physical_size
 *
 *	Compute, into *phys_size, the size that the address range
 *	[start, start+size) of "map" occupies once page-aligned.
 *	For maps using the system page size, that is simply the range
 *	rounded to the map's page boundaries.  For maps with a smaller
 *	page size (4K map on 16K system), the range is extracted and
 *	re-adjusted against the kernel map's page size via
 *	vm_map_copy_adjust_to_target() and the adjusted copy's size is
 *	returned.  On error, *phys_size is set to 0.
 */
kern_return_t
vm_map_range_physical_size(
	vm_map_t         map,
	vm_map_address_t start,
	mach_vm_size_t   size,
	mach_vm_size_t * phys_size)
{
	kern_return_t   kr;
	vm_map_copy_t   copy_map, target_copy_map;
	vm_map_offset_t adjusted_start, adjusted_end;
	vm_map_size_t   adjusted_size;
	vm_prot_t       cur_prot, max_prot;
	vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
	vm_map_kernel_flags_t vmk_flags;

	if (size == 0) {
		/* empty range: nothing to measure */
		DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
		*phys_size = 0;
		return KERN_SUCCESS;
	}

	adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
	adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
	if (__improbable(os_add_overflow(start, size, &end) ||
	    adjusted_end <= adjusted_start)) {
		/* wraparound */
		printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
		*phys_size = 0;
		return KERN_INVALID_ARGUMENT;
	}
	assert(adjusted_end > adjusted_start);
	adjusted_size = adjusted_end - adjusted_start;
	*phys_size = adjusted_size;
	if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
		/* map already uses the system page size: no adjustment needed */
		return KERN_SUCCESS;
	}
	if (start == 0) {
		/* start == 0: just re-round to the system page size */
		adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
		adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
		if (__improbable(adjusted_end <= adjusted_start)) {
			/* wraparound */
			printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
			*phys_size = 0;
			return KERN_INVALID_ARGUMENT;
		}
		assert(adjusted_end > adjusted_start);
		adjusted_size = adjusted_end - adjusted_start;
		*phys_size = adjusted_size;
		return KERN_SUCCESS;
	}

	/* extract the range (shared, not copied) to measure its adjusted size */
	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	vmk_flags.vmkf_copy_pageable = TRUE;
	vmk_flags.vmkf_copy_same_map = TRUE;
	assert(adjusted_size != 0);
	cur_prot = VM_PROT_NONE; /* legacy mode */
	max_prot = VM_PROT_NONE; /* legacy mode */
	kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
	    FALSE /* copy */,
	    &copy_map,
	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
	    vmk_flags);
	if (kr != KERN_SUCCESS) {
		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
		//assert(0);
		*phys_size = 0;
		return kr;
	}
	assert(copy_map != VM_MAP_COPY_NULL);
	target_copy_map = copy_map;
	DEBUG4K_ADJUST("adjusting...\n");
	kr = vm_map_copy_adjust_to_target(
		copy_map,
		start - adjusted_start, /* offset */
		size, /* size */
		kernel_map,
		FALSE,                          /* copy */
		&target_copy_map,
		&overmap_start,
		&overmap_end,
		&trimmed_start);
	if (kr == KERN_SUCCESS) {
		if (target_copy_map->size != *phys_size) {
			DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
		}
		/* the adjusted copy's size is the "physical" size of the range */
		*phys_size = target_copy_map->size;
	} else {
		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
		//assert(0);
		*phys_size = 0;
	}
	/*
	 * NOTE(review): the adjustment did not allocate a new copy here
	 * ("copy" is FALSE and "target_copy_map" started as "copy_map"),
	 * so discarding "copy_map" releases everything — confirm against
	 * vm_map_copy_adjust_to_target()'s allocation behavior.
	 */
	vm_map_copy_discard(copy_map);
	copy_map = VM_MAP_COPY_NULL;

	return kr;
}
18236 
18237 
18238 kern_return_t
memory_entry_check_for_adjustment(vm_map_t src_map,ipc_port_t port,vm_map_offset_t * overmap_start,vm_map_offset_t * overmap_end)18239 memory_entry_check_for_adjustment(
18240 	vm_map_t                        src_map,
18241 	ipc_port_t                      port,
18242 	vm_map_offset_t         *overmap_start,
18243 	vm_map_offset_t         *overmap_end)
18244 {
18245 	kern_return_t kr = KERN_SUCCESS;
18246 	vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL;
18247 
18248 	assert(port);
18249 	assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port));
18250 
18251 	vm_named_entry_t        named_entry;
18252 
18253 	named_entry = mach_memory_entry_from_port(port);
18254 	named_entry_lock(named_entry);
18255 	copy_map = named_entry->backing.copy;
18256 	target_copy_map = copy_map;
18257 
18258 	if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) {
18259 		vm_map_offset_t trimmed_start;
18260 
18261 		trimmed_start = 0;
18262 		DEBUG4K_ADJUST("adjusting...\n");
18263 		kr = vm_map_copy_adjust_to_target(
18264 			copy_map,
18265 			0, /* offset */
18266 			copy_map->size, /* size */
18267 			src_map,
18268 			FALSE, /* copy */
18269 			&target_copy_map,
18270 			overmap_start,
18271 			overmap_end,
18272 			&trimmed_start);
18273 		assert(trimmed_start == 0);
18274 	}
18275 	named_entry_unlock(named_entry);
18276 
18277 	return kr;
18278 }
18279 
18280 
18281 /*
18282  *	Routine:	vm_remap
18283  *
18284  *			Map portion of a task's address space.
18285  *			Mapped region must not overlap more than
18286  *			one vm memory object. Protections and
18287  *			inheritance attributes remain the same
18288  *			as in the original task and are	out parameters.
18289  *			Source and Target task can be identical
18290  *			Other attributes are identical as for vm_map()
18291  */
18292 kern_return_t
vm_map_remap(vm_map_t target_map,vm_map_address_t * address,vm_map_size_t size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t tag,vm_map_t src_map,vm_map_offset_t memory_address,boolean_t copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance)18293 vm_map_remap(
18294 	vm_map_t                target_map,
18295 	vm_map_address_t        *address,
18296 	vm_map_size_t           size,
18297 	vm_map_offset_t         mask,
18298 	int                     flags,
18299 	vm_map_kernel_flags_t   vmk_flags,
18300 	vm_tag_t                tag,
18301 	vm_map_t                src_map,
18302 	vm_map_offset_t         memory_address,
18303 	boolean_t               copy,
18304 	vm_prot_t               *cur_protection, /* IN/OUT */
18305 	vm_prot_t               *max_protection, /* IN/OUT */
18306 	vm_inherit_t            inheritance)
18307 {
18308 	kern_return_t           result;
18309 	vm_map_entry_t          entry;
18310 	vm_map_entry_t          insp_entry = VM_MAP_ENTRY_NULL;
18311 	vm_map_entry_t          new_entry;
18312 	vm_map_copy_t           copy_map;
18313 	vm_map_offset_t         offset_in_mapping;
18314 	vm_map_size_t           target_size = 0;
18315 	vm_map_size_t           src_page_mask, target_page_mask;
18316 	vm_map_offset_t         overmap_start, overmap_end, trimmed_start;
18317 	vm_map_offset_t         initial_memory_address;
18318 	vm_map_size_t           initial_size;
18319 	VM_MAP_ZAP_DECLARE(zap_list);
18320 
18321 	if (target_map == VM_MAP_NULL) {
18322 		return KERN_INVALID_ARGUMENT;
18323 	}
18324 
18325 	initial_memory_address = memory_address;
18326 	initial_size = size;
18327 	src_page_mask = VM_MAP_PAGE_MASK(src_map);
18328 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
18329 
18330 	switch (inheritance) {
18331 	case VM_INHERIT_NONE:
18332 	case VM_INHERIT_COPY:
18333 	case VM_INHERIT_SHARE:
18334 		if (size != 0 && src_map != VM_MAP_NULL) {
18335 			break;
18336 		}
18337 		OS_FALLTHROUGH;
18338 	default:
18339 		return KERN_INVALID_ARGUMENT;
18340 	}
18341 
18342 	if (src_page_mask != target_page_mask) {
18343 		if (copy) {
18344 			DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
18345 		} else {
18346 			DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
18347 		}
18348 	}
18349 
18350 	/*
18351 	 * If the user is requesting that we return the address of the
18352 	 * first byte of the data (rather than the base of the page),
18353 	 * then we use different rounding semantics: specifically,
18354 	 * we assume that (memory_address, size) describes a region
18355 	 * all of whose pages we must cover, rather than a base to be truncated
18356 	 * down and a size to be added to that base.  So we figure out
18357 	 * the highest page that the requested region includes and make
18358 	 * sure that the size will cover it.
18359 	 *
18360 	 * The key example we're worried about it is of the form:
18361 	 *
18362 	 *              memory_address = 0x1ff0, size = 0x20
18363 	 *
18364 	 * With the old semantics, we round down the memory_address to 0x1000
18365 	 * and round up the size to 0x1000, resulting in our covering *only*
18366 	 * page 0x1000.  With the new semantics, we'd realize that the region covers
18367 	 * 0x1ff0-0x2010, and compute a size of 0x2000.  Thus, we cover both page
18368 	 * 0x1000 and page 0x2000 in the region we remap.
18369 	 */
18370 	if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) {
18371 		vm_map_offset_t range_start, range_end;
18372 
18373 		range_start = vm_map_trunc_page(memory_address, src_page_mask);
18374 		range_end = vm_map_round_page(memory_address + size, src_page_mask);
18375 		memory_address = range_start;
18376 		size = range_end - range_start;
18377 		offset_in_mapping = initial_memory_address - memory_address;
18378 	} else {
18379 		/*
18380 		 * IMPORTANT:
18381 		 * This legacy code path is broken: for the range mentioned
18382 		 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
18383 		 * two 4k pages, it yields [ memory_address = 0x1000,
18384 		 * size = 0x1000 ], which covers only the first 4k page.
18385 		 * BUT some code unfortunately depends on this bug, so we
18386 		 * can't fix it without breaking something.
18387 		 * New code should get automatically opted in the new
18388 		 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
18389 		 */
18390 		offset_in_mapping = 0;
18391 		memory_address = vm_map_trunc_page(memory_address, src_page_mask);
18392 		size = vm_map_round_page(size, src_page_mask);
18393 		initial_memory_address = memory_address;
18394 		initial_size = size;
18395 	}
18396 
18397 
18398 	if (size == 0) {
18399 		return KERN_INVALID_ARGUMENT;
18400 	}
18401 
18402 	if (flags & VM_FLAGS_RESILIENT_MEDIA) {
18403 		/* must be copy-on-write to be "media resilient" */
18404 		if (!copy) {
18405 			return KERN_INVALID_ARGUMENT;
18406 		}
18407 	}
18408 
18409 	vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
18410 	vmk_flags.vmkf_copy_same_map = (src_map == target_map);
18411 
18412 	assert(size != 0);
18413 	result = vm_map_copy_extract(src_map,
18414 	    memory_address,
18415 	    size,
18416 	    copy, &copy_map,
18417 	    cur_protection, /* IN/OUT */
18418 	    max_protection, /* IN/OUT */
18419 	    inheritance,
18420 	    vmk_flags);
18421 	if (result != KERN_SUCCESS) {
18422 		return result;
18423 	}
18424 	assert(copy_map != VM_MAP_COPY_NULL);
18425 
18426 	overmap_start = 0;
18427 	overmap_end = 0;
18428 	trimmed_start = 0;
18429 	target_size = size;
18430 	if (src_page_mask != target_page_mask) {
18431 		vm_map_copy_t target_copy_map;
18432 
18433 		target_copy_map = copy_map; /* can modify "copy_map" itself */
18434 		DEBUG4K_ADJUST("adjusting...\n");
18435 		result = vm_map_copy_adjust_to_target(
18436 			copy_map,
18437 			offset_in_mapping, /* offset */
18438 			initial_size,
18439 			target_map,
18440 			copy,
18441 			&target_copy_map,
18442 			&overmap_start,
18443 			&overmap_end,
18444 			&trimmed_start);
18445 		if (result != KERN_SUCCESS) {
18446 			DEBUG4K_COPY("failed to adjust 0x%x\n", result);
18447 			vm_map_copy_discard(copy_map);
18448 			return result;
18449 		}
18450 		if (trimmed_start == 0) {
18451 			/* nothing trimmed: no adjustment needed */
18452 		} else if (trimmed_start >= offset_in_mapping) {
18453 			/* trimmed more than offset_in_mapping: nothing left */
18454 			assert(overmap_start == 0);
18455 			assert(overmap_end == 0);
18456 			offset_in_mapping = 0;
18457 		} else {
18458 			/* trimmed some of offset_in_mapping: adjust */
18459 			assert(overmap_start == 0);
18460 			assert(overmap_end == 0);
18461 			offset_in_mapping -= trimmed_start;
18462 		}
18463 		offset_in_mapping += overmap_start;
18464 		target_size = target_copy_map->size;
18465 	}
18466 
18467 	/*
18468 	 * Allocate/check a range of free virtual address
18469 	 * space for the target
18470 	 */
18471 	*address = vm_map_trunc_page(*address, target_page_mask);
18472 	vm_map_lock(target_map);
18473 	target_size = vm_map_round_page(target_size, target_page_mask);
18474 	result = vm_map_remap_range_allocate(target_map, address,
18475 	    target_size, mask, flags, vmk_flags, tag,
18476 	    &insp_entry, &zap_list);
18477 
18478 	for (entry = vm_map_copy_first_entry(copy_map);
18479 	    entry != vm_map_copy_to_entry(copy_map);
18480 	    entry = new_entry) {
18481 		new_entry = entry->vme_next;
18482 		vm_map_copy_entry_unlink(copy_map, entry);
18483 		if (result == KERN_SUCCESS) {
18484 			if (vmk_flags.vmkf_remap_prot_copy) {
18485 				/*
18486 				 * This vm_map_remap() is for a
18487 				 * vm_protect(VM_PROT_COPY), so the caller
18488 				 * expects to be allowed to add write access
18489 				 * to this new mapping.  This is done by
18490 				 * adding VM_PROT_WRITE to each entry's
18491 				 * max_protection... unless some security
18492 				 * settings disallow it.
18493 				 */
18494 				bool allow_write = false;
18495 				if (entry->vme_permanent) {
18496 					/* immutable mapping... */
18497 					if ((entry->max_protection & VM_PROT_EXECUTE) &&
18498 					    developer_mode_state()) {
18499 						/*
18500 						 * ... but executable and
18501 						 * possibly being debugged,
18502 						 * so let's allow it to become
18503 						 * writable, for breakpoints
18504 						 * and dtrace probes, for
18505 						 * example.
18506 						 */
18507 						allow_write = true;
18508 					} else {
18509 						printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
18510 						    proc_selfpid(),
18511 						    (get_bsdtask_info(current_task())
18512 						    ? proc_name_address(get_bsdtask_info(current_task()))
18513 						    : "?"),
18514 						    (uint64_t)memory_address,
18515 						    (uint64_t)size,
18516 						    entry->protection,
18517 						    entry->max_protection,
18518 						    developer_mode_state());
18519 						DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
18520 						    vm_map_entry_t, entry,
18521 						    vm_map_offset_t, entry->vme_start,
18522 						    vm_map_offset_t, entry->vme_end,
18523 						    vm_prot_t, entry->protection,
18524 						    vm_prot_t, entry->max_protection,
18525 						    int, VME_ALIAS(entry));
18526 					}
18527 				} else {
18528 					allow_write = true;
18529 				}
18530 
18531 				/*
18532 				 * VM_PROT_COPY: allow this mapping to become
18533 				 * writable, unless it was "permanent".
18534 				 */
18535 				if (allow_write) {
18536 					entry->max_protection |= VM_PROT_WRITE;
18537 				}
18538 			}
18539 			if (flags & VM_FLAGS_RESILIENT_CODESIGN) {
18540 				/* no codesigning -> read-only access */
18541 				entry->max_protection = VM_PROT_READ;
18542 				entry->protection = VM_PROT_READ;
18543 				entry->vme_resilient_codesign = TRUE;
18544 			}
18545 			entry->vme_start += *address;
18546 			entry->vme_end += *address;
18547 			assert(!entry->map_aligned);
18548 			if ((flags & VM_FLAGS_RESILIENT_MEDIA) &&
18549 			    !entry->is_sub_map &&
18550 			    (VME_OBJECT(entry) == VM_OBJECT_NULL ||
18551 			    VME_OBJECT(entry)->internal)) {
18552 				entry->vme_resilient_media = TRUE;
18553 			}
18554 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
18555 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
18556 			assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
18557 			vm_map_store_entry_link(target_map, insp_entry, entry,
18558 			    vmk_flags);
18559 			insp_entry = entry;
18560 		} else {
18561 			if (!entry->is_sub_map) {
18562 				vm_object_deallocate(VME_OBJECT(entry));
18563 			} else {
18564 				vm_map_deallocate(VME_SUBMAP(entry));
18565 			}
18566 			vm_map_copy_entry_dispose(entry);
18567 		}
18568 	}
18569 
18570 	if (flags & VM_FLAGS_RESILIENT_CODESIGN) {
18571 		*cur_protection = VM_PROT_READ;
18572 		*max_protection = VM_PROT_READ;
18573 	}
18574 
18575 	if (result == KERN_SUCCESS) {
18576 		target_map->size += target_size;
18577 		SAVE_HINT_MAP_WRITE(target_map, insp_entry);
18578 
18579 	}
18580 	vm_map_unlock(target_map);
18581 
18582 	vm_map_zap_dispose(&zap_list);
18583 
18584 	if (result == KERN_SUCCESS && target_map->wiring_required) {
18585 		result = vm_map_wire_kernel(target_map, *address,
18586 		    *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
18587 		    TRUE);
18588 	}
18589 
18590 	/*
18591 	 * If requested, return the address of the data pointed to by the
18592 	 * request, rather than the base of the resulting page.
18593 	 */
18594 	if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) {
18595 		*address += offset_in_mapping;
18596 	}
18597 
18598 	if (src_page_mask != target_page_mask) {
18599 		DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx  result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result);
18600 	}
18601 	vm_map_copy_discard(copy_map);
18602 	copy_map = VM_MAP_COPY_NULL;
18603 
18604 	return result;
18605 }
18606 
18607 /*
18608  *	Routine:	vm_map_remap_range_allocate
18609  *
18610  *	Description:
18611  *		Allocate a range in the specified virtual address map.
18612  *		returns the address and the map entry just before the allocated
18613  *		range
18614  *
18615  *	Map must be locked.
18616  */
18617 
static kern_return_t
vm_map_remap_range_allocate(
	vm_map_t                map,
	vm_map_address_t        *address,       /* IN/OUT */
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	int                     flags,
	vm_map_kernel_flags_t   vmk_flags,
	__unused vm_tag_t       tag,
	vm_map_entry_t          *map_entry,     /* OUT */
	vm_map_zap_t            zap_list)
{
	vm_map_entry_t  entry;
	vm_map_offset_t start;
	kern_return_t   kr;

	start = *address;

	if (flags & VM_FLAGS_ANYWHERE) {
		/*
		 * Caller doesn't care where the range lands: let
		 * vm_map_locate_space() pick a spot (possibly randomized).
		 */
		if (flags & VM_FLAGS_RANDOM_ADDR) {
			vmk_flags.vmkf_random_address = true;
		}

		if (start) {
			/* override the target range if a hint has been provided */
			vmk_flags.vmkf_range_id = (map == kernel_map ?
			    kmem_addr_get_range(start, size) :
			    VM_MAP_REMAP_RANGE_ID(map, NULL, start, size));
		}

		kr = vm_map_locate_space(map, size, mask, vmk_flags,
		    &start, &entry);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		/* report the chosen address back to the caller */
		*address = start;
	} else {
		vm_map_entry_t  temp_entry;
		vm_map_offset_t end;

		/*
		 *	Verify that:
		 *		the address doesn't itself violate
		 *		the mask requirement.
		 */

		if ((start & mask) != 0) {
			return KERN_NO_SPACE;
		}


		/*
		 *	...	the address is within bounds
		 */

		end = start + size;

		if ((start < map->min_offset) ||
		    (end > map->max_offset) ||
		    (start >= end)) {
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * If we're asked to overwrite whatever was mapped in that
		 * range, first deallocate that range.
		 */
		if (flags & VM_FLAGS_OVERWRITE) {
			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;

			/*
			 * We use a "zap_list" to avoid having to unlock
			 * the "map" in vm_map_delete(), which would compromise
			 * the atomicity of the "deallocate" and then "remap"
			 * combination.
			 */
			remove_flags |= VM_MAP_REMOVE_NO_YIELD;

			if (vmk_flags.vmkf_overwrite_immutable) {
				/* caller explicitly allows replacing immutable mappings */
				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
			}
			if (vmk_flags.vmkf_remap_prot_copy) {
				remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
			}
			kr = vm_map_delete(map, start, end, remove_flags,
			    KMEM_GUARD_NONE, zap_list).kmr_return;
			if (kr != KERN_SUCCESS) {
				/* XXX FBDP restore zap_list? */
				return kr;
			}
		}

		/*
		 *	...	the starting address isn't allocated
		 */

		if (vm_map_lookup_entry(map, start, &temp_entry)) {
			return KERN_NO_SPACE;
		}

		entry = temp_entry;

		/*
		 *	...	the next region doesn't overlap the
		 *		end point.
		 */

		if ((entry->vme_next != vm_map_to_entry(map)) &&
		    (entry->vme_next->vme_start < end)) {
			return KERN_NO_SPACE;
		}
	}
	/* entry just before the free range; caller links the new entry after it */
	*map_entry = entry;
	return KERN_SUCCESS;
}
18733 
18734 /*
18735  *	vm_map_switch:
18736  *
18737  *	Set the address map for the current thread to the specified map
18738  */
18739 
18740 vm_map_t
vm_map_switch(vm_map_t map)18741 vm_map_switch(
18742 	vm_map_t        map)
18743 {
18744 	int             mycpu;
18745 	thread_t        thread = current_thread();
18746 	vm_map_t        oldmap = thread->map;
18747 
18748 	mp_disable_preemption();
18749 	mycpu = cpu_number();
18750 
18751 	/*
18752 	 *	Deactivate the current map and activate the requested map
18753 	 */
18754 	PMAP_SWITCH_USER(thread, map, mycpu);
18755 
18756 	mp_enable_preemption();
18757 	return oldmap;
18758 }
18759 
18760 
18761 /*
18762  *	Routine:	vm_map_write_user
18763  *
18764  *	Description:
18765  *		Copy out data from a kernel space into space in the
18766  *		destination map. The space must already exist in the
18767  *		destination map.
18768  *		NOTE:  This routine should only be called by threads
18769  *		which can block on a page fault. i.e. kernel mode user
18770  *		threads.
18771  *
18772  */
18773 kern_return_t
vm_map_write_user(vm_map_t map,void * src_p,vm_map_address_t dst_addr,vm_size_t size)18774 vm_map_write_user(
18775 	vm_map_t                map,
18776 	void                    *src_p,
18777 	vm_map_address_t        dst_addr,
18778 	vm_size_t               size)
18779 {
18780 	kern_return_t   kr = KERN_SUCCESS;
18781 
18782 	if (current_map() == map) {
18783 		if (copyout(src_p, dst_addr, size)) {
18784 			kr = KERN_INVALID_ADDRESS;
18785 		}
18786 	} else {
18787 		vm_map_t        oldmap;
18788 
18789 		/* take on the identity of the target map while doing */
18790 		/* the transfer */
18791 
18792 		vm_map_reference(map);
18793 		oldmap = vm_map_switch(map);
18794 		if (copyout(src_p, dst_addr, size)) {
18795 			kr = KERN_INVALID_ADDRESS;
18796 		}
18797 		vm_map_switch(oldmap);
18798 		vm_map_deallocate(map);
18799 	}
18800 	return kr;
18801 }
18802 
18803 /*
18804  *	Routine:	vm_map_read_user
18805  *
18806  *	Description:
18807  *		Copy in data from a user space source map into the
18808  *		kernel map. The space must already exist in the
18809  *		kernel map.
18810  *		NOTE:  This routine should only be called by threads
18811  *		which can block on a page fault. i.e. kernel mode user
18812  *		threads.
18813  *
18814  */
18815 kern_return_t
vm_map_read_user(vm_map_t map,vm_map_address_t src_addr,void * dst_p,vm_size_t size)18816 vm_map_read_user(
18817 	vm_map_t                map,
18818 	vm_map_address_t        src_addr,
18819 	void                    *dst_p,
18820 	vm_size_t               size)
18821 {
18822 	kern_return_t   kr = KERN_SUCCESS;
18823 
18824 	if (current_map() == map) {
18825 		if (copyin(src_addr, dst_p, size)) {
18826 			kr = KERN_INVALID_ADDRESS;
18827 		}
18828 	} else {
18829 		vm_map_t        oldmap;
18830 
18831 		/* take on the identity of the target map while doing */
18832 		/* the transfer */
18833 
18834 		vm_map_reference(map);
18835 		oldmap = vm_map_switch(map);
18836 		if (copyin(src_addr, dst_p, size)) {
18837 			kr = KERN_INVALID_ADDRESS;
18838 		}
18839 		vm_map_switch(oldmap);
18840 		vm_map_deallocate(map);
18841 	}
18842 	return kr;
18843 }
18844 
18845 
18846 /*
18847  *	vm_map_check_protection:
18848  *
18849  *	Assert that the target map allows the specified
18850  *	privilege on the entire address region given.
18851  *	The entire region must be allocated.
18852  */
18853 boolean_t
vm_map_check_protection(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t protection)18854 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
18855     vm_map_offset_t end, vm_prot_t protection)
18856 {
18857 	vm_map_entry_t entry;
18858 	vm_map_entry_t tmp_entry;
18859 
18860 	vm_map_lock(map);
18861 
18862 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
18863 		vm_map_unlock(map);
18864 		return FALSE;
18865 	}
18866 
18867 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
18868 		vm_map_unlock(map);
18869 		return FALSE;
18870 	}
18871 
18872 	entry = tmp_entry;
18873 
18874 	while (start < end) {
18875 		if (entry == vm_map_to_entry(map)) {
18876 			vm_map_unlock(map);
18877 			return FALSE;
18878 		}
18879 
18880 		/*
18881 		 *	No holes allowed!
18882 		 */
18883 
18884 		if (start < entry->vme_start) {
18885 			vm_map_unlock(map);
18886 			return FALSE;
18887 		}
18888 
18889 		/*
18890 		 * Check protection associated with entry.
18891 		 */
18892 
18893 		if ((entry->protection & protection) != protection) {
18894 			vm_map_unlock(map);
18895 			return FALSE;
18896 		}
18897 
18898 		/* go to next entry */
18899 
18900 		start = entry->vme_end;
18901 		entry = entry->vme_next;
18902 	}
18903 	vm_map_unlock(map);
18904 	return TRUE;
18905 }
18906 
/*
 * Get, set, or purge the purgeability state of the object mapped at
 * "address" in "map".  "state" is IN for the SET controls and OUT for
 * GET; see vm_object_purgable_control() for the actual state machine.
 */
kern_return_t
vm_map_purgable_control(
	vm_map_t                map,
	vm_map_offset_t         address,
	vm_purgable_t           control,
	int                     *state)
{
	vm_map_entry_t          entry;
	vm_object_t             object;
	kern_return_t           kr;
	boolean_t               was_nonvolatile;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlaying object.  Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control != VM_PURGABLE_SET_STATE &&
	    control != VM_PURGABLE_GET_STATE &&
	    control != VM_PURGABLE_PURGE_ALL &&
	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control == VM_PURGABLE_PURGE_ALL) {
		/* PURGE_ALL is global: no address lookup needed */
		vm_purgeable_object_purge_all();
		return KERN_SUCCESS;
	}

	/* for SET controls, reject undefined bits in the requested state */
	if ((control == VM_PURGABLE_SET_STATE ||
	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->protection & VM_PROT_WRITE) == 0 &&
	    control != VM_PURGABLE_GET_STATE) {
		/*
		 * Can't apply purgable controls to something you can't write.
		 */
		vm_map_unlock_read(map);
		return KERN_PROTECTION_FAILURE;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL ||
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Object must already be present and be purgeable.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/* take the object lock before dropping the map lock below */
	vm_object_lock(object);

#if 00
	if (VME_OFFSET(entry) != 0 ||
	    entry->vme_end - entry->vme_start != object->vo_size) {
		/*
		 * Can only apply purgable controls to the whole (existing)
		 * object at once.
		 */
		vm_map_unlock_read(map);
		vm_object_unlock(object);
		return KERN_INVALID_ARGUMENT;
	}
#endif

	assert(!entry->is_sub_map);
	assert(!entry->use_pmap); /* purgeable has its own accounting */

	/* "entry" must not be used past this point: map is unlocked */
	vm_map_unlock_read(map);

	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);

	kr = vm_object_purgable_control(object, control, state);

	if (was_nonvolatile &&
	    object->purgable != VM_PURGABLE_NONVOLATILE &&
	    map->pmap == kernel_pmap) {
#if DEBUG
		/* record who made the object volatile, for debugging */
		object->vo_purgeable_volatilizer = kernel_task;
#endif /* DEBUG */
	}

	vm_object_unlock(object);

	return kr;
}
19011 
/*
 * Compute the "footprint" disposition (VM_PAGE_QUERY_PAGE_* bits) of the
 * page at "curr_s_offset" within "map_entry", combining pmap-level state
 * with the various alternate-accounting schemes (owned purgeable objects,
 * IOKit accounting, ledger-tagged objects).  Writes the result through
 * "disposition_p" (0 means the page does not contribute to the footprint).
 *
 * Caller must hold the map lock; not usable on corpse-footprint maps
 * (those use vm_map_corpse_footprint_query_page_info() instead).
 */
void
vm_map_footprint_query_page_info(
	vm_map_t        map,
	vm_map_entry_t  map_entry,
	vm_map_offset_t curr_s_offset,
	int             *disposition_p)
{
	int             pmap_disp;
	vm_object_t     object = VM_OBJECT_NULL;
	int             disposition;
	int             effective_page_size;

	vm_map_lock_assert_held(map);
	assert(!map->has_corpse_footprint);
	assert(curr_s_offset >= map_entry->vme_start);
	assert(curr_s_offset < map_entry->vme_end);

	if (map_entry->is_sub_map) {
		if (!map_entry->use_pmap) {
			/* nested pmap: no footprint */
			*disposition_p = 0;
			return;
		}
	} else {
		object = VME_OBJECT(map_entry);
		if (object == VM_OBJECT_NULL) {
			/* nothing mapped here: no need to ask */
			*disposition_p = 0;
			return;
		}
	}

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

	pmap_disp = 0;

	/*
	 * Query the pmap.
	 */
	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);

	/*
	 * Compute this page's disposition.
	 */
	disposition = 0;

	/* deal with "alternate accounting" first */
	if (!map_entry->is_sub_map &&
	    object->vo_no_footprint) {
		/* does not count in footprint */
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
	} else if (!map_entry->is_sub_map &&
	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
	    (object->purgable == VM_PURGABLE_DENY &&
	    object->vo_ledger_tag)) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		/* page index within the object vs. resident+compressed count */
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    (object->resident_page_count +
		    vm_compressor_pager_get_count(object->pager)))) {
			/*
			 * Non-volatile purgeable object owned
			 * by this task: report the first
			 * "#resident + #compressed" pages as
			 * "resident" (to show that they
			 * contribute to the footprint) but not
			 * "dirty" (to avoid double-counting
			 * with the fake "non-volatile" region
			 * we'll report at the end of the
			 * address space to account for all
			 * (mapped or not) non-volatile memory
			 * owned by this task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		/* page index within the object vs. wired count */
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    object->wired_page_count)) {
			/*
			 * Volatile|empty purgeable object owned
			 * by this task: report the first
			 * "#wired" pages as "resident" (to
			 * show that they contribute to the
			 * footprint) but not "dirty" (to avoid
			 * double-counting with the fake
			 * "non-volatile" region we'll report
			 * at the end of the address space to
			 * account for all (mapped or not)
			 * non-volatile memory owned by this
			 * task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    map_entry->iokit_acct &&
	    object->internal &&
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Non-purgeable IOKit memory: phys_footprint
		 * includes the entire virtual mapping.
		 */
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
		/* alternate accounting */
#if __arm64__ && (DEVELOPMENT || DEBUG)
		if (map->pmap->footprint_was_suspended) {
			/*
			 * The assertion below can fail if dyld
			 * suspended footprint accounting
			 * while doing some adjustments to
			 * this page;  the mapping would say
			 * "use pmap accounting" but the page
			 * would be marked "alternate
			 * accounting".
			 */
		} else
#endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
		{
			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		}
		disposition = 0;
	} else {
		/* regular pmap accounting: translate pmap bits to query bits */
		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
			disposition |= VM_PAGE_QUERY_PAGE_REF;
			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
			} else {
				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
			}
			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
			}
		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
		}
	}

	*disposition_p = disposition;
}
19169 
19170 kern_return_t
vm_map_page_query_internal(vm_map_t target_map,vm_map_offset_t offset,int * disposition,int * ref_count)19171 vm_map_page_query_internal(
19172 	vm_map_t        target_map,
19173 	vm_map_offset_t offset,
19174 	int             *disposition,
19175 	int             *ref_count)
19176 {
19177 	kern_return_t                   kr;
19178 	vm_page_info_basic_data_t       info;
19179 	mach_msg_type_number_t          count;
19180 
19181 	count = VM_PAGE_INFO_BASIC_COUNT;
19182 	kr = vm_map_page_info(target_map,
19183 	    offset,
19184 	    VM_PAGE_INFO_BASIC,
19185 	    (vm_page_info_t) &info,
19186 	    &count);
19187 	if (kr == KERN_SUCCESS) {
19188 		*disposition = info.disposition;
19189 		*ref_count = info.ref_count;
19190 	} else {
19191 		*disposition = 0;
19192 		*ref_count = 0;
19193 	}
19194 
19195 	return kr;
19196 }
19197 
19198 kern_return_t
vm_map_page_info(vm_map_t map,vm_map_offset_t offset,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)19199 vm_map_page_info(
19200 	vm_map_t                map,
19201 	vm_map_offset_t         offset,
19202 	vm_page_info_flavor_t   flavor,
19203 	vm_page_info_t          info,
19204 	mach_msg_type_number_t  *count)
19205 {
19206 	return vm_map_page_range_info_internal(map,
19207 	           offset, /* start of range */
19208 	           (offset + 1), /* this will get rounded in the call to the page boundary */
19209 	           (int)-1, /* effective_page_shift: unspecified */
19210 	           flavor,
19211 	           info,
19212 	           count);
19213 }
19214 
19215 kern_return_t
vm_map_page_range_info_internal(vm_map_t map,vm_map_offset_t start_offset,vm_map_offset_t end_offset,int effective_page_shift,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)19216 vm_map_page_range_info_internal(
19217 	vm_map_t                map,
19218 	vm_map_offset_t         start_offset,
19219 	vm_map_offset_t         end_offset,
19220 	int                     effective_page_shift,
19221 	vm_page_info_flavor_t   flavor,
19222 	vm_page_info_t          info,
19223 	mach_msg_type_number_t  *count)
19224 {
19225 	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
19226 	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
19227 	vm_page_t               m = VM_PAGE_NULL;
19228 	kern_return_t           retval = KERN_SUCCESS;
19229 	int                     disposition = 0;
19230 	int                     ref_count = 0;
19231 	int                     depth = 0, info_idx = 0;
19232 	vm_page_info_basic_t    basic_info = 0;
19233 	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
19234 	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
19235 	boolean_t               do_region_footprint;
19236 	ledger_amount_t         ledger_resident, ledger_compressed;
19237 	int                     effective_page_size;
19238 	vm_map_offset_t         effective_page_mask;
19239 
19240 	switch (flavor) {
19241 	case VM_PAGE_INFO_BASIC:
19242 		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
19243 			/*
19244 			 * The "vm_page_info_basic_data" structure was not
19245 			 * properly padded, so allow the size to be off by
19246 			 * one to maintain backwards binary compatibility...
19247 			 */
19248 			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
19249 				return KERN_INVALID_ARGUMENT;
19250 			}
19251 		}
19252 		break;
19253 	default:
19254 		return KERN_INVALID_ARGUMENT;
19255 	}
19256 
19257 	if (effective_page_shift == -1) {
19258 		effective_page_shift = vm_self_region_page_shift_safely(map);
19259 		if (effective_page_shift == -1) {
19260 			return KERN_INVALID_ARGUMENT;
19261 		}
19262 	}
19263 	effective_page_size = (1 << effective_page_shift);
19264 	effective_page_mask = effective_page_size - 1;
19265 
19266 	do_region_footprint = task_self_region_footprint();
19267 	disposition = 0;
19268 	ref_count = 0;
19269 	depth = 0;
19270 	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
19271 	retval = KERN_SUCCESS;
19272 
19273 	offset_in_page = start_offset & effective_page_mask;
19274 	start = vm_map_trunc_page(start_offset, effective_page_mask);
19275 	end = vm_map_round_page(end_offset, effective_page_mask);
19276 
19277 	if (end < start) {
19278 		return KERN_INVALID_ARGUMENT;
19279 	}
19280 
19281 	assert((end - start) <= MAX_PAGE_RANGE_QUERY);
19282 
19283 	vm_map_lock_read(map);
19284 
19285 	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
19286 
19287 	for (curr_s_offset = start; curr_s_offset < end;) {
19288 		/*
19289 		 * New lookup needs reset of these variables.
19290 		 */
19291 		curr_object = object = VM_OBJECT_NULL;
19292 		offset_in_object = 0;
19293 		ref_count = 0;
19294 		depth = 0;
19295 
19296 		if (do_region_footprint &&
19297 		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
19298 			/*
19299 			 * Request for "footprint" info about a page beyond
19300 			 * the end of address space: this must be for
19301 			 * the fake region vm_map_region_recurse_64()
19302 			 * reported to account for non-volatile purgeable
19303 			 * memory owned by this task.
19304 			 */
19305 			disposition = 0;
19306 
19307 			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
19308 			    (unsigned) ledger_compressed) {
19309 				/*
19310 				 * We haven't reported all the "non-volatile
19311 				 * compressed" pages yet, so report this fake
19312 				 * page as "compressed".
19313 				 */
19314 				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19315 			} else {
19316 				/*
19317 				 * We've reported all the non-volatile
19318 				 * compressed page but not all the non-volatile
19319 				 * pages , so report this fake page as
19320 				 * "resident dirty".
19321 				 */
19322 				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19323 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19324 				disposition |= VM_PAGE_QUERY_PAGE_REF;
19325 			}
19326 			switch (flavor) {
19327 			case VM_PAGE_INFO_BASIC:
19328 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19329 				basic_info->disposition = disposition;
19330 				basic_info->ref_count = 1;
19331 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
19332 				basic_info->offset = 0;
19333 				basic_info->depth = 0;
19334 
19335 				info_idx++;
19336 				break;
19337 			}
19338 			curr_s_offset += effective_page_size;
19339 			continue;
19340 		}
19341 
19342 		/*
19343 		 * First, find the map entry covering "curr_s_offset", going down
19344 		 * submaps if necessary.
19345 		 */
19346 		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
19347 			/* no entry -> no object -> no page */
19348 
19349 			if (curr_s_offset < vm_map_min(map)) {
19350 				/*
19351 				 * Illegal address that falls below map min.
19352 				 */
19353 				curr_e_offset = MIN(end, vm_map_min(map));
19354 			} else if (curr_s_offset >= vm_map_max(map)) {
19355 				/*
19356 				 * Illegal address that falls on/after map max.
19357 				 */
19358 				curr_e_offset = end;
19359 			} else if (map_entry == vm_map_to_entry(map)) {
19360 				/*
19361 				 * Hit a hole.
19362 				 */
19363 				if (map_entry->vme_next == vm_map_to_entry(map)) {
19364 					/*
19365 					 * Empty map.
19366 					 */
19367 					curr_e_offset = MIN(map->max_offset, end);
19368 				} else {
19369 					/*
19370 					 * Hole at start of the map.
19371 					 */
19372 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
19373 				}
19374 			} else {
19375 				if (map_entry->vme_next == vm_map_to_entry(map)) {
19376 					/*
19377 					 * Hole at the end of the map.
19378 					 */
19379 					curr_e_offset = MIN(map->max_offset, end);
19380 				} else {
19381 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
19382 				}
19383 			}
19384 
19385 			assert(curr_e_offset >= curr_s_offset);
19386 
19387 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
19388 
19389 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19390 
19391 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
19392 
19393 			curr_s_offset = curr_e_offset;
19394 
19395 			info_idx += num_pages;
19396 
19397 			continue;
19398 		}
19399 
19400 		/* compute offset from this map entry's start */
19401 		offset_in_object = curr_s_offset - map_entry->vme_start;
19402 
19403 		/* compute offset into this map entry's object (or submap) */
19404 		offset_in_object += VME_OFFSET(map_entry);
19405 
19406 		if (map_entry->is_sub_map) {
19407 			vm_map_t sub_map = VM_MAP_NULL;
19408 			vm_page_info_t submap_info = 0;
19409 			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
19410 
19411 			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
19412 
19413 			submap_s_offset = offset_in_object;
19414 			submap_e_offset = submap_s_offset + range_len;
19415 
19416 			sub_map = VME_SUBMAP(map_entry);
19417 
19418 			vm_map_reference(sub_map);
19419 			vm_map_unlock_read(map);
19420 
19421 			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19422 
19423 			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
19424 			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
19425 
19426 			retval = vm_map_page_range_info_internal(sub_map,
19427 			    submap_s_offset,
19428 			    submap_e_offset,
19429 			    effective_page_shift,
19430 			    VM_PAGE_INFO_BASIC,
19431 			    (vm_page_info_t) submap_info,
19432 			    count);
19433 
19434 			assert(retval == KERN_SUCCESS);
19435 
19436 			vm_map_lock_read(map);
19437 			vm_map_deallocate(sub_map);
19438 
19439 			/* Move the "info" index by the number of pages we inspected.*/
19440 			info_idx += range_len >> effective_page_shift;
19441 
19442 			/* Move our current offset by the size of the range we inspected.*/
19443 			curr_s_offset += range_len;
19444 
19445 			continue;
19446 		}
19447 
19448 		object = VME_OBJECT(map_entry);
19449 
19450 		if (object == VM_OBJECT_NULL) {
19451 			/*
19452 			 * We don't have an object here and, hence,
19453 			 * no pages to inspect. We'll fill up the
19454 			 * info structure appropriately.
19455 			 */
19456 
19457 			curr_e_offset = MIN(map_entry->vme_end, end);
19458 
19459 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
19460 
19461 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19462 
19463 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
19464 
19465 			curr_s_offset = curr_e_offset;
19466 
19467 			info_idx += num_pages;
19468 
19469 			continue;
19470 		}
19471 
19472 		if (do_region_footprint) {
19473 			disposition = 0;
19474 			if (map->has_corpse_footprint) {
19475 				/*
19476 				 * Query the page info data we saved
19477 				 * while forking the corpse.
19478 				 */
19479 				vm_map_corpse_footprint_query_page_info(
19480 					map,
19481 					curr_s_offset,
19482 					&disposition);
19483 			} else {
19484 				/*
19485 				 * Query the live pmap for footprint info
19486 				 * about this page.
19487 				 */
19488 				vm_map_footprint_query_page_info(
19489 					map,
19490 					map_entry,
19491 					curr_s_offset,
19492 					&disposition);
19493 			}
19494 			switch (flavor) {
19495 			case VM_PAGE_INFO_BASIC:
19496 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19497 				basic_info->disposition = disposition;
19498 				basic_info->ref_count = 1;
19499 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
19500 				basic_info->offset = 0;
19501 				basic_info->depth = 0;
19502 
19503 				info_idx++;
19504 				break;
19505 			}
19506 			curr_s_offset += effective_page_size;
19507 			continue;
19508 		}
19509 
19510 		vm_object_reference(object);
19511 		/*
19512 		 * Shared mode -- so we can allow other readers
19513 		 * to grab the lock too.
19514 		 */
19515 		vm_object_lock_shared(object);
19516 
19517 		curr_e_offset = MIN(map_entry->vme_end, end);
19518 
19519 		vm_map_unlock_read(map);
19520 
19521 		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
19522 
19523 		curr_object = object;
19524 
19525 		for (; curr_s_offset < curr_e_offset;) {
19526 			if (object == curr_object) {
19527 				ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
19528 			} else {
19529 				ref_count = curr_object->ref_count;
19530 			}
19531 
19532 			curr_offset_in_object = offset_in_object;
19533 
19534 			for (;;) {
19535 				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
19536 
19537 				if (m != VM_PAGE_NULL) {
19538 					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19539 					break;
19540 				} else {
19541 					if (curr_object->internal &&
19542 					    curr_object->alive &&
19543 					    !curr_object->terminating &&
19544 					    curr_object->pager_ready) {
19545 						if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object))
19546 						    == VM_EXTERNAL_STATE_EXISTS) {
19547 							/* the pager has that page */
19548 							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19549 							break;
19550 						}
19551 					}
19552 
19553 					/*
19554 					 * Go down the VM object shadow chain until we find the page
19555 					 * we're looking for.
19556 					 */
19557 
19558 					if (curr_object->shadow != VM_OBJECT_NULL) {
19559 						vm_object_t shadow = VM_OBJECT_NULL;
19560 
19561 						curr_offset_in_object += curr_object->vo_shadow_offset;
19562 						shadow = curr_object->shadow;
19563 
19564 						vm_object_lock_shared(shadow);
19565 						vm_object_unlock(curr_object);
19566 
19567 						curr_object = shadow;
19568 						depth++;
19569 						continue;
19570 					} else {
19571 						break;
19572 					}
19573 				}
19574 			}
19575 
19576 			/* The ref_count is not strictly accurate, it measures the number   */
19577 			/* of entities holding a ref on the object, they may not be mapping */
19578 			/* the object or may not be mapping the section holding the         */
19579 			/* target page but its still a ball park number and though an over- */
19580 			/* count, it picks up the copy-on-write cases                       */
19581 
19582 			/* We could also get a picture of page sharing from pmap_attributes */
19583 			/* but this would under count as only faulted-in mappings would     */
19584 			/* show up.							    */
19585 
19586 			if ((curr_object == object) && curr_object->shadow) {
19587 				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
19588 			}
19589 
19590 			if (!curr_object->internal) {
19591 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
19592 			}
19593 
19594 			if (m != VM_PAGE_NULL) {
19595 				if (m->vmp_fictitious) {
19596 					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
19597 				} else {
19598 					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
19599 						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19600 					}
19601 
19602 					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
19603 						disposition |= VM_PAGE_QUERY_PAGE_REF;
19604 					}
19605 
19606 					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
19607 						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
19608 					}
19609 
19610 					/*
19611 					 * XXX TODO4K:
19612 					 * when this routine deals with 4k
19613 					 * pages, check the appropriate CS bit
19614 					 * here.
19615 					 */
19616 					if (m->vmp_cs_validated) {
19617 						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
19618 					}
19619 					if (m->vmp_cs_tainted) {
19620 						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
19621 					}
19622 					if (m->vmp_cs_nx) {
19623 						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
19624 					}
19625 					if (m->vmp_reusable || curr_object->all_reusable) {
19626 						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
19627 					}
19628 				}
19629 			}
19630 
19631 			switch (flavor) {
19632 			case VM_PAGE_INFO_BASIC:
19633 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19634 				basic_info->disposition = disposition;
19635 				basic_info->ref_count = ref_count;
19636 				basic_info->object_id = (vm_object_id_t) (uintptr_t)
19637 				    VM_KERNEL_ADDRPERM(curr_object);
19638 				basic_info->offset =
19639 				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
19640 				basic_info->depth = depth;
19641 
19642 				info_idx++;
19643 				break;
19644 			}
19645 
19646 			disposition = 0;
19647 			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
19648 
19649 			/*
19650 			 * Move to next offset in the range and in our object.
19651 			 */
19652 			curr_s_offset += effective_page_size;
19653 			offset_in_object += effective_page_size;
19654 			curr_offset_in_object = offset_in_object;
19655 
19656 			if (curr_object != object) {
19657 				vm_object_unlock(curr_object);
19658 
19659 				curr_object = object;
19660 
19661 				vm_object_lock_shared(curr_object);
19662 			} else {
19663 				vm_object_lock_yield_shared(curr_object);
19664 			}
19665 		}
19666 
19667 		vm_object_unlock(curr_object);
19668 		vm_object_deallocate(curr_object);
19669 
19670 		vm_map_lock_read(map);
19671 	}
19672 
19673 	vm_map_unlock_read(map);
19674 	return retval;
19675 }
19676 
19677 /*
19678  *	vm_map_msync
19679  *
19680  *	Synchronises the memory range specified with its backing store
19681  *	image by either flushing or cleaning the contents to the appropriate
19682  *	memory manager engaging in a memory object synchronize dialog with
19683  *	the manager.  The client doesn't return until the manager issues
19684  *	m_o_s_completed message.  MIG Magically converts user task parameter
19685  *	to the task's address map.
19686  *
19687  *	interpretation of sync_flags
19688  *	VM_SYNC_INVALIDATE	- discard pages, only return precious
19689  *				  pages to manager.
19690  *
19691  *	VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
19692  *				- discard pages, write dirty or precious
19693  *				  pages back to memory manager.
19694  *
19695  *	VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
19696  *				- write dirty or precious pages back to
19697  *				  the memory manager.
19698  *
19699  *	VM_SYNC_CONTIGUOUS	- does everything normally, but if there
19700  *				  is a hole in the region, and we would
19701  *				  have returned KERN_SUCCESS, return
19702  *				  KERN_INVALID_ADDRESS instead.
19703  *
19704  *	NOTE
19705  *	The memory object attributes have not yet been implemented, this
19706  *	function will have to deal with the invalidate attribute
19707  *
19708  *	RETURNS
19709  *	KERN_INVALID_TASK		Bad task parameter
19710  *	KERN_INVALID_ARGUMENT		both sync and async were specified.
19711  *	KERN_SUCCESS			The usual.
19712  *	KERN_INVALID_ADDRESS		There was a hole in the region.
19713  */
19714 
19715 kern_return_t
vm_map_msync(vm_map_t map,vm_map_address_t address,vm_map_size_t size,vm_sync_t sync_flags)19716 vm_map_msync(
19717 	vm_map_t                map,
19718 	vm_map_address_t        address,
19719 	vm_map_size_t           size,
19720 	vm_sync_t               sync_flags)
19721 {
19722 	vm_map_entry_t          entry;
19723 	vm_map_size_t           amount_left;
19724 	vm_object_offset_t      offset;
19725 	vm_object_offset_t      start_offset, end_offset;
19726 	boolean_t               do_sync_req;
19727 	boolean_t               had_hole = FALSE;
19728 	vm_map_offset_t         pmap_offset;
19729 
19730 	if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
19731 	    (sync_flags & VM_SYNC_SYNCHRONOUS)) {
19732 		return KERN_INVALID_ARGUMENT;
19733 	}
19734 
19735 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19736 		DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
19737 	}
19738 
19739 	/*
19740 	 * align address and size on page boundaries
19741 	 */
19742 	size = (vm_map_round_page(address + size,
19743 	    VM_MAP_PAGE_MASK(map)) -
19744 	    vm_map_trunc_page(address,
19745 	    VM_MAP_PAGE_MASK(map)));
19746 	address = vm_map_trunc_page(address,
19747 	    VM_MAP_PAGE_MASK(map));
19748 
19749 	if (map == VM_MAP_NULL) {
19750 		return KERN_INVALID_TASK;
19751 	}
19752 
19753 	if (size == 0) {
19754 		return KERN_SUCCESS;
19755 	}
19756 
19757 	amount_left = size;
19758 
19759 	while (amount_left > 0) {
19760 		vm_object_size_t        flush_size;
19761 		vm_object_t             object;
19762 
19763 		vm_map_lock(map);
19764 		if (!vm_map_lookup_entry(map,
19765 		    address,
19766 		    &entry)) {
19767 			vm_map_size_t   skip;
19768 
19769 			/*
19770 			 * hole in the address map.
19771 			 */
19772 			had_hole = TRUE;
19773 
19774 			if (sync_flags & VM_SYNC_KILLPAGES) {
19775 				/*
19776 				 * For VM_SYNC_KILLPAGES, there should be
19777 				 * no holes in the range, since we couldn't
19778 				 * prevent someone else from allocating in
19779 				 * that hole and we wouldn't want to "kill"
19780 				 * their pages.
19781 				 */
19782 				vm_map_unlock(map);
19783 				break;
19784 			}
19785 
19786 			/*
19787 			 * Check for empty map.
19788 			 */
19789 			if (entry == vm_map_to_entry(map) &&
19790 			    entry->vme_next == entry) {
19791 				vm_map_unlock(map);
19792 				break;
19793 			}
19794 			/*
19795 			 * Check that we don't wrap and that
19796 			 * we have at least one real map entry.
19797 			 */
19798 			if ((map->hdr.nentries == 0) ||
19799 			    (entry->vme_next->vme_start < address)) {
19800 				vm_map_unlock(map);
19801 				break;
19802 			}
19803 			/*
19804 			 * Move up to the next entry if needed
19805 			 */
19806 			skip = (entry->vme_next->vme_start - address);
19807 			if (skip >= amount_left) {
19808 				amount_left = 0;
19809 			} else {
19810 				amount_left -= skip;
19811 			}
19812 			address = entry->vme_next->vme_start;
19813 			vm_map_unlock(map);
19814 			continue;
19815 		}
19816 
19817 		offset = address - entry->vme_start;
19818 		pmap_offset = address;
19819 
19820 		/*
19821 		 * do we have more to flush than is contained in this
19822 		 * entry ?
19823 		 */
19824 		if (amount_left + entry->vme_start + offset > entry->vme_end) {
19825 			flush_size = entry->vme_end -
19826 			    (entry->vme_start + offset);
19827 		} else {
19828 			flush_size = amount_left;
19829 		}
19830 		amount_left -= flush_size;
19831 		address += flush_size;
19832 
19833 		if (entry->is_sub_map == TRUE) {
19834 			vm_map_t        local_map;
19835 			vm_map_offset_t local_offset;
19836 
19837 			local_map = VME_SUBMAP(entry);
19838 			local_offset = VME_OFFSET(entry);
19839 			vm_map_reference(local_map);
19840 			vm_map_unlock(map);
19841 			if (vm_map_msync(
19842 				    local_map,
19843 				    local_offset,
19844 				    flush_size,
19845 				    sync_flags) == KERN_INVALID_ADDRESS) {
19846 				had_hole = TRUE;
19847 			}
19848 			vm_map_deallocate(local_map);
19849 			continue;
19850 		}
19851 		object = VME_OBJECT(entry);
19852 
19853 		/*
19854 		 * We can't sync this object if the object has not been
19855 		 * created yet
19856 		 */
19857 		if (object == VM_OBJECT_NULL) {
19858 			vm_map_unlock(map);
19859 			continue;
19860 		}
19861 		offset += VME_OFFSET(entry);
19862 
19863 		vm_object_lock(object);
19864 
19865 		if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
19866 			int kill_pages = 0;
19867 			boolean_t reusable_pages = FALSE;
19868 
19869 			if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19870 				/*
19871 				 * This is a destructive operation and so we
19872 				 * err on the side of limiting the range of
19873 				 * the operation.
19874 				 */
19875 				start_offset = vm_object_round_page(offset);
19876 				end_offset = vm_object_trunc_page(offset + flush_size);
19877 
19878 				if (end_offset <= start_offset) {
19879 					vm_object_unlock(object);
19880 					vm_map_unlock(map);
19881 					continue;
19882 				}
19883 
19884 				pmap_offset += start_offset - offset;
19885 			} else {
19886 				start_offset = offset;
19887 				end_offset = offset + flush_size;
19888 			}
19889 
19890 			if (sync_flags & VM_SYNC_KILLPAGES) {
19891 				if (((object->ref_count == 1) ||
19892 				    ((object->copy_strategy !=
19893 				    MEMORY_OBJECT_COPY_SYMMETRIC) &&
19894 				    (object->copy == VM_OBJECT_NULL))) &&
19895 				    (object->shadow == VM_OBJECT_NULL)) {
19896 					if (object->ref_count != 1) {
19897 						vm_page_stats_reusable.free_shared++;
19898 					}
19899 					kill_pages = 1;
19900 				} else {
19901 					kill_pages = -1;
19902 				}
19903 			}
19904 			if (kill_pages != -1) {
19905 				vm_object_deactivate_pages(
19906 					object,
19907 					start_offset,
19908 					(vm_object_size_t) (end_offset - start_offset),
19909 					kill_pages,
19910 					reusable_pages,
19911 					map->pmap,
19912 					pmap_offset);
19913 			}
19914 			vm_object_unlock(object);
19915 			vm_map_unlock(map);
19916 			continue;
19917 		}
19918 		/*
19919 		 * We can't sync this object if there isn't a pager.
19920 		 * Don't bother to sync internal objects, since there can't
19921 		 * be any "permanent" storage for these objects anyway.
19922 		 */
19923 		if ((object->pager == MEMORY_OBJECT_NULL) ||
19924 		    (object->internal) || (object->private)) {
19925 			vm_object_unlock(object);
19926 			vm_map_unlock(map);
19927 			continue;
19928 		}
19929 		/*
19930 		 * keep reference on the object until syncing is done
19931 		 */
19932 		vm_object_reference_locked(object);
19933 		vm_object_unlock(object);
19934 
19935 		vm_map_unlock(map);
19936 
19937 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19938 			start_offset = vm_object_trunc_page(offset);
19939 			end_offset = vm_object_round_page(offset + flush_size);
19940 		} else {
19941 			start_offset = offset;
19942 			end_offset = offset + flush_size;
19943 		}
19944 
19945 		do_sync_req = vm_object_sync(object,
19946 		    start_offset,
19947 		    (end_offset - start_offset),
19948 		    sync_flags & VM_SYNC_INVALIDATE,
19949 		    ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
19950 		    (sync_flags & VM_SYNC_ASYNCHRONOUS)),
19951 		    sync_flags & VM_SYNC_SYNCHRONOUS);
19952 
19953 		if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
19954 			/*
19955 			 * clear out the clustering and read-ahead hints
19956 			 */
19957 			vm_object_lock(object);
19958 
19959 			object->pages_created = 0;
19960 			object->pages_used = 0;
19961 			object->sequential = 0;
19962 			object->last_alloc = 0;
19963 
19964 			vm_object_unlock(object);
19965 		}
19966 		vm_object_deallocate(object);
19967 	} /* while */
19968 
19969 	/* for proper msync() behaviour */
19970 	if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
19971 		return KERN_INVALID_ADDRESS;
19972 	}
19973 
19974 	return KERN_SUCCESS;
19975 }/* vm_msync */
19976 
/*
 * Back a not-yet-backed named entry with the given VM object.
 * The object is wrapped in a single-entry vm_map_copy_t which is
 * hung off the named entry; "offset" and "size" need not be
 * page-aligned (the copy entry spans the enclosing page range).
 * Consumes nothing; the named entry takes over the copy.
 */
void
vm_named_entry_associate_vm_object(
	vm_named_entry_t        named_entry,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_object_size_t        size,
	vm_prot_t               prot)
{
	vm_map_copy_t copy;
	vm_map_entry_t copy_entry;

	/* the named entry must not already be backed by anything */
	assert(!named_entry->is_sub_map);
	assert(!named_entry->is_copy);
	assert(!named_entry->is_object);
	assert(!named_entry->internal);
	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);

	copy = vm_map_copy_allocate();
	copy->type = VM_MAP_COPY_ENTRY_LIST;
	copy->offset = offset;
	copy->size = size;
	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
	vm_map_store_init(&copy->cpy_hdr);

	/* one entry covering the page-rounded [offset, offset+size) range */
	copy_entry = vm_map_copy_entry_create(copy);
	copy_entry->protection = prot;
	copy_entry->max_protection = prot;
	copy_entry->use_pmap = TRUE;
	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
	VME_OBJECT_SET(copy_entry, object, false, 0);
	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);

	named_entry->backing.copy = copy;
	named_entry->is_object = TRUE;
	if (object->internal) {
		/* reflect the object's internal-ness on the named entry */
		named_entry->internal = TRUE;
	}

	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
	    named_entry, copy, object, offset, size, prot);
}
20020 
/*
 * Return the VM object backing an object-backed named entry.
 * The named entry must be backed by a (validated) single-entry
 * vm_map_copy_t; no reference is taken on the returned object.
 */
vm_object_t
vm_named_entry_to_vm_object(
	vm_named_entry_t named_entry)
{
	vm_map_copy_t   copy;
	vm_map_entry_t  copy_entry;
	vm_object_t     object;

	assert(!named_entry->is_sub_map);
	assert(!named_entry->is_copy);
	assert(named_entry->is_object);
	copy = named_entry->backing.copy;
	assert(copy != VM_MAP_COPY_NULL);
	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);
	assert(copy->cpy_hdr.nentries == 1);
	copy_entry = vm_map_copy_first_entry(copy);
	object = VME_OBJECT(copy_entry);

	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);

	return object;
}
20047 
20048 /*
20049  *	Routine:	convert_port_entry_to_map
20050  *	Purpose:
20051  *		Convert from a port specifying an entry or a task
20052  *		to a map. Doesn't consume the port ref; produces a map ref,
20053  *		which may be null.  Unlike convert_port_to_map, the
20054  *		port may be task or a named entry backed.
20055  *	Conditions:
20056  *		Nothing locked.
20057  */
20058 
20059 vm_map_t
convert_port_entry_to_map(ipc_port_t port)20060 convert_port_entry_to_map(
20061 	ipc_port_t      port)
20062 {
20063 	vm_map_t map = VM_MAP_NULL;
20064 	vm_named_entry_t named_entry;
20065 
20066 	if (!IP_VALID(port)) {
20067 		return VM_MAP_NULL;
20068 	}
20069 
20070 	if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
20071 		return convert_port_to_map(port);
20072 	}
20073 
20074 	named_entry = mach_memory_entry_from_port(port);
20075 
20076 	if ((named_entry->is_sub_map) &&
20077 	    (named_entry->protection & VM_PROT_WRITE)) {
20078 		map = named_entry->backing.map;
20079 		if (map->pmap != PMAP_NULL) {
20080 			if (map->pmap == kernel_pmap) {
20081 				panic("userspace has access "
20082 				    "to a kernel map %p", map);
20083 			}
20084 			pmap_require(map->pmap);
20085 		}
20086 		vm_map_reference(map);
20087 	}
20088 
20089 	return map;
20090 }
20091 
20092 /*
20093  * Export routines to other components for the things we access locally through
20094  * macros.
20095  */
20096 #undef current_map
/*
 * Out-of-line version of the current_map() macro (undefined just
 * above) for external components; forwards to the fast inline.
 */
vm_map_t
current_map(void)
{
	return current_map_fast();
}
20102 
20103 /*
20104  *	vm_map_reference:
20105  *
20106  *	Takes a reference on the specified map.
20107  */
20108 void
vm_map_reference(vm_map_t map)20109 vm_map_reference(
20110 	vm_map_t        map)
20111 {
20112 	if (__probable(map != VM_MAP_NULL)) {
20113 		vm_map_require(map);
20114 		os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
20115 	}
20116 }
20117 
20118 /*
20119  *	vm_map_deallocate:
20120  *
20121  *	Removes a reference from the specified map,
20122  *	destroying it if no references remain.
20123  *	The map should not be locked.
20124  */
20125 void
vm_map_deallocate(vm_map_t map)20126 vm_map_deallocate(
20127 	vm_map_t        map)
20128 {
20129 	if (__probable(map != VM_MAP_NULL)) {
20130 		vm_map_require(map);
20131 		if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
20132 			vm_map_destroy(map);
20133 		}
20134 	}
20135 }
20136 
/*
 * Release a reference held through an inspect-only map handle;
 * simply forwards to vm_map_deallocate().
 */
void
vm_map_inspect_deallocate(
	vm_map_inspect_t      map)
{
	vm_map_deallocate((vm_map_t)map);
}
20143 
/*
 * Release a reference held through a read-only map handle;
 * simply forwards to vm_map_deallocate().
 */
void
vm_map_read_deallocate(
	vm_map_read_t      map)
{
	vm_map_deallocate((vm_map_t)map);
}
20150 
20151 
20152 void
vm_map_disable_NX(vm_map_t map)20153 vm_map_disable_NX(vm_map_t map)
20154 {
20155 	if (map == NULL) {
20156 		return;
20157 	}
20158 	if (map->pmap == NULL) {
20159 		return;
20160 	}
20161 
20162 	pmap_disable_NX(map->pmap);
20163 }
20164 
20165 void
vm_map_disallow_data_exec(vm_map_t map)20166 vm_map_disallow_data_exec(vm_map_t map)
20167 {
20168 	if (map == NULL) {
20169 		return;
20170 	}
20171 
20172 	map->map_disallow_data_exec = TRUE;
20173 }
20174 
20175 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
20176  * more descriptive.
20177  */
/*
 * Restrict the map's address space to the 32-bit maximum.
 * On arm64 the limit comes from the pmap layer; elsewhere it is
 * the architectural VM_MAX_ADDRESS.
 */
void
vm_map_set_32bit(vm_map_t map)
{
#if defined(__arm64__)
	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
20187 
20188 
/*
 * Extend the map's address space to the 64-bit maximum.
 * On arm64 the limit comes from the pmap layer; elsewhere it is
 * the architectural MACH_VM_MAX_ADDRESS.
 */
void
vm_map_set_64bit(vm_map_t map)
{
#if defined(__arm64__)
	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
#endif
}
20198 
20199 /*
20200  * Expand the maximum size of an existing map to the maximum supported.
20201  */
/*
 * Expand the maximum size of an existing map to the maximum supported.
 * Only meaningful on embedded arm64; a no-op elsewhere.
 */
void
vm_map_set_jumbo(vm_map_t map)
{
#if defined (__arm64__) && !XNU_TARGET_OS_OSX
	/* ~0 requests the largest offset the pmap supports */
	vm_map_set_max_addr(map, ~0);
#else /* arm64 */
	(void) map;
#endif
}
20211 
20212 /*
20213  * This map has a JIT entitlement
20214  */
/*
 * This map has a JIT entitlement: propagate that to the pmap
 * (arm64 only; a no-op elsewhere).
 */
void
vm_map_set_jit_entitled(vm_map_t map)
{
#if defined (__arm64__)
	pmap_set_jit_entitled(map->pmap);
#else /* arm64 */
	(void) map;
#endif
}
20224 
20225 /*
20226  * This map has TPRO enabled
20227  */
/*
 * This map has TPRO enabled: propagate that to the pmap
 * (arm64e only; a no-op elsewhere).
 */
void
vm_map_set_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	pmap_set_tpro(map->pmap);
#else /* arm64e */
	(void) map;
#endif
}
20237 
20238 /*
20239  * Expand the maximum size of an existing map.
20240  */
/*
 * Expand the maximum size of an existing map (arm64 only; a no-op
 * elsewhere).  The new limit is truncated to a page boundary and
 * clamped to the pmap's jumbo maximum; the address space can never
 * be shrunk through this routine.  Extends the map's trailing hole
 * (or creates one) so the new space is allocatable.
 */
void
vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
{
#if defined(__arm64__)
	vm_map_offset_t max_supported_offset;
	vm_map_offset_t old_max_offset;

	vm_map_lock(map);

	old_max_offset = map->max_offset;
	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);

	new_max_offset = trunc_page(new_max_offset);

	/* The address space cannot be shrunk using this routine. */
	if (old_max_offset >= new_max_offset) {
		vm_map_unlock(map);
		return;
	}

	if (max_supported_offset < new_max_offset) {
		/* clamp to what the pmap can actually support */
		new_max_offset = max_supported_offset;
	}

	map->max_offset = new_max_offset;

	if (map->holelistenabled) {
		if (map->holes_list->prev->vme_end == old_max_offset) {
			/*
			 * There is already a hole at the end of the map; simply make it bigger.
			 */
			map->holes_list->prev->vme_end = map->max_offset;
		} else {
			/*
			 * There is no hole at the end, so we need to create a new hole
			 * for the new empty space we're creating.
			 */
			struct vm_map_links *new_hole;

			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
			new_hole->start = old_max_offset;
			new_hole->end = map->max_offset;
			/* link the new hole at the tail of the circular hole list */
			new_hole->prev = map->holes_list->prev;
			new_hole->next = (struct vm_map_entry *)map->holes_list;
			map->holes_list->prev->links.next = (struct vm_map_entry *)new_hole;
			map->holes_list->prev = (struct vm_map_entry *)new_hole;
		}
	}

	vm_map_unlock(map);
#else
	(void)map;
	(void)new_max_offset;
#endif
}
20296 
/*
 * Compute the maximum address-space offset for a 32- or 64-bit
 * task.  On arm64 the value is provided by the pmap layer;
 * elsewhere it is the architectural constant.
 */
vm_map_offset_t
vm_compute_max_offset(boolean_t is64)
{
#if defined(__arm64__)
	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
20306 
/*
 * Report how the ASLR slide space is sectioned for this map:
 * number of independent sections and the size of each.  On arm64
 * the slide is spread over 3 translation-table-twig-sized sections;
 * elsewhere a single unsectioned region is used.
 */
void
vm_map_get_max_aslr_slide_section(
	vm_map_t                map __unused,
	int64_t                 *max_sections,
	int64_t                 *section_size)
{
#if defined(__arm64__)
	*max_sections = 3;
	*section_size = ARM_TT_TWIG_SIZE;
#else
	*max_sections = 1;
	*section_size = 0;
#endif
}
20321 
/*
 * Maximum number of map pages the main-executable ASLR slide may
 * span for this map.
 */
uint64_t
vm_map_get_max_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
	 * limited embedded address space; this is also meant to minimize pmap
	 * memory usage on 16KB page systems.
	 */
	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
#else
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
20335 
/*
 * Maximum number of map pages the dynamic-loader ASLR slide may
 * span for this map (smaller than the executable's slide range).
 */
uint64_t
vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
	 * of independent entropy on 16KB page systems.
	 */
	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
#else
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
20348 
/*
 * A map is considered 64-bit when its maximum offset exceeds the
 * 32-bit architectural limit.
 */
boolean_t
vm_map_is_64bit(
	vm_map_t map)
{
	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
}
20355 
/*
 * Check whether the map reserves at least "pagezero_size" bytes of
 * inaccessible address space at the bottom (a "hard" page zero),
 * i.e. whether its minimum offset is at or above that size.
 */
boolean_t
vm_map_has_hard_pagezero(
	vm_map_t        map,
	vm_map_offset_t pagezero_size)
{
	/*
	 * XXX FBDP
	 * We should lock the VM map (for read) here but we can get away
	 * with it for now because there can't really be any race condition:
	 * the VM map's min_offset is changed only when the VM map is created
	 * and when the zero page is established (when the binary gets loaded),
	 * and this routine gets called only when the task terminates and the
	 * VM map is being torn down, and when a new map is created via
	 * load_machfile()/execve().
	 */
	return map->min_offset >= pagezero_size;
}
20373 
20374 /*
20375  * Raise a VM map's maximun offset.
20376  */
20377 kern_return_t
vm_map_raise_max_offset(vm_map_t map,vm_map_offset_t new_max_offset)20378 vm_map_raise_max_offset(
20379 	vm_map_t        map,
20380 	vm_map_offset_t new_max_offset)
20381 {
20382 	kern_return_t   ret;
20383 
20384 	vm_map_lock(map);
20385 	ret = KERN_INVALID_ADDRESS;
20386 
20387 	if (new_max_offset >= map->max_offset) {
20388 		if (!vm_map_is_64bit(map)) {
20389 			if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
20390 				map->max_offset = new_max_offset;
20391 				ret = KERN_SUCCESS;
20392 			}
20393 		} else {
20394 			if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
20395 				map->max_offset = new_max_offset;
20396 				ret = KERN_SUCCESS;
20397 			}
20398 		}
20399 	}
20400 
20401 	vm_map_unlock(map);
20402 	return ret;
20403 }
20404 
20405 
20406 /*
20407  * Raise a VM map's minimum offset.
20408  * To strictly enforce "page zero" reservation.
20409  */
/*
 * Raise a VM map's minimum offset.
 * To strictly enforce "page zero" reservation.
 *
 * The new minimum is rounded up to a map-page boundary.  Fails with
 * KERN_INVALID_ADDRESS if it would lower the minimum or go past the
 * end of the address space, and with KERN_NO_SPACE if memory is
 * already allocated below the new minimum.
 */
kern_return_t
vm_map_raise_min_offset(
	vm_map_t        map,
	vm_map_offset_t new_min_offset)
{
	vm_map_entry_t  first_entry;

	new_min_offset = vm_map_round_page(new_min_offset,
	    VM_MAP_PAGE_MASK(map));

	vm_map_lock(map);

	if (new_min_offset < map->min_offset) {
		/*
		 * Can't move min_offset backwards, as that would expose
		 * a part of the address space that was previously, and for
		 * possibly good reasons, inaccessible.
		 */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}
	if (new_min_offset >= map->max_offset) {
		/* can't go beyond the end of the address space */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	first_entry = vm_map_first_entry(map);
	if (first_entry != vm_map_to_entry(map) &&
	    first_entry->vme_start < new_min_offset) {
		/*
		 * Some memory was already allocated below the new
		 * minimum offset.  It's too late to change it now...
		 */
		vm_map_unlock(map);
		return KERN_NO_SPACE;
	}

	map->min_offset = new_min_offset;

	if (map->holelistenabled) {
		/* shrink the leading hole to start at the new minimum */
		assert(map->holes_list);
		map->holes_list->start = new_min_offset;
		assert(new_min_offset < map->holes_list->end);
	}

	vm_map_unlock(map);

	return KERN_SUCCESS;
}
20460 
20461 /*
20462  * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
20463  * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
20464  * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
20465  * have to reach over to the BSD data structures.
20466  */
20467 
20468 uint64_t vm_map_set_size_limit_count = 0;
20469 kern_return_t
vm_map_set_size_limit(vm_map_t map,uint64_t new_size_limit)20470 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
20471 {
20472 	kern_return_t kr;
20473 
20474 	vm_map_lock(map);
20475 	if (new_size_limit < map->size) {
20476 		/* new limit should not be lower than its current size */
20477 		DTRACE_VM2(vm_map_set_size_limit_fail,
20478 		    vm_map_size_t, map->size,
20479 		    uint64_t, new_size_limit);
20480 		kr = KERN_FAILURE;
20481 	} else if (new_size_limit == map->size_limit) {
20482 		/* no change */
20483 		kr = KERN_SUCCESS;
20484 	} else {
20485 		/* set new limit */
20486 		DTRACE_VM2(vm_map_set_size_limit,
20487 		    vm_map_size_t, map->size,
20488 		    uint64_t, new_size_limit);
20489 		if (new_size_limit != RLIM_INFINITY) {
20490 			vm_map_set_size_limit_count++;
20491 		}
20492 		map->size_limit = new_size_limit;
20493 		kr = KERN_SUCCESS;
20494 	}
20495 	vm_map_unlock(map);
20496 	return kr;
20497 }
20498 
20499 uint64_t vm_map_set_data_limit_count = 0;
20500 kern_return_t
vm_map_set_data_limit(vm_map_t map,uint64_t new_data_limit)20501 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
20502 {
20503 	kern_return_t kr;
20504 
20505 	vm_map_lock(map);
20506 	if (new_data_limit < map->size) {
20507 		/* new limit should not be lower than its current size */
20508 		DTRACE_VM2(vm_map_set_data_limit_fail,
20509 		    vm_map_size_t, map->size,
20510 		    uint64_t, new_data_limit);
20511 		kr = KERN_FAILURE;
20512 	} else if (new_data_limit == map->data_limit) {
20513 		/* no change */
20514 		kr = KERN_SUCCESS;
20515 	} else {
20516 		/* set new limit */
20517 		DTRACE_VM2(vm_map_set_data_limit,
20518 		    vm_map_size_t, map->size,
20519 		    uint64_t, new_data_limit);
20520 		if (new_data_limit != RLIM_INFINITY) {
20521 			vm_map_set_data_limit_count++;
20522 		}
20523 		map->data_limit = new_data_limit;
20524 		kr = KERN_SUCCESS;
20525 	}
20526 	vm_map_unlock(map);
20527 	return kr;
20528 }
20529 
20530 void
vm_map_set_user_wire_limit(vm_map_t map,vm_size_t limit)20531 vm_map_set_user_wire_limit(vm_map_t     map,
20532     vm_size_t    limit)
20533 {
20534 	vm_map_lock(map);
20535 	map->user_wire_limit = limit;
20536 	vm_map_unlock(map);
20537 }
20538 
20539 
20540 void
vm_map_switch_protect(vm_map_t map,boolean_t val)20541 vm_map_switch_protect(vm_map_t     map,
20542     boolean_t    val)
20543 {
20544 	vm_map_lock(map);
20545 	map->switch_protect = val;
20546 	vm_map_unlock(map);
20547 }
20548 
20549 extern int cs_process_enforcement_enable;
20550 boolean_t
vm_map_cs_enforcement(vm_map_t map)20551 vm_map_cs_enforcement(
20552 	vm_map_t map)
20553 {
20554 	if (cs_process_enforcement_enable) {
20555 		return TRUE;
20556 	}
20557 	return map->cs_enforcement;
20558 }
20559 
20560 kern_return_t
vm_map_cs_wx_enable(vm_map_t map)20561 vm_map_cs_wx_enable(
20562 	vm_map_t map)
20563 {
20564 	return pmap_cs_allow_invalid(vm_map_pmap(map));
20565 }
20566 
20567 void
vm_map_cs_debugged_set(vm_map_t map,boolean_t val)20568 vm_map_cs_debugged_set(
20569 	vm_map_t map,
20570 	boolean_t val)
20571 {
20572 	vm_map_lock(map);
20573 	map->cs_debugged = val;
20574 	vm_map_unlock(map);
20575 }
20576 
20577 void
vm_map_cs_enforcement_set(vm_map_t map,boolean_t val)20578 vm_map_cs_enforcement_set(
20579 	vm_map_t map,
20580 	boolean_t val)
20581 {
20582 	vm_map_lock(map);
20583 	map->cs_enforcement = val;
20584 	pmap_set_vm_map_cs_enforced(map->pmap, val);
20585 	vm_map_unlock(map);
20586 }
20587 
20588 /*
20589  * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
20590  * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
20591  * bump both counters.
20592  */
20593 void
vm_map_iokit_mapped_region(vm_map_t map,vm_size_t bytes)20594 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
20595 {
20596 	pmap_t pmap = vm_map_pmap(map);
20597 
20598 	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
20599 	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
20600 }
20601 
20602 void
vm_map_iokit_unmapped_region(vm_map_t map,vm_size_t bytes)20603 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
20604 {
20605 	pmap_t pmap = vm_map_pmap(map);
20606 
20607 	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
20608 	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
20609 }
20610 
/* Add (generate) code signature for memory range */
#if CONFIG_DYNAMIC_CODE_SIGNING
/*
 * vm_map_sign:
 *
 * Mark every resident page in [start, end) of "map" as code-sign
 * validated, for dynamically generated code.  The range must be fully
 * covered by a single non-submap entry whose VM object already exists.
 *
 * Returns KERN_SUCCESS, KERN_INVALID_ARGUMENT/KERN_INVALID_ADDRESS for
 * an unsuitable range, or KERN_FAILURE if a page is absent, busy, or in
 * an unusual state.
 */
kern_return_t
vm_map_sign(vm_map_t map,
    vm_map_offset_t start,
    vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_page_t m;
	vm_object_t object;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlaying object.  Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->vme_start > start) || (entry->vme_end < end)) {
		/*
		 * Map entry doesn't cover the requested range. Not handling
		 * this situation currently.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		/*
		 * Object must already be present or we can't sign.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Take the object lock before dropping the map lock so the object
	 * can't go away underneath us.
	 * NOTE(review): "entry" is still dereferenced in the loop below
	 * (vme_start, VME_OFFSET) after the map lock is released -- this
	 * appears to rely on the entry not being clipped or freed
	 * concurrently; confirm.
	 */
	vm_object_lock(object);
	vm_map_unlock_read(map);

	while (start < end) {
		uint32_t refmod;

		m = vm_page_lookup(object,
		    start - entry->vme_start + VME_OFFSET(entry));
		if (m == VM_PAGE_NULL) {
			/* should we try to fault a page here? we can probably
			 * demand it exists and is locked for this request */
			vm_object_unlock(object);
			return KERN_FAILURE;
		}
		/* deal with special page status */
		if (m->vmp_busy ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
			vm_object_unlock(object);
			return KERN_FAILURE;
		}

		/* Page is OK... now "validate" it */
		/* This is the place where we'll call out to create a code
		 * directory, later */
		/* XXX TODO4K: deal with 4k subpages individually? */
		m->vmp_cs_validated = VMP_CS_ALL_TRUE;

		/* The page is now "clean" for codesigning purposes. That means
		 * we don't consider it as modified (wpmapped) anymore. But
		 * we'll disconnect the page so we note any future modification
		 * attempts. */
		m->vmp_wpmapped = FALSE;
		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

		/* Pull the dirty status from the pmap, since we cleared the
		 * wpmapped bit */
		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
			SET_PAGE_DIRTY(m, FALSE);
		}

		/* On to the next page */
		start += PAGE_SIZE;
	}
	vm_object_unlock(object);

	return KERN_SUCCESS;
}
#endif
20706 
/*
 * vm_map_partial_reap:
 *
 * Walk "map" and delete every entry backed by an internal VM object
 * that is referenced nowhere else (ref_count == 1), accumulating the
 * number of resident and compressed pages reclaimed into the two
 * out-parameters.
 *
 * NOTE(review): the out-parameters are accumulated into, not zeroed
 * here -- callers are expected to initialize them; confirm at call
 * sites.
 */
kern_return_t
vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
{
	vm_map_entry_t  entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  next_entry;
	kern_return_t   kr = KERN_SUCCESS;
	VM_MAP_ZAP_DECLARE(zap_list);

	vm_map_lock(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = next_entry) {
		/* capture the successor first: vm_map_delete() unlinks "entry" */
		next_entry = entry->vme_next;

		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    (VME_OBJECT(entry)->internal == TRUE) &&
		    (VME_OBJECT(entry)->ref_count == 1)) {
			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);

			/* deleted entries go to zap_list and are disposed of after unlock */
			(void)vm_map_delete(map, entry->vme_start,
			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
			    KMEM_GUARD_NONE, &zap_list);
		}
	}

	vm_map_unlock(map);

	vm_map_zap_dispose(&zap_list);

	return kr;
}
20741 
20742 
20743 #if DEVELOPMENT || DEBUG
20744 
/*
 * vm_map_disconnect_page_mappings:  (DEVELOPMENT || DEBUG only)
 *
 * Remove all pmap mappings for every entry in "map", optionally
 * un-nesting shared submaps first so that only this task's own pmap is
 * affected.  Returns the task's resident footprint, in map pages,
 * sampled from the phys_mem ledger before the mappings are torn down.
 */
int
vm_map_disconnect_page_mappings(
	vm_map_t map,
	boolean_t do_unnest)
{
	vm_map_entry_t entry;
	ledger_amount_t byte_count = 0;

	if (do_unnest == TRUE) {
#ifndef NO_NESTED_PMAP
		vm_map_lock(map);

		for (entry = vm_map_first_entry(map);
		    entry != vm_map_to_entry(map);
		    entry = entry->vme_next) {
			if (entry->is_sub_map && entry->use_pmap) {
				/*
				 * Make sure the range between the start of this entry and
				 * the end of this entry is no longer nested, so that
				 * we will only remove mappings from the pmap in use by
				 * this task
				 */
				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
			}
		}
		vm_map_unlock(map);
#endif
	}
	vm_map_lock_read(map);

	/* snapshot resident memory before disconnecting anything */
	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		/* skip object-less entries and physically-contiguous objects */
		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
		    (VME_OBJECT(entry)->phys_contiguous))) {
			continue;
		}
		if (entry->is_sub_map) {
			/* any pmap-nested submaps should have been unnested above */
			assert(!entry->use_pmap);
		}

		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
	}
	vm_map_unlock_read(map);

	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
}
20794 
/*
 * vm_map_inject_error:  (DEVELOPMENT || DEBUG only)
 *
 * Inject a decompression error into the compressor pager backing the
 * page at "vaddr", for fault-path error testing.
 *
 * Returns KERN_MEMORY_ERROR if no object backs the address,
 * KERN_MEMORY_PRESENT if the object has no pager (nothing compressed),
 * or the result of vm_compressor_pager_inject_error().
 */
kern_return_t
vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
{
	vm_object_t object = NULL;
	vm_object_offset_t offset;
	vm_prot_t prot;
	boolean_t wired;
	vm_map_version_t version;
	vm_map_t real_map;
	int result = KERN_FAILURE;

	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
	vm_map_lock(map);

	/*
	 * On success "object" is returned locked (exclusive).
	 * NOTE(review): "real_map" is assumed to be set by the lookup even
	 * on failure -- confirm vm_map_lookup_and_lock_object()'s contract.
	 */
	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
	    NULL, &real_map, NULL);
	if (object == NULL) {
		result = KERN_MEMORY_ERROR;
	} else if (object->pager) {
		result = vm_compressor_pager_inject_error(object->pager,
		    offset);
	} else {
		result = KERN_MEMORY_PRESENT;
	}

	if (object != NULL) {
		vm_object_unlock(object);
	}

	if (real_map != map) {
		vm_map_unlock(real_map);
	}
	vm_map_unlock(map);

	return result;
}
20832 
20833 #endif
20834 
20835 
20836 #if CONFIG_FREEZE
20837 
20838 
extern struct freezer_context freezer_context_global;
/* uptime of the freezer's last yield point (used by the in-memory path) */
AbsoluteTime c_freezer_last_yield_ts = 0;

/* freeze policy knobs maintained by memorystatus (BSD side) */
extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
extern unsigned int memorystatus_freeze_shared_mb_per_process_max;

/*
 * vm_map_freeze:
 *
 * Compress ("freeze") the eligible anonymous memory of "task" into the
 * compressor, up to "dirty_budget" pages.
 *
 * When the freezer is backed by swap, a first "evaluation" pass sizes
 * the task's private vs. shared dirty pages and rejects the freeze if
 * it would drag in too much shared memory; the compression pass then
 * runs as a second iteration of the same loop.  With an in-memory
 * compressor there is no evaluation pass and "eval_only" is rejected.
 *
 * Out-parameters are page counts, except *shared_count (megabytes).
 * NOTE(review): *purgeable_count, *clean_count and *dirty_count are
 * zeroed but never updated in this function -- presumably kept for
 * interface compatibility; confirm with callers.
 */
kern_return_t
vm_map_freeze(
	task_t       task,
	unsigned int *purgeable_count,
	unsigned int *wired_count,
	unsigned int *clean_count,
	unsigned int *dirty_count,
	unsigned int dirty_budget,
	unsigned int *shared_count,
	int          *freezer_error_code,
	boolean_t    eval_only)
{
	vm_map_entry_t  entry2 = VM_MAP_ENTRY_NULL;
	kern_return_t   kr = KERN_SUCCESS;
	boolean_t       evaluation_phase = TRUE;
	vm_object_t     cur_shared_object = NULL;
	int             cur_shared_obj_ref_cnt = 0;
	unsigned int    dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;

	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;

	/*
	 * We need the exclusive lock here so that we can
	 * block any page faults or lookups while we are
	 * in the middle of freezing this vm map.
	 */
	vm_map_t map = task->map;

	vm_map_lock(map);

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
		if (vm_compressor_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
		}

		if (vm_swap_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
		}

		kr = KERN_NO_SPACE;
		goto done;
	}

	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
		/*
		 * In-memory compressor backing the freezer. No disk.
		 * So no need to do the evaluation phase.
		 */
		evaluation_phase = FALSE;

		if (eval_only == TRUE) {
			/*
			 * We don't support 'eval_only' mode
			 * in this non-swap config.
			 */
			*freezer_error_code = FREEZER_ERROR_GENERIC;
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);
	}
again:

	for (entry2 = vm_map_first_entry(map);
	    entry2 != vm_map_to_entry(map);
	    entry2 = entry2->vme_next) {
		vm_object_t src_object;

		if (entry2->is_sub_map) {
			continue;
		}

		/* only anonymous (internal), non-contiguous objects are freezable */
		src_object = VME_OBJECT(entry2);
		if (!src_object ||
		    src_object->phys_contiguous ||
		    !src_object->internal) {
			continue;
		}

		/* If eligible, scan the entry, moving eligible pages over to our parent object */

		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
			/*
			 * We skip purgeable objects during evaluation phase only.
			 * If we decide to freeze this process, we'll explicitly
			 * purge these objects before we go around again with
			 * 'evaluation_phase' set to FALSE.
			 */

			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
				/*
				 * We want to purge objects that may not belong to this task but are mapped
				 * in this task alone. Since we already purged this task's purgeable memory
				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
				 * on this task's purgeable objects. Hence the check for only volatile objects.
				 */
				if (evaluation_phase == FALSE &&
				    (src_object->purgable == VM_PURGABLE_VOLATILE) &&
				    (src_object->ref_count == 1)) {
					vm_object_lock(src_object);
					vm_object_purge(src_object, 0);
					vm_object_unlock(src_object);
				}
				continue;
			}

			/*
			 * Pages belonging to this object could be swapped to disk.
			 * Make sure it's not a shared object because we could end
			 * up just bringing it back in again.
			 *
			 * We try to optimize somewhat by checking for objects that are mapped
			 * more than once within our own map. But we don't do full searches,
			 * we just look at the entries following our current entry.
			 */

			if (src_object->ref_count > 1) {
				if (src_object != cur_shared_object) {
					/* first sighting: count all its dirty pages as shared */
					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
					dirty_shared_count += obj_pages_snapshot;

					cur_shared_object = src_object;
					cur_shared_obj_ref_cnt = 1;
					continue;
				} else {
					cur_shared_obj_ref_cnt++;
					if (src_object->ref_count == cur_shared_obj_ref_cnt) {
						/*
						 * Fall through to below and treat this object as private.
						 * So deduct its pages from our shared total and add it to the
						 * private total.
						 */

						dirty_shared_count -= obj_pages_snapshot;
						dirty_private_count += obj_pages_snapshot;
					} else {
						continue;
					}
				}
			}


			if (src_object->ref_count == 1) {
				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
			}

			if (evaluation_phase == TRUE) {
				/* evaluation pass only counts pages; no compression yet */
				continue;
			}
		}

		/* compression pass: push this object's dirty pages into the compressor */
		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
		*wired_count += src_object->wired_page_count;

		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
			if (vm_compressor_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
			}

			if (vm_swap_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
			}

			kr = KERN_NO_SPACE;
			break;
		}
		if (paged_out_count >= dirty_budget) {
			break;
		}
		dirty_budget -= paged_out_count;
	}

	/* report the shared total in MB, not pages */
	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
	if (evaluation_phase) {
		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;

		if (dirty_shared_count > shared_pages_threshold) {
			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
			kr = KERN_FAILURE;
			goto done;
		}

		/* reject if the private:shared ratio is below the policy minimum */
		if (dirty_shared_count &&
		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
			kr = KERN_FAILURE;
			goto done;
		}

		evaluation_phase = FALSE;
		dirty_shared_count = dirty_private_count = 0;

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);

		if (eval_only) {
			kr = KERN_SUCCESS;
			goto done;
		}

		/* purge this task's own purgeable memory before the real pass */
		vm_purgeable_purge_task_owned(task);

		goto again;
	} else {
		kr = KERN_SUCCESS;
	}

done:
	vm_map_unlock(map);

	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
		vm_object_compressed_freezer_done();
	}
	return kr;
}
21064 
21065 #endif
21066 
21067 /*
21068  * vm_map_entry_should_cow_for_true_share:
21069  *
21070  * Determines if the map entry should be clipped and setup for copy-on-write
21071  * to avoid applying "true_share" to a large VM object when only a subset is
21072  * targeted.
21073  *
21074  * For now, we target only the map entries created for the Objective C
21075  * Garbage Collector, which initially have the following properties:
21076  *	- alias == VM_MEMORY_MALLOC
21077  *      - wired_count == 0
21078  *      - !needs_copy
21079  * and a VM object with:
21080  *      - internal
21081  *      - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
21082  *      - !true_share
21083  *      - vo_size == ANON_CHUNK_SIZE
21084  *
21085  * Only non-kernel map entries.
21086  */
21087 boolean_t
vm_map_entry_should_cow_for_true_share(vm_map_entry_t entry)21088 vm_map_entry_should_cow_for_true_share(
21089 	vm_map_entry_t  entry)
21090 {
21091 	vm_object_t     object;
21092 
21093 	if (entry->is_sub_map) {
21094 		/* entry does not point at a VM object */
21095 		return FALSE;
21096 	}
21097 
21098 	if (entry->needs_copy) {
21099 		/* already set for copy_on_write: done! */
21100 		return FALSE;
21101 	}
21102 
21103 	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
21104 	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
21105 		/* not a malloc heap or Obj-C Garbage Collector heap */
21106 		return FALSE;
21107 	}
21108 
21109 	if (entry->wired_count) {
21110 		/* wired: can't change the map entry... */
21111 		vm_counters.should_cow_but_wired++;
21112 		return FALSE;
21113 	}
21114 
21115 	object = VME_OBJECT(entry);
21116 
21117 	if (object == VM_OBJECT_NULL) {
21118 		/* no object yet... */
21119 		return FALSE;
21120 	}
21121 
21122 	if (!object->internal) {
21123 		/* not an internal object */
21124 		return FALSE;
21125 	}
21126 
21127 	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
21128 		/* not the default copy strategy */
21129 		return FALSE;
21130 	}
21131 
21132 	if (object->true_share) {
21133 		/* already true_share: too late to avoid it */
21134 		return FALSE;
21135 	}
21136 
21137 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
21138 	    object->vo_size != ANON_CHUNK_SIZE) {
21139 		/* ... not an object created for the ObjC Garbage Collector */
21140 		return FALSE;
21141 	}
21142 
21143 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
21144 	    object->vo_size != 2048 * 4096) {
21145 		/* ... not a "MALLOC_SMALL" heap */
21146 		return FALSE;
21147 	}
21148 
21149 	/*
21150 	 * All the criteria match: we have a large object being targeted for "true_share".
21151 	 * To limit the adverse side-effects linked with "true_share", tell the caller to
21152 	 * try and avoid setting up the entire object for "true_share" by clipping the
21153 	 * targeted range and setting it up for copy-on-write.
21154 	 */
21155 	return TRUE;
21156 }
21157 
21158 vm_map_offset_t
vm_map_round_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)21159 vm_map_round_page_mask(
21160 	vm_map_offset_t offset,
21161 	vm_map_offset_t mask)
21162 {
21163 	return VM_MAP_ROUND_PAGE(offset, mask);
21164 }
21165 
21166 vm_map_offset_t
vm_map_trunc_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)21167 vm_map_trunc_page_mask(
21168 	vm_map_offset_t offset,
21169 	vm_map_offset_t mask)
21170 {
21171 	return VM_MAP_TRUNC_PAGE(offset, mask);
21172 }
21173 
21174 boolean_t
vm_map_page_aligned(vm_map_offset_t offset,vm_map_offset_t mask)21175 vm_map_page_aligned(
21176 	vm_map_offset_t offset,
21177 	vm_map_offset_t mask)
21178 {
21179 	return ((offset) & mask) == 0;
21180 }
21181 
21182 int
vm_map_page_shift(vm_map_t map)21183 vm_map_page_shift(
21184 	vm_map_t map)
21185 {
21186 	return VM_MAP_PAGE_SHIFT(map);
21187 }
21188 
21189 int
vm_map_page_size(vm_map_t map)21190 vm_map_page_size(
21191 	vm_map_t map)
21192 {
21193 	return VM_MAP_PAGE_SIZE(map);
21194 }
21195 
21196 vm_map_offset_t
vm_map_page_mask(vm_map_t map)21197 vm_map_page_mask(
21198 	vm_map_t map)
21199 {
21200 	return VM_MAP_PAGE_MASK(map);
21201 }
21202 
21203 kern_return_t
vm_map_set_page_shift(vm_map_t map,int pageshift)21204 vm_map_set_page_shift(
21205 	vm_map_t        map,
21206 	int             pageshift)
21207 {
21208 	if (map->hdr.nentries != 0) {
21209 		/* too late to change page size */
21210 		return KERN_FAILURE;
21211 	}
21212 
21213 	map->hdr.page_shift = (uint16_t)pageshift;
21214 
21215 	return KERN_SUCCESS;
21216 }
21217 
/*
 * vm_map_query_volatile:
 *
 * Sum the virtual, resident, compressed, and pmap-accounted sizes of the
 * writable volatile/empty purgeable memory mapped in "map".
 *
 * The caller must hold the map lock; it is still held on return.
 * Always returns KERN_SUCCESS.
 */
kern_return_t
vm_map_query_volatile(
	vm_map_t        map,
	mach_vm_size_t  *volatile_virtual_size_p,
	mach_vm_size_t  *volatile_resident_size_p,
	mach_vm_size_t  *volatile_compressed_size_p,
	mach_vm_size_t  *volatile_pmap_size_p,
	mach_vm_size_t  *volatile_compressed_pmap_size_p)
{
	mach_vm_size_t  volatile_virtual_size;
	mach_vm_size_t  volatile_resident_count;
	mach_vm_size_t  volatile_compressed_count;
	mach_vm_size_t  volatile_pmap_count;
	mach_vm_size_t  volatile_compressed_pmap_count;
	mach_vm_size_t  resident_count;
	vm_map_entry_t  entry;
	vm_object_t     object;

	/* map should be locked by caller */

	volatile_virtual_size = 0;
	volatile_resident_count = 0;
	volatile_compressed_count = 0;
	volatile_pmap_count = 0;
	volatile_compressed_pmap_count = 0;

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;

		if (entry->is_sub_map) {
			continue;
		}
		if (!(entry->protection & VM_PROT_WRITE)) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}
		if (object->purgable != VM_PURGABLE_VOLATILE &&
		    object->purgable != VM_PURGABLE_EMPTY) {
			continue;
		}
		if (VME_OFFSET(entry)) {
			/*
			 * If the map entry has been split and the object now
			 * appears several times in the VM map, we don't want
			 * to count the object's resident_page_count more than
			 * once.  We count it only for the first one, starting
			 * at offset 0 and ignore the other VM map entries.
			 */
			continue;
		}
		/*
		 * NOTE(review): VME_OFFSET(entry) is always 0 past the check
		 * above, so the offset adjustment below is currently a no-op;
		 * it is presumably kept in case the offset filter is relaxed.
		 */
		resident_count = object->resident_page_count;
		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
			resident_count = 0;
		} else {
			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
		}

		volatile_virtual_size += entry->vme_end - entry->vme_start;
		volatile_resident_count += resident_count;
		if (object->pager) {
			volatile_compressed_count +=
			    vm_compressor_pager_get_count(object->pager);
		}
		pmap_compressed_bytes = 0;
		pmap_resident_bytes =
		    pmap_query_resident(map->pmap,
		    entry->vme_start,
		    entry->vme_end,
		    &pmap_compressed_bytes);
		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
		volatile_compressed_pmap_count += (pmap_compressed_bytes
		    / PAGE_SIZE);
	}

	/* map is still locked on return */

	*volatile_virtual_size_p = volatile_virtual_size;
	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;

	return KERN_SUCCESS;
}
21307 
21308 void
vm_map_sizes(vm_map_t map,vm_map_size_t * psize,vm_map_size_t * pfree,vm_map_size_t * plargest_free)21309 vm_map_sizes(vm_map_t map,
21310     vm_map_size_t * psize,
21311     vm_map_size_t * pfree,
21312     vm_map_size_t * plargest_free)
21313 {
21314 	vm_map_entry_t  entry;
21315 	vm_map_offset_t prev;
21316 	vm_map_size_t   free, total_free, largest_free;
21317 	boolean_t       end;
21318 
21319 	if (!map) {
21320 		*psize = *pfree = *plargest_free = 0;
21321 		return;
21322 	}
21323 	total_free = largest_free = 0;
21324 
21325 	vm_map_lock_read(map);
21326 	if (psize) {
21327 		*psize = map->max_offset - map->min_offset;
21328 	}
21329 
21330 	prev = map->min_offset;
21331 	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
21332 		end = (entry == vm_map_to_entry(map));
21333 
21334 		if (end) {
21335 			free = entry->vme_end   - prev;
21336 		} else {
21337 			free = entry->vme_start - prev;
21338 		}
21339 
21340 		total_free += free;
21341 		if (free > largest_free) {
21342 			largest_free = free;
21343 		}
21344 
21345 		if (end) {
21346 			break;
21347 		}
21348 		prev = entry->vme_end;
21349 	}
21350 	vm_map_unlock_read(map);
21351 	if (pfree) {
21352 		*pfree = total_free;
21353 	}
21354 	if (plargest_free) {
21355 		*plargest_free = largest_free;
21356 	}
21357 }
21358 
21359 #if VM_SCAN_FOR_SHADOW_CHAIN
int vm_map_shadow_max(vm_map_t map);
/*
 * vm_map_shadow_max:
 *
 * Return the length of the longest object shadow chain among all
 * entries of "map" (0 for a NULL map).  Inspection/debug helper,
 * only built when VM_SCAN_FOR_SHADOW_CHAIN is set.
 */
int
vm_map_shadow_max(
	vm_map_t map)
{
	int             shadows, shadows_max;
	vm_map_entry_t  entry;
	vm_object_t     object, next_object;

	if (map == NULL) {
		return 0;
	}

	shadows_max = 0;

	vm_map_lock_read(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == NULL) {
			continue;
		}
		/*
		 * Walk the shadow chain with hand-over-hand shared locks:
		 * lock the next object before releasing the current one so
		 * the chain can't be torn down underneath us.
		 */
		vm_object_lock_shared(object);
		for (shadows = 0;
		    object->shadow != NULL;
		    shadows++, object = next_object) {
			next_object = object->shadow;
			vm_object_lock_shared(next_object);
			vm_object_unlock(object);
		}
		vm_object_unlock(object);
		if (shadows > shadows_max) {
			shadows_max = shadows;
		}
	}

	vm_map_unlock_read(map);

	return shadows_max;
}
21405 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
21406 
21407 void
vm_commit_pagezero_status(vm_map_t lmap)21408 vm_commit_pagezero_status(vm_map_t lmap)
21409 {
21410 	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
21411 }
21412 
21413 #if XNU_TARGET_OS_OSX
/*
 * vm_map_set_high_start:  (macOS only)
 *
 * Record the lowest address at which "high" VM allocations for this map
 * should start.
 * NOTE(review): written without the map lock -- presumably only called
 * during map setup before the map is shared; confirm with callers.
 */
void
vm_map_set_high_start(
	vm_map_t        map,
	vm_map_offset_t high_start)
{
	map->vmmap_high_start = high_start;
}
21421 #endif /* XNU_TARGET_OS_OSX */
21422 
21423 
21424 /*
21425  * FORKED CORPSE FOOTPRINT
21426  *
21427  * A forked corpse gets a copy of the original VM map but its pmap is mostly
21428  * empty since it never ran and never got to fault in any pages.
21429  * Collecting footprint info (via "sysctl vm.self_region_footprint") for
21430  * a forked corpse would therefore return very little information.
21431  *
21432  * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
21433  * to vm_map_fork() to collect footprint information from the original VM map
21434  * and its pmap, and store it in the forked corpse's VM map.  That information
21435  * is stored in place of the VM map's "hole list" since we'll never need to
21436  * lookup for holes in the corpse's map.
21437  *
21438  * The corpse's footprint info looks like this:
21439  *
21440  * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
21441  * as follows:
21442  *                     +---------------------------------------+
21443  *            header-> | cf_size                               |
21444  *                     +-------------------+-------------------+
21445  *                     | cf_last_region    | cf_last_zeroes    |
21446  *                     +-------------------+-------------------+
21447  *           region1-> | cfr_vaddr                             |
21448  *                     +-------------------+-------------------+
21449  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
21450  *                     +---------------------------------------+
21451  *                     | d4 | d5 | ...                         |
21452  *                     +---------------------------------------+
21453  *                     | ...                                   |
21454  *                     +-------------------+-------------------+
21455  *                     | dy | dz | na | na | cfr_vaddr...      | <-region2
21456  *                     +-------------------+-------------------+
21457  *                     | cfr_vaddr (ctd)   | cfr_num_pages     |
21458  *                     +---------------------------------------+
21459  *                     | d0 | d1 ...                           |
21460  *                     +---------------------------------------+
21461  *                       ...
21462  *                     +---------------------------------------+
21463  *       last region-> | cfr_vaddr                             |
21464  *                     +---------------------------------------+
21465  *                     + cfr_num_pages     | d0 | d1 | d2 | d3 |
21466  *                     +---------------------------------------+
21467  *                       ...
21468  *                     +---------------------------------------+
21469  *                     | dx | dy | dz | na | na | na | na | na |
21470  *                     +---------------------------------------+
21471  *
21472  * where:
21473  *      cf_size:	total size of the buffer (rounded to page size)
21474  *      cf_last_region:	offset in the buffer of the last "region" sub-header
21475  *	cf_last_zeroes: number of trailing "zero" dispositions at the end
21476  *			of last region
21477  *	cfr_vaddr:	virtual address of the start of the covered "region"
21478  *	cfr_num_pages:	number of pages in the covered "region"
21479  *	d*:		disposition of the page at that virtual address
21480  * Regions in the buffer are word-aligned.
21481  *
21482  * We estimate the size of the buffer based on the number of memory regions
21483  * and the virtual size of the address space.  While copying each memory region
21484  * during vm_map_fork(), we also collect the footprint info for that region
21485  * and store it in the buffer, packing it as much as possible (coalescing
21486  * contiguous memory regions to avoid having too many region headers and
21487  * avoiding long streaks of "zero" page dispositions by splitting footprint
21488  * "regions", so the number of regions in the footprint buffer might not match
21489  * the number of memory regions in the address space.
21490  *
21491  * We also have to copy the original task's "nonvolatile" ledgers since that's
21492  * part of the footprint and will need to be reported to any tool asking for
21493  * the footprint information of the forked corpse.
21494  */
21495 
/* global statistics on corpse footprint buffers (debug/tuning aid) */
uint64_t vm_map_corpse_footprint_count = 0;    /* footprints collected so far */
uint64_t vm_map_corpse_footprint_size_avg = 0; /* running average of actual footprint size */
uint64_t vm_map_corpse_footprint_size_max = 0; /* largest actual footprint size seen */
uint64_t vm_map_corpse_footprint_full = 0;     /* times a footprint buffer ran out of space */
uint64_t vm_map_corpse_footprint_no_buf = 0;   /* times the buffer allocation itself failed */
21501 
/*
 * Header at the start of a corpse footprint buffer (see layout above).
 * The "cfu" union is used for one purpose at a time: "cfu_last_zeroes"
 * while the footprint is being collected, "cfu_hint_region" once the
 * footprint is being queried.
 */
struct vm_map_corpse_footprint_header {
	vm_size_t       cf_size;        /* allocated buffer size */
	uint32_t        cf_last_region; /* offset of last region in buffer */
	union {
		uint32_t cfu_last_zeroes; /* during creation:
		                           * number of "zero" dispositions at
		                           * end of last region */
		uint32_t cfu_hint_region; /* during lookup:
		                           * offset of last looked up region */
#define cf_last_zeroes cfu.cfu_last_zeroes
#define cf_hint_region cfu.cfu_hint_region
	} cfu;
};
/* compact one-byte page disposition (see vm_page_disposition_to_cf_disp()) */
typedef uint8_t cf_disp_t;
/*
 * Per-region sub-header in the corpse footprint buffer, immediately
 * followed by one cf_disp_t per page in the region.  Packed for dense
 * storage; region starts themselves are kept word-aligned in the buffer.
 */
struct vm_map_corpse_footprint_region {
	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
	cf_disp_t   cfr_disposition[0]; /* disposition of each page */
} __attribute__((packed));
21521 
21522 static cf_disp_t
vm_page_disposition_to_cf_disp(int disposition)21523 vm_page_disposition_to_cf_disp(
21524 	int disposition)
21525 {
21526 	assert(sizeof(cf_disp_t) == 1);
21527 	/* relocate bits that don't fit in a "uint8_t" */
21528 	if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
21529 		disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
21530 	}
21531 	/* cast gets rid of extra bits */
21532 	return (cf_disp_t) disposition;
21533 }
21534 
21535 static int
vm_page_cf_disp_to_disposition(cf_disp_t cf_disp)21536 vm_page_cf_disp_to_disposition(
21537 	cf_disp_t cf_disp)
21538 {
21539 	int disposition;
21540 
21541 	assert(sizeof(cf_disp_t) == 1);
21542 	disposition = (int) cf_disp;
21543 	/* move relocated bits back in place */
21544 	if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
21545 		disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
21546 		disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
21547 	}
21548 	return disposition;
21549 }
21550 
/*
 * vm_map_corpse_footprint_new_region:
 *      closes the current footprint "region" and creates a new one
 *
 * Returns NULL if there's not enough space in the buffer for a new region.
 * If the current region becomes empty after its trailing zeroes are
 * trimmed, it is reused instead of appending a new one.
 */
static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(
	struct vm_map_corpse_footprint_header *footprint_header)
{
	uintptr_t       footprint_edge;  /* first byte past the end of the buffer */
	uint32_t        new_region_offset;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;

	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);
	/* locate the current last region via its offset in the header */
	footprint_region = ((struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region));
	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
	    footprint_edge);

	/* get rid of trailing zeroes in the last region */
	assert(footprint_region->cfr_num_pages >=
	    footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -=
	    footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* reuse this region if it's now empty */
	if (footprint_region->cfr_num_pages == 0) {
		return footprint_region;
	}

	/* compute offset of new region */
	new_region_offset = footprint_header->cf_last_region;
	new_region_offset += sizeof(*footprint_region);
	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
	/* regions must start word-aligned in the buffer */
	new_region_offset = roundup(new_region_offset, sizeof(int));

	/* check if we're going over the edge */
	if (((uintptr_t)footprint_header +
	    new_region_offset +
	    sizeof(*footprint_region)) >=
	    footprint_edge) {
		/* over the edge: no new region */
		return NULL;
	}

	/* adjust offset of last region in header */
	footprint_header->cf_last_region = new_region_offset;

	new_footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);
	new_footprint_region->cfr_vaddr = 0;
	new_footprint_region->cfr_num_pages = 0;
	/* caller needs to initialize new region */

	return new_footprint_region;
}
21613 
/*
 * vm_map_corpse_footprint_collect:
 *	collect footprint information for "old_entry" in "old_map" and
 *	stores it in "new_map"'s vmmap_footprint_info.
 *
 * On first call for "new_map", allocates the footprint buffer (sized
 * from the old map's entry count and virtual size, capped at a sane
 * maximum).  Both maps must be locked exclusively by the caller.
 * Returns KERN_RESOURCE_SHORTAGE if the buffer fills up.
 */
kern_return_t
vm_map_corpse_footprint_collect(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_map_offset_t va;
	kern_return_t   kr;
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;
	cf_disp_t       *next_disp_p;
	uintptr_t       footprint_edge;
	uint32_t        num_pages_tmp;
	int             effective_page_size;

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));

	va = old_entry->vme_start;

	vm_map_lock_assert_exclusive(old_map);
	vm_map_lock_assert_exclusive(new_map);

	assert(new_map->has_corpse_footprint);
	assert(!old_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    old_map->has_corpse_footprint) {
		/*
		 * This can only transfer footprint info from a
		 * map with a live pmap to a map with a corpse footprint.
		 */
		return KERN_NOT_SUPPORTED;
	}

	if (new_map->vmmap_corpse_footprint == NULL) {
		/* first call: allocate and initialize the footprint buffer */
		vm_offset_t     buf;
		vm_size_t       buf_size;

		buf = 0;
		/* estimate: header + per-entry region overhead + per-page disposition */
		buf_size = (sizeof(*footprint_header) +
		    (old_map->hdr.nentries
		    *
		    (sizeof(*footprint_region) +
		    +3))            /* potential alignment for each region */
		    +
		    ((old_map->size / effective_page_size)
		    *
		    sizeof(cf_disp_t)));      /* disposition for each page */
//		printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
		buf_size = round_page(buf_size);

		/* limit buffer to 1 page to validate overflow detection */
//		buf_size = PAGE_SIZE;

		/* limit size to a somewhat sane amount */
#if XNU_TARGET_OS_OSX
#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
#else /* XNU_TARGET_OS_OSX */
#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
#endif /* XNU_TARGET_OS_OSX */
		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
		}

		/*
		 * Allocate the pageable buffer (with a trailing guard page).
		 * It will be zero-filled on demand.
		 */
		kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
		    KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
		    VM_KERN_MEMORY_DIAG);
		if (kr != KERN_SUCCESS) {
			vm_map_corpse_footprint_no_buf++;
			return kr;
		}

		/* initialize header and 1st region */
		footprint_header = (struct vm_map_corpse_footprint_header *)buf;
		new_map->vmmap_corpse_footprint = footprint_header;

		footprint_header->cf_size = buf_size;
		footprint_header->cf_last_region =
		    sizeof(*footprint_header);
		footprint_header->cf_last_zeroes = 0;

		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header +
		    footprint_header->cf_last_region);
		footprint_region->cfr_vaddr = 0;
		footprint_region->cfr_num_pages = 0;
	} else {
		/* retrieve header and last region */
		footprint_header = (struct vm_map_corpse_footprint_header *)
		    new_map->vmmap_corpse_footprint;
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header +
		    footprint_header->cf_last_region);
	}
	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);

	/* does this entry extend the last region contiguously? */
	if ((footprint_region->cfr_vaddr +
	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
	    effective_page_size))
	    != old_entry->vme_start) {
		uint64_t num_pages_delta, num_pages_delta_size;
		uint32_t region_offset_delta_size;

		/*
		 * Not the next contiguous virtual address:
		 * start a new region or store "zero" dispositions for
		 * the missing pages?
		 */
		/* size of gap in actual page dispositions */
		num_pages_delta = ((old_entry->vme_start -
		    footprint_region->cfr_vaddr) / effective_page_size)
		    - footprint_region->cfr_num_pages;
		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
		/* size of gap as a new footprint region header */
		region_offset_delta_size =
		    (sizeof(*footprint_region) +
		    roundup(((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
		    sizeof(int)) -
		    ((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
//		printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
		if (region_offset_delta_size < num_pages_delta_size ||
		    os_add3_overflow(footprint_region->cfr_num_pages,
		    (uint32_t) num_pages_delta,
		    1,
		    &num_pages_tmp)) {
			/*
			 * Storing data for this gap would take more space
			 * than inserting a new footprint region header:
			 * let's start a new region and save space. If it's a
			 * tie, let's avoid using a new region, since that
			 * would require more region hops to find the right
			 * range during lookups.
			 *
			 * If the current region's cfr_num_pages would overflow
			 * if we added "zero" page dispositions for the gap,
			 * no choice but to start a new region.
			 */
//			printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
			new_footprint_region =
			    vm_map_corpse_footprint_new_region(footprint_header);
			/* check that we're not going over the edge */
			if (new_footprint_region == NULL) {
				goto over_the_edge;
			}
			footprint_region = new_footprint_region;
			/* initialize new region as empty */
			footprint_region->cfr_vaddr = old_entry->vme_start;
			footprint_region->cfr_num_pages = 0;
		} else {
			/*
			 * Store "zero" page dispositions for the missing
			 * pages.
			 */
//			printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
			for (; num_pages_delta > 0; num_pages_delta--) {
				next_disp_p = (cf_disp_t *)
				    ((uintptr_t) footprint_region +
				    sizeof(*footprint_region));
				next_disp_p += footprint_region->cfr_num_pages;
				/* check that we're not going over the edge */
				if ((uintptr_t)next_disp_p >= footprint_edge) {
					goto over_the_edge;
				}
				/* store "zero" disposition for this gap page */
				footprint_region->cfr_num_pages++;
				*next_disp_p = (cf_disp_t) 0;
				footprint_header->cf_last_zeroes++;
			}
		}
	}

	/* record one disposition byte per page of the entry */
	for (va = old_entry->vme_start;
	    va < old_entry->vme_end;
	    va += effective_page_size) {
		int             disposition;
		cf_disp_t       cf_disp;

		vm_map_footprint_query_page_info(old_map,
		    old_entry,
		    va,
		    &disposition);
		cf_disp = vm_page_disposition_to_cf_disp(disposition);

//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);

		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
			/*
			 * Ignore "zero" dispositions at start of
			 * region: just move start of region.
			 */
			footprint_region->cfr_vaddr += effective_page_size;
			continue;
		}

		/* would region's cfr_num_pages overflow? */
		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
		    &num_pages_tmp)) {
			/* overflow: create a new region */
			new_footprint_region =
			    vm_map_corpse_footprint_new_region(
				footprint_header);
			if (new_footprint_region == NULL) {
				goto over_the_edge;
			}
			footprint_region = new_footprint_region;
			footprint_region->cfr_vaddr = va;
			footprint_region->cfr_num_pages = 0;
		}

		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
		    sizeof(*footprint_region));
		next_disp_p += footprint_region->cfr_num_pages;
		/* check that we're not going over the edge */
		if ((uintptr_t)next_disp_p >= footprint_edge) {
			goto over_the_edge;
		}
		/* store this disposition */
		*next_disp_p = cf_disp;
		footprint_region->cfr_num_pages++;

		if (cf_disp != 0) {
			/* non-zero disp: break the current zero streak */
			footprint_header->cf_last_zeroes = 0;
			/* done */
			continue;
		}

		/* zero disp: add to the current streak of zeroes */
		footprint_header->cf_last_zeroes++;
		/*
		 * Heuristic: only split off a new region once the streak of
		 * trailing zeroes (plus the padding the previous region
		 * would need) is long enough to pay for a new header.
		 */
		if ((footprint_header->cf_last_zeroes +
		    roundup(((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
		    (sizeof(int) - 1),
		    sizeof(int))) <
		    (sizeof(*footprint_header))) {
			/*
			 * There are not enough trailing "zero" dispositions
			 * (+ the extra padding we would need for the previous
			 * region); creating a new region would not save space
			 * at this point, so let's keep this "zero" disposition
			 * in this region and reconsider later.
			 */
			continue;
		}
		/*
		 * Create a new region to avoid having too many consecutive
		 * "zero" dispositions.
		 */
		new_footprint_region =
		    vm_map_corpse_footprint_new_region(footprint_header);
		if (new_footprint_region == NULL) {
			goto over_the_edge;
		}
		footprint_region = new_footprint_region;
		/* initialize the new region as empty ... */
		footprint_region->cfr_num_pages = 0;
		/* ... and skip this "zero" disp */
		footprint_region->cfr_vaddr = va + effective_page_size;
	}

	return KERN_SUCCESS;

over_the_edge:
//	printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
	vm_map_corpse_footprint_full++;
	return KERN_RESOURCE_SHORTAGE;
}
21893 
/*
 * vm_map_corpse_footprint_collect_done:
 *	completes the footprint collection by getting rid of any remaining
 *	trailing "zero" dispositions and trimming the unused part of the
 *	kernel buffer
 */
void
vm_map_corpse_footprint_collect_done(
	vm_map_t        new_map)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	vm_size_t       buf_size, actual_size;
	kern_return_t   kr;

	assert(new_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    new_map->vmmap_corpse_footprint == NULL) {
		/* nothing was collected: nothing to finalize */
		return;
	}

	footprint_header = (struct vm_map_corpse_footprint_header *)
	    new_map->vmmap_corpse_footprint;
	buf_size = footprint_header->cf_size;

	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);

	/* get rid of trailing zeroes in last region */
	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* bytes actually used: up to and including the last region's data */
	actual_size = (vm_size_t)(footprint_header->cf_last_region +
	    sizeof(*footprint_region) +
	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));

//	printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
	/* update global footprint-size statistics */
	vm_map_corpse_footprint_size_avg =
	    (((vm_map_corpse_footprint_size_avg *
	    vm_map_corpse_footprint_count) +
	    actual_size) /
	    (vm_map_corpse_footprint_count + 1));
	vm_map_corpse_footprint_count++;
	if (actual_size > vm_map_corpse_footprint_size_max) {
		vm_map_corpse_footprint_size_max = actual_size;
	}

	actual_size = round_page(actual_size);
	if (buf_size > actual_size) {
		/*
		 * Release the unused tail of the buffer and turn the first
		 * unused page into the new trailing guard page.
		 */
		kr = vm_deallocate(kernel_map,
		    ((vm_address_t)footprint_header +
		    actual_size +
		    PAGE_SIZE),                 /* trailing guard page */
		    (buf_size - actual_size));
		assertf(kr == KERN_SUCCESS,
		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
		kr = vm_protect(kernel_map,
		    ((vm_address_t)footprint_header +
		    actual_size),
		    PAGE_SIZE,
		    FALSE,             /* set_maximum */
		    VM_PROT_NONE);
		assertf(kr == KERN_SUCCESS,
		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
	}

	footprint_header->cf_size = actual_size;
}
21972 
/*
 * vm_map_corpse_footprint_query_page_info:
 *	retrieves the disposition of the page at virtual address "vaddr"
 *	in the forked corpse's VM map
 *
 * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
 * The lookup starts from a per-buffer hint ("cf_hint_region") which is
 * updated without synchronization (racy but benign: a stale hint only
 * costs extra region hops).  A page not covered by any region reports
 * disposition 0 with KERN_SUCCESS.
 */
kern_return_t
vm_map_corpse_footprint_query_page_info(
	vm_map_t        map,
	vm_map_offset_t va,
	int             *disposition_p)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	uint32_t        footprint_region_offset;
	vm_map_offset_t region_start, region_end;
	int             disp_idx;
	kern_return_t   kr;
	int             effective_page_size;
	cf_disp_t       cf_disp;

	if (!map->has_corpse_footprint) {
		/* not a corpse map: caller should use the live-pmap path */
		*disposition_p = 0;
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	footprint_header = map->vmmap_corpse_footprint;
	if (footprint_header == NULL) {
		*disposition_p = 0;
//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	/* start looking at the hint ("cf_hint_region") */
	footprint_region_offset = footprint_header->cf_hint_region;

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

lookup_again:
	if (footprint_region_offset < sizeof(*footprint_header)) {
		/* hint too low: start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	if (footprint_region_offset >= footprint_header->cf_last_region) {
		/* hint too high: re-start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header + footprint_region_offset);
	region_start = footprint_region->cfr_vaddr;
	region_end = (region_start +
	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
	    effective_page_size));
	if (va < region_start &&
	    footprint_region_offset != sizeof(*footprint_header)) {
		/* our range starts before the hint region */

		/* reset the hint (in a racy way...) */
		footprint_header->cf_hint_region = sizeof(*footprint_header);
		/* lookup "va" again from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
		goto lookup_again;
	}

	/* walk forward through the regions until one covers "va" */
	while (va >= region_end) {
		if (footprint_region_offset >= footprint_header->cf_last_region) {
			break;
		}
		/* skip the region's header */
		footprint_region_offset += sizeof(*footprint_region);
		/* skip the region's page dispositions */
		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
		/* align to next word boundary */
		footprint_region_offset =
		    roundup(footprint_region_offset,
		    sizeof(int));
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header + footprint_region_offset);
		region_start = footprint_region->cfr_vaddr;
		region_end = (region_start +
		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
		    effective_page_size));
	}
	if (va < region_start || va >= region_end) {
		/* page not found */
		*disposition_p = 0;
//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_SUCCESS;
		goto done;
	}

	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
	footprint_header->cf_hint_region = footprint_region_offset;

	/* get page disposition for "va" in this region */
	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
	cf_disp = footprint_region->cfr_disposition[disp_idx];
	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
	kr = KERN_SUCCESS;
done:
//	if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
	DTRACE_VM4(footprint_query_page_info,
	    vm_map_t, map,
	    vm_map_offset_t, va,
	    int, *disposition_p,
	    kern_return_t, kr);

	return kr;
}
22086 
22087 void
vm_map_corpse_footprint_destroy(vm_map_t map)22088 vm_map_corpse_footprint_destroy(
22089 	vm_map_t        map)
22090 {
22091 	if (map->has_corpse_footprint &&
22092 	    map->vmmap_corpse_footprint != 0) {
22093 		struct vm_map_corpse_footprint_header *footprint_header;
22094 		vm_size_t buf_size;
22095 		kern_return_t kr;
22096 
22097 		footprint_header = map->vmmap_corpse_footprint;
22098 		buf_size = footprint_header->cf_size;
22099 		kr = vm_deallocate(kernel_map,
22100 		    (vm_offset_t) map->vmmap_corpse_footprint,
22101 		    ((vm_size_t) buf_size
22102 		    + PAGE_SIZE));                 /* trailing guard page */
22103 		assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
22104 		map->vmmap_corpse_footprint = 0;
22105 		map->has_corpse_footprint = FALSE;
22106 	}
22107 }
22108 
22109 /*
22110  * vm_map_copy_footprint_ledgers:
22111  *	copies any ledger that's relevant to the memory footprint of "old_task"
22112  *	into the forked corpse's task ("new_task")
22113  */
22114 void
vm_map_copy_footprint_ledgers(task_t old_task,task_t new_task)22115 vm_map_copy_footprint_ledgers(
22116 	task_t  old_task,
22117 	task_t  new_task)
22118 {
22119 	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
22120 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
22121 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
22122 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
22123 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
22124 	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
22125 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
22126 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
22127 	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
22128 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
22129 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
22130 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
22131 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
22132 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
22133 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
22134 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
22135 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
22136 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
22137 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
22138 	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
22139 }
22140 
22141 /*
22142  * vm_map_copy_ledger:
22143  *	copy a single ledger from "old_task" to "new_task"
22144  */
22145 void
vm_map_copy_ledger(task_t old_task,task_t new_task,int ledger_entry)22146 vm_map_copy_ledger(
22147 	task_t  old_task,
22148 	task_t  new_task,
22149 	int     ledger_entry)
22150 {
22151 	ledger_amount_t old_balance, new_balance, delta;
22152 
22153 	assert(new_task->map->has_corpse_footprint);
22154 	if (!new_task->map->has_corpse_footprint) {
22155 		return;
22156 	}
22157 
22158 	/* turn off sanity checks for the ledger we're about to mess with */
22159 	ledger_disable_panic_on_negative(new_task->ledger,
22160 	    ledger_entry);
22161 
22162 	/* adjust "new_task" to match "old_task" */
22163 	ledger_get_balance(old_task->ledger,
22164 	    ledger_entry,
22165 	    &old_balance);
22166 	ledger_get_balance(new_task->ledger,
22167 	    ledger_entry,
22168 	    &new_balance);
22169 	if (new_balance == old_balance) {
22170 		/* new == old: done */
22171 	} else if (new_balance > old_balance) {
22172 		/* new > old ==> new -= new - old */
22173 		delta = new_balance - old_balance;
22174 		ledger_debit(new_task->ledger,
22175 		    ledger_entry,
22176 		    delta);
22177 	} else {
22178 		/* new < old ==> new += old - new */
22179 		delta = old_balance - new_balance;
22180 		ledger_credit(new_task->ledger,
22181 		    ledger_entry,
22182 		    delta);
22183 	}
22184 }
22185 
22186 /*
22187  * vm_map_get_pmap:
22188  * returns the pmap associated with the vm_map
22189  */
22190 pmap_t
vm_map_get_pmap(vm_map_t map)22191 vm_map_get_pmap(vm_map_t map)
22192 {
22193 	return vm_map_pmap(map);
22194 }
22195 
22196 #if CONFIG_MAP_RANGES
22197 /*
22198  * vm_map_range_map_init:
22199  *  initializes the VM range ID map to enable index lookup
22200  *  of user VM ranges based on VM tag from userspace.
22201  */
22202 static void
vm_map_range_map_init(void)22203 vm_map_range_map_init(void)
22204 {
22205 	/* maintain status quo by default */
22206 	for (int i = 0; i < VM_MEMORY_COUNT; i++) {
22207 		vm_map_range_id_map[i] = UMEM_RANGE_ID_DEFAULT;
22208 	}
22209 
22210 	/* move all MALLOC allocations to heap range  */
22211 	vm_map_range_id_map[VM_MEMORY_MALLOC] = UMEM_RANGE_ID_HEAP;
22212 	vm_map_range_id_map[VM_MEMORY_MALLOC_HUGE] = UMEM_RANGE_ID_HEAP;
22213 	vm_map_range_id_map[VM_MEMORY_MALLOC_LARGE] = UMEM_RANGE_ID_HEAP;
22214 	vm_map_range_id_map[VM_MEMORY_MALLOC_LARGE_REUSABLE] = UMEM_RANGE_ID_HEAP;
22215 	vm_map_range_id_map[VM_MEMORY_MALLOC_LARGE_REUSED] = UMEM_RANGE_ID_HEAP;
22216 	vm_map_range_id_map[VM_MEMORY_MALLOC_MEDIUM] = UMEM_RANGE_ID_HEAP;
22217 	vm_map_range_id_map[VM_MEMORY_MALLOC_NANO] = UMEM_RANGE_ID_HEAP;
22218 	vm_map_range_id_map[VM_MEMORY_MALLOC_PGUARD] = UMEM_RANGE_ID_HEAP;
22219 	vm_map_range_id_map[VM_MEMORY_MALLOC_PROB_GUARD] = UMEM_RANGE_ID_HEAP;
22220 	vm_map_range_id_map[VM_MEMORY_MALLOC_SMALL] = UMEM_RANGE_ID_HEAP;
22221 	vm_map_range_id_map[VM_MEMORY_MALLOC_TINY] = UMEM_RANGE_ID_HEAP;
22222 }
22223 
/*
 * vm_map_range_configure:
 *	configures the user vm_map ranges by increasing the maximum VA range of
 *  the map and carving out a range at the end of VA space (searching backwards
 *  in the newly expanded map).
 *
 * Returns KERN_NO_SPACE if the map is already jumbo or the expanded
 * space is too small to host a heap range; KERN_SUCCESS otherwise
 * (including the no-op case when user ranges are disabled).
 */
kern_return_t
vm_map_range_configure(vm_map_t map)
{
	vm_map_size_t           addr_space_size;
	vm_map_offset_t         start, end, saved_max, random_addr;

	if (!vm_map_user_ranges) {
		/* feature disabled: nothing to configure */
		return KERN_SUCCESS;
	}

	/* Should not be applying ranges to kernel map or kernel map submaps */
	assert(map != kernel_map);
	assert(vm_map_pmap(map) != kernel_pmap);

	/* save the existing max offset */
	vm_map_lock_read(map);
	saved_max = map->max_offset;
	vm_map_unlock_read(map);

	/*
	 * Check that we're not already jumbo'd. If so we cannot guarantee that
	 * we can set up the ranges safely without interfering with the existing
	 * map.
	 */
	if (saved_max > vm_compute_max_offset(vm_map_is_64bit(map))) {
		return KERN_NO_SPACE;
	}

	/* expand the default VM space to the largest possible address */
	vm_map_set_jumbo(map);

	vm_map_lock(map);
	/* size of the VA space gained by going jumbo */
	addr_space_size = map->max_offset - saved_max;

	if (addr_space_size <= VM_MAP_USER_RANGE_MAX) {
		/* not enough new space to carve out a heap range */
		vm_map_unlock(map);
		return KERN_NO_SPACE;
	}

	/* pick a random, page-aligned offset within the new space */
	addr_space_size -= VM_MAP_USER_RANGE_MAX;
	random_addr = (vm_map_offset_t)random();
	random_addr <<= VM_MAP_PAGE_SHIFT(map);
	random_addr %= addr_space_size;

	/*
	 * round off the start so we begin on a L2 TT boundary and ensure we have
	 * at least a ARM_TT_L2_SIZE sized hole between existing map range and
	 * new range(s).
	 */
	start = vm_map_round_page(saved_max + random_addr + 1, ARM_TT_L2_OFFMASK);
	end = MIN(map->max_offset, start + VM_MAP_USER_RANGE_MAX);
	assert(start > saved_max);
	assert(end <= map->max_offset);

	/* default range covers the "normal" heap range */
	map->user_range[UMEM_RANGE_ID_DEFAULT].min_address = map->min_offset;
	map->user_range[UMEM_RANGE_ID_DEFAULT].max_address = saved_max;

	/* heap range covers the new extended range */
	map->user_range[UMEM_RANGE_ID_HEAP].min_address = start;
	map->user_range[UMEM_RANGE_ID_HEAP].max_address = end;
	map->uses_user_ranges = true;
	vm_map_unlock(map);

	return KERN_SUCCESS;
}
22296 
22297 /*
22298  * vm_map_range_fork:
22299  *	clones the array of ranges from old_map to new_map in support
22300  *  of a VM map fork.
22301  */
22302 void
vm_map_range_fork(vm_map_t new_map,vm_map_t old_map)22303 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
22304 {
22305 	int i = 0;
22306 
22307 	if (!old_map->uses_user_ranges) {
22308 		/* nothing to do */
22309 		return;
22310 	}
22311 
22312 	for (i = 0; i < UMEM_RANGE_COUNT; i++) {
22313 		new_map->user_range[i].min_address = old_map->user_range[i].min_address;
22314 		new_map->user_range[i].max_address = old_map->user_range[i].max_address;
22315 	}
22316 
22317 	new_map->uses_user_ranges = true;
22318 }
22319 
22320 /*
22321  * vm_map_get_user_range_id:
22322  *	looks up the vm_map_range_id_map lookup table to determine which range ID to
22323  *  utilize for any given user memory tag. If no ranges are present return the
22324  *  default range.
22325  */
22326 __attribute__((overloadable))
22327 vm_map_range_id_t
vm_map_get_user_range_id(vm_map_t map,uint16_t tag)22328 vm_map_get_user_range_id(vm_map_t map, uint16_t tag)
22329 {
22330 	vm_map_range_id_t range_id = UMEM_RANGE_ID_DEFAULT;
22331 
22332 	if (map != NULL && map->uses_user_ranges && tag < VM_MEMORY_COUNT) {
22333 		range_id = vm_map_range_id_map[tag];
22334 	}
22335 
22336 	return range_id;
22337 }
22338 
22339 /*
22340  * vm_map_get_user_range_id:
22341  *	determines which range ID the given addr/size combination maps to. If
22342  *  range ID cannot be determined return the default range.
22343  */
22344 __attribute__((overloadable))
22345 vm_map_range_id_t
vm_map_get_user_range_id(vm_map_t map,mach_vm_offset_t addr,mach_vm_size_t size)22346 vm_map_get_user_range_id(
22347 	vm_map_t                map,
22348 	mach_vm_offset_t        addr,
22349 	mach_vm_size_t          size)
22350 {
22351 	vm_map_range_id_t range_id = UMEM_RANGE_ID_MAX;
22352 
22353 	if (map == NULL || !map->uses_user_ranges) {
22354 		return UMEM_RANGE_ID_DEFAULT;
22355 	}
22356 
22357 	for (; range_id > UMEM_RANGE_ID_DEFAULT; --range_id) {
22358 		if (mach_vm_range_contains(&map->user_range[range_id], addr, size)) {
22359 			break;
22360 		}
22361 	}
22362 
22363 	assert(range_id < UMEM_RANGE_COUNT);
22364 	return range_id;
22365 }
22366 
22367 /*
22368  * vm_map_get_user_range:
22369  *	copy the VM user range for the given VM map and range ID.
22370  */
22371 kern_return_t
vm_map_get_user_range(vm_map_t map,vm_map_range_id_t range_id,mach_vm_range_t range)22372 vm_map_get_user_range(
22373 	vm_map_t                map,
22374 	vm_map_range_id_t       range_id,
22375 	mach_vm_range_t         range)
22376 {
22377 	if (map == NULL ||
22378 	    !map->uses_user_ranges ||
22379 	    range_id > UMEM_RANGE_ID_MAX ||
22380 	    range == NULL) {
22381 		return KERN_INVALID_ARGUMENT;
22382 	}
22383 
22384 	*range = map->user_range[range_id];
22385 	return KERN_SUCCESS;
22386 }
22387 #endif /* CONFIG_MAP_RANGES */
22388 
/*
 * vm_map_entry_has_device_pager:
 * Check if the vm map entry specified by the virtual address has a device pager.
 * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
 * Descends through submaps until a terminal (non-submap) entry is found.
 */
boolean_t
vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
{
	vm_map_entry_t entry;
	vm_object_t object;
	boolean_t result;

	if (map == NULL) {
		return FALSE;
	}

	vm_map_lock(map);
	while (TRUE) {
		if (!vm_map_lookup_entry(map, vaddr, &entry)) {
			/* no entry covers vaddr in the current map */
			result = FALSE;
			break;
		}
		if (entry->is_sub_map) {
			// Check the submap
			vm_map_t submap = VME_SUBMAP(entry);
			assert(submap != NULL);
			/*
			 * Hand-over-hand locking: take the submap's lock before
			 * releasing the parent's so the submap entry cannot be
			 * torn down in between.
			 */
			vm_map_lock(submap);
			vm_map_unlock(map);
			map = submap;
			/*
			 * NOTE(review): vaddr is reused unchanged in the submap;
			 * this assumes the submap is mapped at the same virtual
			 * addresses (no VME_OFFSET/vme_start rebasing) -- confirm
			 * for non-identity submaps.
			 */
			continue;
		}
		object = VME_OBJECT(entry);
		if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
			result = TRUE;
			break;
		}
		/* terminal entry is not backed by a device pager */
		result = FALSE;
		break;
	}

	/* "map" may now refer to an inner submap; unlock whichever lock is held */
	vm_map_unlock(map);
	return result;
}
22432 
22433 
22434 #if MACH_ASSERT
22435 
extern int pmap_ledgers_panic;
extern int pmap_ledgers_panic_leeway;

/*
 * LEDGER_DRIFT:
 *	expands to the drift counters tracked for one ledger entry: how many
 *	checked pmaps were over/under a zero balance, the cumulative imbalance
 *	in each direction, and the single worst imbalance observed.
 */
#define LEDGER_DRIFT(__LEDGER)                    \
	int             __LEDGER##_over;          \
	ledger_amount_t __LEDGER##_over_total;    \
	ledger_amount_t __LEDGER##_over_max;      \
	int             __LEDGER##_under;         \
	ledger_amount_t __LEDGER##_under_total;   \
	ledger_amount_t __LEDGER##_under_max

/*
 * Global accumulator of ledger-drift statistics across every pmap checked by
 * vm_map_pmap_check_ledgers() (MACH_ASSERT builds only).
 */
struct {
	uint64_t        num_pmaps_checked;

	LEDGER_DRIFT(phys_footprint);
	LEDGER_DRIFT(internal);
	LEDGER_DRIFT(internal_compressed);
	LEDGER_DRIFT(external);
	LEDGER_DRIFT(reusable);
	LEDGER_DRIFT(iokit_mapped);
	LEDGER_DRIFT(alternate_accounting);
	LEDGER_DRIFT(alternate_accounting_compressed);
	LEDGER_DRIFT(page_table);
	LEDGER_DRIFT(purgeable_volatile);
	LEDGER_DRIFT(purgeable_nonvolatile);
	LEDGER_DRIFT(purgeable_volatile_compressed);
	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
	LEDGER_DRIFT(tagged_nofootprint);
	LEDGER_DRIFT(tagged_footprint);
	LEDGER_DRIFT(tagged_nofootprint_compressed);
	LEDGER_DRIFT(tagged_footprint_compressed);
	LEDGER_DRIFT(network_volatile);
	LEDGER_DRIFT(network_nonvolatile);
	LEDGER_DRIFT(network_volatile_compressed);
	LEDGER_DRIFT(network_nonvolatile_compressed);
	LEDGER_DRIFT(media_nofootprint);
	LEDGER_DRIFT(media_footprint);
	LEDGER_DRIFT(media_nofootprint_compressed);
	LEDGER_DRIFT(media_footprint_compressed);
	LEDGER_DRIFT(graphics_nofootprint);
	LEDGER_DRIFT(graphics_footprint);
	LEDGER_DRIFT(graphics_nofootprint_compressed);
	LEDGER_DRIFT(graphics_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint);
	LEDGER_DRIFT(neural_footprint);
	LEDGER_DRIFT(neural_nofootprint_compressed);
	LEDGER_DRIFT(neural_footprint_compressed);
} pmap_ledgers_drift;
22484 
/*
 * vm_map_pmap_check_ledgers:
 *	verifies, when a pmap is being destroyed, that each per-task ledger
 *	entry it maintains has drained back to a zero balance. Non-zero
 *	balances are logged and accumulated into pmap_ledgers_drift, and may
 *	trigger a panic depending on the entry's panic-on-negative setting and
 *	the global pmap_ledgers_panic / pmap_ledgers_panic_leeway knobs.
 *	(MACH_ASSERT builds only.)
 */
void
vm_map_pmap_check_ledgers(
	pmap_t          pmap,
	ledger_t        ledger,
	int             pid,
	char            *procname)
{
	ledger_amount_t bal;
	boolean_t       do_panic;

	do_panic = FALSE;

	pmap_ledgers_drift.num_pmaps_checked++;

/*
 * LEDGER_CHECK_BALANCE:
 *	fetch the balance of one ledger entry; if non-zero, decide whether a
 *	panic is warranted (the entry's own panic_on_negative flag, or the
 *	global panic knob when the imbalance exceeds the configured page
 *	leeway), log the imbalance, and fold it into pmap_ledgers_drift.
 *	NOTE(review): panic_on_negative triggers do_panic for positive
 *	balances too -- presumably intentional (any imbalance is a bug);
 *	confirm.
 */
#define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
MACRO_BEGIN                                                             \
	int panic_on_negative = TRUE;                                   \
	ledger_get_balance(ledger,                                      \
	                   task_ledgers.__LEDGER,                       \
	                   &bal);                                       \
	ledger_get_panic_on_negative(ledger,                            \
	                             task_ledgers.__LEDGER,             \
	                             &panic_on_negative);               \
	if (bal != 0) {                                                 \
	        if (panic_on_negative ||                                \
	            (pmap_ledgers_panic &&                              \
	             pmap_ledgers_panic_leeway > 0 &&                   \
	             (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
	              bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
	                do_panic = TRUE;                                \
	        }                                                       \
	        printf("LEDGER BALANCE proc %d (%s) "                   \
	               "\"%s\" = %lld\n",                               \
	               pid, procname, #__LEDGER, bal);                  \
	        if (bal > 0) {                                          \
	                pmap_ledgers_drift.__LEDGER##_over++;           \
	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
	                }                                               \
	        } else if (bal < 0) {                                   \
	                pmap_ledgers_drift.__LEDGER##_under++;          \
	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
	                }                                               \
	        }                                                       \
	}                                                               \
MACRO_END

	LEDGER_CHECK_BALANCE(phys_footprint);
	LEDGER_CHECK_BALANCE(internal);
	LEDGER_CHECK_BALANCE(internal_compressed);
	LEDGER_CHECK_BALANCE(external);
	LEDGER_CHECK_BALANCE(reusable);
	LEDGER_CHECK_BALANCE(iokit_mapped);
	LEDGER_CHECK_BALANCE(alternate_accounting);
	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
	LEDGER_CHECK_BALANCE(page_table);
	LEDGER_CHECK_BALANCE(purgeable_volatile);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(tagged_nofootprint);
	LEDGER_CHECK_BALANCE(tagged_footprint);
	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
	LEDGER_CHECK_BALANCE(network_volatile);
	LEDGER_CHECK_BALANCE(network_nonvolatile);
	LEDGER_CHECK_BALANCE(network_volatile_compressed);
	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(media_nofootprint);
	LEDGER_CHECK_BALANCE(media_footprint);
	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(media_footprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_nofootprint);
	LEDGER_CHECK_BALANCE(graphics_footprint);
	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint);
	LEDGER_CHECK_BALANCE(neural_footprint);
	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(neural_footprint_compressed);

	/* panic only when the boot-arg asks for it; otherwise just log */
	if (do_panic) {
		if (pmap_ledgers_panic) {
			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
			    pmap, pid, procname);
		} else {
			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
			    pmap, pid, procname);
		}
	}
}
22579 
22580 void
vm_map_pmap_set_process(vm_map_t map,int pid,char * procname)22581 vm_map_pmap_set_process(
22582 	vm_map_t map,
22583 	int pid,
22584 	char *procname)
22585 {
22586 	pmap_set_process(vm_map_pmap(map), pid, procname);
22587 }
22588 
22589 #endif /* MACH_ASSERT */
22590