xref: /xnu-8792.81.2/osfmk/vm/vm_map.c (revision 19c3b8c28c31cb8130e034cfb5df6bf9ba342d90)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_map.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Virtual memory mapping module.
64  */
65 
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68 
69 #include <vm/vm_options.h>
70 
71 #include <libkern/OSAtomic.h>
72 
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83 
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 
91 #include <vm/cpm.h>
92 #include <vm/vm_compressor.h>
93 #include <vm/vm_compressor_pager.h>
94 #include <vm/vm_init.h>
95 #include <vm/vm_fault.h>
96 #include <vm/vm_map_internal.h>
97 #include <vm/vm_object.h>
98 #include <vm/vm_page.h>
99 #include <vm/vm_pageout.h>
100 #include <vm/pmap.h>
101 #include <vm/vm_kern.h>
102 #include <ipc/ipc_port.h>
103 #include <kern/sched_prim.h>
104 #include <kern/misc_protos.h>
105 
106 #include <mach/vm_map_server.h>
107 #include <mach/mach_host_server.h>
108 #include <vm/vm_protos.h>
109 #include <vm/vm_purgeable_internal.h>
110 #include <vm/vm_reclaim_internal.h>
111 
112 #include <vm/vm_protos.h>
113 #include <vm/vm_shared_region.h>
114 #include <vm/vm_map_store.h>
115 
116 #include <san/kasan.h>
117 
118 #include <sys/resource.h>
119 #include <sys/codesign.h>
120 #include <sys/code_signing.h>
121 #include <sys/mman.h>
122 #include <sys/reboot.h>
123 #include <sys/kdebug_triage.h>
124 
125 #include <libkern/section_keywords.h>
126 
127 #if DEVELOPMENT || DEBUG
128 extern int proc_selfcsflags(void);
129 int panic_on_unsigned_execute = 0;
130 int panic_on_mlock_failure = 0;
131 #endif /* DEVELOPMENT || DEBUG */
132 
133 #if MACH_ASSERT
134 int debug4k_filter = 0;
135 char debug4k_proc_name[1024] = "";
136 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
137 int debug4k_panic_on_misaligned_sharing = 0;
138 const char *debug4k_category_name[] = {
139 	"error",        /* 0 */
140 	"life",         /* 1 */
141 	"load",         /* 2 */
142 	"fault",        /* 3 */
143 	"copy",         /* 4 */
144 	"share",        /* 5 */
145 	"adjust",       /* 6 */
146 	"pmap",         /* 7 */
147 	"mementry",     /* 8 */
148 	"iokit",        /* 9 */
149 	"upl",          /* 10 */
150 	"exc",          /* 11 */
151 	"vfs"           /* 12 */
152 };
153 #endif /* MACH_ASSERT */
154 int debug4k_no_cow_copyin = 0;
155 
156 
157 #if __arm64__
158 extern const int fourk_binary_compatibility_unsafe;
159 extern const int fourk_binary_compatibility_allow_wx;
160 #endif /* __arm64__ */
161 extern int proc_selfpid(void);
162 extern char *proc_name_address(void *p);
163 
164 #if VM_MAP_DEBUG_APPLE_PROTECT
165 int vm_map_debug_apple_protect = 0;
166 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
167 #if VM_MAP_DEBUG_FOURK
168 int vm_map_debug_fourk = 0;
169 #endif /* VM_MAP_DEBUG_FOURK */
170 
171 #if DEBUG || DEVELOPMENT
172 static TUNABLE(bool, vm_map_executable_immutable,
173     "vm_map_executable_immutable", true);
174 #else
175 #define vm_map_executable_immutable true
176 #endif
177 
178 #if CONFIG_MAP_RANGES
179 static TUNABLE(bool, vm_map_user_ranges, "vm_map_user_ranges", true);
180 static SECURITY_READ_ONLY_LATE(uint8_t) vm_map_range_id_map[VM_MEMORY_COUNT];
181 #endif
182 
183 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
184 
185 extern u_int32_t random(void);  /* from <libkern/libkern.h> */
186 /* Internal prototypes
187  */
188 
189 typedef struct vm_map_zap {
190 	vm_map_entry_t          vmz_head;
191 	vm_map_entry_t         *vmz_tail;
192 } *vm_map_zap_t;
193 
194 #define VM_MAP_ZAP_DECLARE(zap) \
195 	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
196 
197 static vm_map_entry_t   vm_map_entry_insert(
198 	vm_map_t                map,
199 	vm_map_entry_t          insp_entry,
200 	vm_map_offset_t         start,
201 	vm_map_offset_t         end,
202 	vm_object_t             object,
203 	vm_object_offset_t      offset,
204 	vm_map_kernel_flags_t   vmk_flags,
205 	boolean_t               needs_copy,
206 	vm_prot_t               cur_protection,
207 	vm_prot_t               max_protection,
208 	vm_inherit_t            inheritance,
209 	boolean_t               no_cache,
210 	boolean_t               permanent,
211 	unsigned int            superpage_size,
212 	boolean_t               clear_map_aligned,
213 	int                     alias);
214 
215 static void vm_map_simplify_range(
216 	vm_map_t        map,
217 	vm_map_offset_t start,
218 	vm_map_offset_t end);   /* forward */
219 
220 static boolean_t        vm_map_range_check(
221 	vm_map_t        map,
222 	vm_map_offset_t start,
223 	vm_map_offset_t end,
224 	vm_map_entry_t  *entry);
225 
226 static void vm_map_submap_pmap_clean(
227 	vm_map_t        map,
228 	vm_map_offset_t start,
229 	vm_map_offset_t end,
230 	vm_map_t        sub_map,
231 	vm_map_offset_t offset);
232 
233 static void             vm_map_pmap_enter(
234 	vm_map_t                map,
235 	vm_map_offset_t         addr,
236 	vm_map_offset_t         end_addr,
237 	vm_object_t             object,
238 	vm_object_offset_t      offset,
239 	vm_prot_t               protection);
240 
241 static void             _vm_map_clip_end(
242 	struct vm_map_header    *map_header,
243 	vm_map_entry_t          entry,
244 	vm_map_offset_t         end);
245 
246 static void             _vm_map_clip_start(
247 	struct vm_map_header    *map_header,
248 	vm_map_entry_t          entry,
249 	vm_map_offset_t         start);
250 
251 static kmem_return_t vm_map_delete(
252 	vm_map_t        map,
253 	vm_map_offset_t start,
254 	vm_map_offset_t end,
255 	vmr_flags_t     flags,
256 	kmem_guard_t    guard,
257 	vm_map_zap_t    zap);
258 
259 static void             vm_map_copy_insert(
260 	vm_map_t        map,
261 	vm_map_entry_t  after_where,
262 	vm_map_copy_t   copy);
263 
264 static kern_return_t    vm_map_copy_overwrite_unaligned(
265 	vm_map_t        dst_map,
266 	vm_map_entry_t  entry,
267 	vm_map_copy_t   copy,
268 	vm_map_address_t start,
269 	boolean_t       discard_on_success);
270 
271 static kern_return_t    vm_map_copy_overwrite_aligned(
272 	vm_map_t        dst_map,
273 	vm_map_entry_t  tmp_entry,
274 	vm_map_copy_t   copy,
275 	vm_map_offset_t start,
276 	pmap_t          pmap);
277 
278 static kern_return_t    vm_map_copyin_kernel_buffer(
279 	vm_map_t        src_map,
280 	vm_map_address_t src_addr,
281 	vm_map_size_t   len,
282 	boolean_t       src_destroy,
283 	vm_map_copy_t   *copy_result);  /* OUT */
284 
285 static kern_return_t    vm_map_copyout_kernel_buffer(
286 	vm_map_t        map,
287 	vm_map_address_t *addr, /* IN/OUT */
288 	vm_map_copy_t   copy,
289 	vm_map_size_t   copy_size,
290 	boolean_t       overwrite,
291 	boolean_t       consume_on_success);
292 
293 static void             vm_map_fork_share(
294 	vm_map_t        old_map,
295 	vm_map_entry_t  old_entry,
296 	vm_map_t        new_map);
297 
298 static boolean_t        vm_map_fork_copy(
299 	vm_map_t        old_map,
300 	vm_map_entry_t  *old_entry_p,
301 	vm_map_t        new_map,
302 	int             vm_map_copyin_flags);
303 
304 static kern_return_t    vm_map_wire_nested(
305 	vm_map_t                   map,
306 	vm_map_offset_t            start,
307 	vm_map_offset_t            end,
308 	vm_prot_t                  caller_prot,
309 	vm_tag_t                   tag,
310 	boolean_t                  user_wire,
311 	pmap_t                     map_pmap,
312 	vm_map_offset_t            pmap_addr,
313 	ppnum_t                    *physpage_p);
314 
315 static kern_return_t    vm_map_unwire_nested(
316 	vm_map_t                   map,
317 	vm_map_offset_t            start,
318 	vm_map_offset_t            end,
319 	boolean_t                  user_wire,
320 	pmap_t                     map_pmap,
321 	vm_map_offset_t            pmap_addr);
322 
323 static kern_return_t    vm_map_overwrite_submap_recurse(
324 	vm_map_t                   dst_map,
325 	vm_map_offset_t            dst_addr,
326 	vm_map_size_t              dst_size);
327 
328 static kern_return_t    vm_map_copy_overwrite_nested(
329 	vm_map_t                   dst_map,
330 	vm_map_offset_t            dst_addr,
331 	vm_map_copy_t              copy,
332 	boolean_t                  interruptible,
333 	pmap_t                     pmap,
334 	boolean_t                  discard_on_success);
335 
336 static kern_return_t    vm_map_remap_extract(
337 	vm_map_t                map,
338 	vm_map_offset_t         addr,
339 	vm_map_size_t           size,
340 	boolean_t               copy,
341 	struct vm_map_header    *map_header,
342 	vm_prot_t               *cur_protection,
343 	vm_prot_t               *max_protection,
344 	vm_inherit_t            inheritance,
345 	vm_map_kernel_flags_t   vmk_flags);
346 
347 static kern_return_t    vm_map_remap_range_allocate(
348 	vm_map_t                map,
349 	vm_map_address_t        *address,
350 	vm_map_size_t           size,
351 	vm_map_offset_t         mask,
352 	int                     flags,
353 	vm_map_kernel_flags_t   vmk_flags,
354 	vm_tag_t                tag,
355 	vm_map_entry_t          *map_entry,
356 	vm_map_zap_t            zap_list);
357 
358 static void             vm_map_region_look_for_page(
359 	vm_map_t                   map,
360 	vm_map_offset_t            va,
361 	vm_object_t                object,
362 	vm_object_offset_t         offset,
363 	int                        max_refcnt,
364 	unsigned short             depth,
365 	vm_region_extended_info_t  extended,
366 	mach_msg_type_number_t count);
367 
368 static int              vm_map_region_count_obj_refs(
369 	vm_map_entry_t             entry,
370 	vm_object_t                object);
371 
372 
373 static kern_return_t    vm_map_willneed(
374 	vm_map_t        map,
375 	vm_map_offset_t start,
376 	vm_map_offset_t end);
377 
378 static kern_return_t    vm_map_reuse_pages(
379 	vm_map_t        map,
380 	vm_map_offset_t start,
381 	vm_map_offset_t end);
382 
383 static kern_return_t    vm_map_reusable_pages(
384 	vm_map_t        map,
385 	vm_map_offset_t start,
386 	vm_map_offset_t end);
387 
388 static kern_return_t    vm_map_can_reuse(
389 	vm_map_t        map,
390 	vm_map_offset_t start,
391 	vm_map_offset_t end);
392 
393 #if MACH_ASSERT
394 static kern_return_t    vm_map_pageout(
395 	vm_map_t        map,
396 	vm_map_offset_t start,
397 	vm_map_offset_t end);
398 #endif /* MACH_ASSERT */
399 
400 kern_return_t vm_map_corpse_footprint_collect(
401 	vm_map_t        old_map,
402 	vm_map_entry_t  old_entry,
403 	vm_map_t        new_map);
404 void vm_map_corpse_footprint_collect_done(
405 	vm_map_t        new_map);
406 void vm_map_corpse_footprint_destroy(
407 	vm_map_t        map);
408 kern_return_t vm_map_corpse_footprint_query_page_info(
409 	vm_map_t        map,
410 	vm_map_offset_t va,
411 	int             *disposition_p);
412 void vm_map_footprint_query_page_info(
413 	vm_map_t        map,
414 	vm_map_entry_t  map_entry,
415 	vm_map_offset_t curr_s_offset,
416 	int             *disposition_p);
417 
418 #if CONFIG_MAP_RANGES
419 static void vm_map_range_map_init(void);
420 #endif /* CONFIG_MAP_RANGES */
421 
422 pid_t find_largest_process_vm_map_entries(void);
423 
424 extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code,
425     mach_exception_data_type_t subcode);
426 
427 /*
428  * Macros to copy a vm_map_entry. We must be careful to correctly
429  * manage the wired page count. vm_map_entry_copy() creates a new
430  * map entry to the same memory - the wired count in the new entry
431  * must be set to zero. vm_map_entry_copy_full() creates a new
432  * entry that is identical to the old entry.  This preserves the
433  * wire count; it's used for map splitting and zone changing in
434  * vm_map_copyout.
435  */
436 
437 static inline void
vm_map_entry_copy_pmap_cs_assoc(vm_map_t map __unused,vm_map_entry_t new __unused,vm_map_entry_t old __unused)438 vm_map_entry_copy_pmap_cs_assoc(
439 	vm_map_t map __unused,
440 	vm_map_entry_t new __unused,
441 	vm_map_entry_t old __unused)
442 {
443 	/* when pmap_cs is not enabled, assert as a sanity check */
444 	assert(new->pmap_cs_associated == FALSE);
445 }
446 
447 /*
448  * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
449  * But for security reasons on some platforms, we don't want the
450  * new mapping to be "used for jit", so we reset the flag here.
451  */
452 static inline void
vm_map_entry_copy_code_signing(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old __unused)453 vm_map_entry_copy_code_signing(
454 	vm_map_t map,
455 	vm_map_entry_t new,
456 	vm_map_entry_t old __unused)
457 {
458 	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
459 		assert(new->used_for_jit == old->used_for_jit);
460 	} else {
461 		new->used_for_jit = FALSE;
462 	}
463 }
464 
/*
 * vm_map_entry_copy_full:
 *	Copy every field of "old" into "new", preserving the wired
 *	counts (unlike vm_map_entry_copy).  Used for map splitting and
 *	zone changing in vm_map_copyout.
 */
static inline void
vm_map_entry_copy_full(
	vm_map_entry_t new,
	vm_map_entry_t old)
{
#if MAP_ENTRY_CREATION_DEBUG
	/*
	 * Drop the backtrace ref "new" currently holds; the struct
	 * assignment below copies old's ref, which we retain here.
	 */
	btref_put(new->vme_creation_bt);
	btref_retain(old->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(new->vme_insertion_bt);
	btref_retain(old->vme_insertion_bt);
#endif
	*new = *old;
}
480 
/*
 * vm_map_entry_copy:
 *	Create a copy of "old" in "new" describing a new mapping of the
 *	same memory: after the full copy, reset the wired counts and the
 *	per-mapping state flags that must not be inherited by the copy.
 */
static inline void
vm_map_entry_copy(
	vm_map_t map,
	vm_map_entry_t new,
	vm_map_entry_t old)
{
	vm_map_entry_copy_full(new, old);

	/* the new mapping starts out unshared, unwired and quiescent */
	new->is_shared = FALSE;
	new->needs_wakeup = FALSE;
	new->in_transition = FALSE;
	new->wired_count = 0;
	new->user_wired_count = 0;
	new->vme_permanent = FALSE;
	vm_map_entry_copy_code_signing(map, new, old);
	vm_map_entry_copy_pmap_cs_assoc(map, new, old);
	if (new->iokit_acct) {
		/* IOKit accounting does not carry over to the copy */
		assertf(!new->use_pmap, "old %p new %p\n", old, new);
		new->iokit_acct = FALSE;
		new->use_pmap = TRUE;
	}
	new->vme_resilient_codesign = FALSE;
	new->vme_resilient_media = FALSE;
	new->vme_atomic = FALSE;
	new->vme_no_copy_on_read = FALSE;
}
507 
508 /*
509  * Normal lock_read_to_write() returns FALSE/0 on failure.
510  * These functions evaluate to zero on success and non-zero value on failure.
511  */
512 __attribute__((always_inline))
513 int
vm_map_lock_read_to_write(vm_map_t map)514 vm_map_lock_read_to_write(vm_map_t map)
515 {
516 	if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
517 		DTRACE_VM(vm_map_lock_upgrade);
518 		return 0;
519 	}
520 	return 1;
521 }
522 
523 __attribute__((always_inline))
524 boolean_t
vm_map_try_lock(vm_map_t map)525 vm_map_try_lock(vm_map_t map)
526 {
527 	if (lck_rw_try_lock_exclusive(&(map)->lock)) {
528 		DTRACE_VM(vm_map_lock_w);
529 		return TRUE;
530 	}
531 	return FALSE;
532 }
533 
534 __attribute__((always_inline))
535 boolean_t
vm_map_try_lock_read(vm_map_t map)536 vm_map_try_lock_read(vm_map_t map)
537 {
538 	if (lck_rw_try_lock_shared(&(map)->lock)) {
539 		DTRACE_VM(vm_map_lock_r);
540 		return TRUE;
541 	}
542 	return FALSE;
543 }
544 
545 /*!
546  * @function kdp_vm_map_is_acquired_exclusive
547  *
548  * @abstract
549  * Checks if vm map is acquired exclusive.
550  *
551  * @discussion
552  * NOT SAFE: To be used only by kernel debugger.
553  *
554  * @param map map to check
555  *
556  * @returns TRUE if the map is acquired exclusively.
557  */
boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)
{
	/* lockless peek at the rw-lock state; debugger-only (see above) */
	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
}
563 
564 /*
565  * Routines to get the page size the caller should
566  * use while inspecting the target address space.
567  * Use the "_safely" variant if the caller is dealing with a user-provided
568  * array whose size depends on the page size, to avoid any overflow or
569  * underflow of a user-allocated buffer.
570  */
571 int
vm_self_region_page_shift_safely(vm_map_t target_map)572 vm_self_region_page_shift_safely(
573 	vm_map_t target_map)
574 {
575 	int effective_page_shift = 0;
576 
577 	if (PAGE_SIZE == (4096)) {
578 		/* x86_64 and 4k watches: always use 4k */
579 		return PAGE_SHIFT;
580 	}
581 	/* did caller provide an explicit page size for this thread to use? */
582 	effective_page_shift = thread_self_region_page_shift();
583 	if (effective_page_shift) {
584 		/* use the explicitly-provided page size */
585 		return effective_page_shift;
586 	}
587 	/* no explicit page size: use the caller's page size... */
588 	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
589 	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
590 		/* page size match: safe to use */
591 		return effective_page_shift;
592 	}
593 	/* page size mismatch */
594 	return -1;
595 }
596 int
vm_self_region_page_shift(vm_map_t target_map)597 vm_self_region_page_shift(
598 	vm_map_t target_map)
599 {
600 	int effective_page_shift;
601 
602 	effective_page_shift = vm_self_region_page_shift_safely(target_map);
603 	if (effective_page_shift == -1) {
604 		/* no safe value but OK to guess for caller */
605 		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
606 		    VM_MAP_PAGE_SHIFT(target_map));
607 	}
608 	return effective_page_shift;
609 }
610 
611 
612 /*
613  *	Decide if we want to allow processes to execute from their data or stack areas.
614  *	override_nx() returns true if we do.  Data/stack execution can be enabled independently
615  *	for 32 and 64 bit processes.  Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
616  *	or allow_stack_exec to enable data execution for that type of data area for that particular
617  *	ABI (or both by or'ing the flags together).  These are initialized in the architecture
618  *	specific pmap files since the default behavior varies according to architecture.  The
619  *	main reason it varies is because of the need to provide binary compatibility with old
620  *	applications that were written before these restrictions came into being.  In the old
621  *	days, an app could execute anything it could read, but this has slowly been tightened
622  *	up over time.  The default behavior is:
623  *
624  *	32-bit PPC apps		may execute from both stack and data areas
 625  *	32-bit Intel apps	may execute from data areas but not stack
626  *	64-bit PPC/Intel apps	may not execute from either data or stack
627  *
628  *	An application on any architecture may override these defaults by explicitly
629  *	adding PROT_EXEC permission to the page in question with the mprotect(2)
630  *	system call.  This code here just determines what happens when an app tries to
631  *      execute from a page that lacks execute permission.
632  *
633  *	Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
634  *	default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
635  *	a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
636  *	execution from data areas for a particular binary even if the arch normally permits it. As
637  *	a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
638  *	to support some complicated use cases, notably browsers with out-of-process plugins that
639  *	are not all NX-safe.
640  */
641 
642 extern int allow_data_exec, allow_stack_exec;
643 
644 int
override_nx(vm_map_t map,uint32_t user_tag)645 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
646 {
647 	int current_abi;
648 
649 	if (map->pmap == kernel_pmap) {
650 		return FALSE;
651 	}
652 
653 	/*
654 	 * Determine if the app is running in 32 or 64 bit mode.
655 	 */
656 
657 	if (vm_map_is_64bit(map)) {
658 		current_abi = VM_ABI_64;
659 	} else {
660 		current_abi = VM_ABI_32;
661 	}
662 
663 	/*
664 	 * Determine if we should allow the execution based on whether it's a
665 	 * stack or data area and the current architecture.
666 	 */
667 
668 	if (user_tag == VM_MEMORY_STACK) {
669 		return allow_stack_exec & current_abi;
670 	}
671 
672 	return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
673 }
674 
675 
676 /*
677  *	Virtual memory maps provide for the mapping, protection,
678  *	and sharing of virtual memory objects.  In addition,
679  *	this module provides for an efficient virtual copy of
680  *	memory from one map to another.
681  *
682  *	Synchronization is required prior to most operations.
683  *
684  *	Maps consist of an ordered doubly-linked list of simple
685  *	entries; a single hint is used to speed up lookups.
686  *
687  *	Sharing maps have been deleted from this version of Mach.
688  *	All shared objects are now mapped directly into the respective
689  *	maps.  This requires a change in the copy on write strategy;
690  *	the asymmetric (delayed) strategy is used for shared temporary
691  *	objects instead of the symmetric (shadow) strategy.  All maps
692  *	are now "top level" maps (either task map, kernel map or submap
693  *	of the kernel map).
694  *
 695  *	Since portions of maps are specified by start/end addresses,
696  *	which may not align with existing map entries, all
697  *	routines merely "clip" entries to these start/end values.
698  *	[That is, an entry is split into two, bordering at a
699  *	start or end value.]  Note that these clippings may not
700  *	always be necessary (as the two resulting entries are then
701  *	not changed); however, the clipping is done for convenience.
702  *	No attempt is currently made to "glue back together" two
703  *	abutting entries.
704  *
705  *	The symmetric (shadow) copy strategy implements virtual copy
706  *	by copying VM object references from one map to
707  *	another, and then marking both regions as copy-on-write.
708  *	It is important to note that only one writeable reference
709  *	to a VM object region exists in any map when this strategy
710  *	is used -- this means that shadow object creation can be
711  *	delayed until a write operation occurs.  The symmetric (delayed)
712  *	strategy allows multiple maps to have writeable references to
713  *	the same region of a vm object, and hence cannot delay creating
714  *	its copy objects.  See vm_object_copy_quickly() in vm_object.c.
715  *	Copying of permanent objects is completely different; see
716  *	vm_object_copy_strategically() in vm_object.c.
717  */
718 
719 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
720 
721 #define VM_MAP_ZONE_NAME "maps"
722 #define VM_MAP_ZFLAGS ( \
723 	ZC_NOENCRYPT | \
724 	ZC_VM_LP64)
725 
726 #define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
727 #define VM_MAP_ENTRY_ZFLAGS ( \
728 	ZC_NOENCRYPT | \
729 	ZC_CACHING | \
730 	ZC_KASAN_NOQUARANTINE | \
731 	ZC_VM_LP64)
732 
733 #define VM_MAP_HOLES_ZONE_NAME "VM map holes"
734 #define VM_MAP_HOLES_ZFLAGS ( \
735 	ZC_NOENCRYPT | \
736 	ZC_CACHING | \
737 	ZC_KASAN_NOQUARANTINE | \
738 	ZC_VM_LP64)
739 
740 /*
741  * Asserts that a vm_map_copy object is coming from the
742  * vm_map_copy_zone to ensure that it isn't a fake constructed
743  * anywhere else.
744  */
void
vm_map_copy_require(struct vm_map_copy *copy)
{
	/* panics if "copy" was not allocated from the vm_map_copy zone */
	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
}
750 
751 /*
752  *	vm_map_require:
753  *
754  *	Ensures that the argument is memory allocated from the genuine
755  *	vm map zone. (See zone_id_require_allow_foreign).
756  */
void
vm_map_require(vm_map_t map)
{
	/* panics if "map" was not allocated from the vm_map zone */
	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
}
762 
763 #define VM_MAP_EARLY_COUNT_MAX         16
764 static __startup_data vm_offset_t      map_data;
765 static __startup_data vm_size_t        map_data_size;
766 static __startup_data vm_offset_t      kentry_data;
767 static __startup_data vm_size_t        kentry_data_size;
768 static __startup_data vm_offset_t      map_holes_data;
769 static __startup_data vm_size_t        map_holes_data_size;
770 static __startup_data vm_map_t        *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
771 static __startup_data uint32_t         early_map_count;
772 
773 #if XNU_TARGET_OS_OSX
774 #define         NO_COALESCE_LIMIT  ((1024 * 128) - 1)
775 #else /* XNU_TARGET_OS_OSX */
776 #define         NO_COALESCE_LIMIT  0
777 #endif /* XNU_TARGET_OS_OSX */
778 
779 /* Skip acquiring locks if we're in the midst of a kernel core dump */
780 unsigned int not_in_kdp = 1;
781 
782 unsigned int vm_map_set_cache_attr_count = 0;
783 
/*
 * vm_map_set_cache_attr:
 *	Mark the VM object backing the mapping at "va" in "map" as
 *	having explicitly managed cache attributes ("set_cache_attr").
 *	Returns KERN_INVALID_ARGUMENT when "va" is not mapped, is
 *	mapped by a submap, or has no backing VM object.
 */
kern_return_t
vm_map_set_cache_attr(
	vm_map_t        map,
	vm_map_offset_t va)
{
	vm_map_entry_t  map_entry;
	vm_object_t     object;
	kern_return_t   kr = KERN_SUCCESS;

	/* a read lock suffices: the flag is set under the object lock */
	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, va, &map_entry) ||
	    map_entry->is_sub_map) {
		/*
		 * that memory is not properly mapped
		 */
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}
	object = VME_OBJECT(map_entry);

	if (object == VM_OBJECT_NULL) {
		/*
		 * there should be a VM object here at this point
		 */
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}
	vm_object_lock(object);
	object->set_cache_attr = TRUE;
	vm_object_unlock(object);

	/* statistics counter, updated while still holding the map lock */
	vm_map_set_cache_attr_count++;
done:
	vm_map_unlock_read(map);

	return kr;
}
822 
823 
824 #if CONFIG_CODE_DECRYPTION
825 /*
826  * vm_map_apple_protected:
827  * This remaps the requested part of the object with an object backed by
828  * the decrypting pager.
829  * crypt_info contains entry points and session data for the crypt module.
830  * The crypt_info block will be copied by vm_map_apple_protected. The data structures
831  * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
832  */
833 kern_return_t
vm_map_apple_protected(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_object_offset_t crypto_backing_offset,struct pager_crypt_info * crypt_info,uint32_t cryptid)834 vm_map_apple_protected(
835 	vm_map_t                map,
836 	vm_map_offset_t         start,
837 	vm_map_offset_t         end,
838 	vm_object_offset_t      crypto_backing_offset,
839 	struct pager_crypt_info *crypt_info,
840 	uint32_t                cryptid)
841 {
842 	boolean_t       map_locked;
843 	kern_return_t   kr;
844 	vm_map_entry_t  map_entry;
845 	struct vm_map_entry tmp_entry;
846 	memory_object_t unprotected_mem_obj;
847 	vm_object_t     protected_object;
848 	vm_map_offset_t map_addr;
849 	vm_map_offset_t start_aligned, end_aligned;
850 	vm_object_offset_t      crypto_start, crypto_end;
851 	int             vm_flags;
852 	vm_map_kernel_flags_t vmk_flags;
853 	boolean_t       cache_pager;
854 
855 	vm_flags = 0;
856 	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
857 
858 	map_locked = FALSE;
859 	unprotected_mem_obj = MEMORY_OBJECT_NULL;
860 
861 	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
862 	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
863 	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
864 	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
865 
866 #if __arm64__
867 	/*
868 	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
869 	 * so we might have to loop and establish up to 3 mappings:
870 	 *
871 	 * + the first 16K-page, which might overlap with the previous
872 	 *   4K-aligned mapping,
873 	 * + the center,
874 	 * + the last 16K-page, which might overlap with the next
875 	 *   4K-aligned mapping.
876 	 * Each of these mapping might be backed by a vnode pager (if
877 	 * properly page-aligned) or a "fourk_pager", itself backed by a
878 	 * vnode pager (if 4K-aligned but not page-aligned).
879 	 */
880 #endif /* __arm64__ */
881 
882 	map_addr = start_aligned;
883 	for (map_addr = start_aligned;
884 	    map_addr < end;
885 	    map_addr = tmp_entry.vme_end) {
886 		vm_map_lock(map);
887 		map_locked = TRUE;
888 
889 		/* lookup the protected VM object */
890 		if (!vm_map_lookup_entry(map,
891 		    map_addr,
892 		    &map_entry) ||
893 		    map_entry->is_sub_map ||
894 		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
895 			/* that memory is not properly mapped */
896 			kr = KERN_INVALID_ARGUMENT;
897 			goto done;
898 		}
899 
 900 		/* ensure mapped memory is mapped as executable, except
 901 		 *  for the model decryption flow */
902 		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
903 		    !(map_entry->protection & VM_PROT_EXECUTE)) {
904 			kr = KERN_INVALID_ARGUMENT;
905 			goto done;
906 		}
907 
908 		/* get the protected object to be decrypted */
909 		protected_object = VME_OBJECT(map_entry);
910 		if (protected_object == VM_OBJECT_NULL) {
911 			/* there should be a VM object here at this point */
912 			kr = KERN_INVALID_ARGUMENT;
913 			goto done;
914 		}
915 		/* ensure protected object stays alive while map is unlocked */
916 		vm_object_reference(protected_object);
917 
918 		/* limit the map entry to the area we want to cover */
919 		vm_map_clip_start(map, map_entry, start_aligned);
920 		vm_map_clip_end(map, map_entry, end_aligned);
921 
922 		tmp_entry = *map_entry;
923 		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
924 		vm_map_unlock(map);
925 		map_locked = FALSE;
926 
927 		/*
928 		 * This map entry might be only partially encrypted
929 		 * (if not fully "page-aligned").
930 		 */
931 		crypto_start = 0;
932 		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
933 		if (tmp_entry.vme_start < start) {
934 			if (tmp_entry.vme_start != start_aligned) {
935 				kr = KERN_INVALID_ADDRESS;
936 			}
937 			crypto_start += (start - tmp_entry.vme_start);
938 		}
939 		if (tmp_entry.vme_end > end) {
940 			if (tmp_entry.vme_end != end_aligned) {
941 				kr = KERN_INVALID_ADDRESS;
942 			}
943 			crypto_end -= (tmp_entry.vme_end - end);
944 		}
945 
946 		/*
947 		 * This "extra backing offset" is needed to get the decryption
948 		 * routine to use the right key.  It adjusts for the possibly
949 		 * relative offset of an interposed "4K" pager...
950 		 */
951 		if (crypto_backing_offset == (vm_object_offset_t) -1) {
952 			crypto_backing_offset = VME_OFFSET(&tmp_entry);
953 		}
954 
955 		cache_pager = TRUE;
956 #if XNU_TARGET_OS_OSX
957 		if (vm_map_is_alien(map)) {
958 			cache_pager = FALSE;
959 		}
960 #endif /* XNU_TARGET_OS_OSX */
961 
962 		/*
963 		 * Lookup (and create if necessary) the protected memory object
964 		 * matching that VM object.
965 		 * If successful, this also grabs a reference on the memory object,
966 		 * to guarantee that it doesn't go away before we get a chance to map
967 		 * it.
968 		 */
969 		unprotected_mem_obj = apple_protect_pager_setup(
970 			protected_object,
971 			VME_OFFSET(&tmp_entry),
972 			crypto_backing_offset,
973 			crypt_info,
974 			crypto_start,
975 			crypto_end,
976 			cache_pager);
977 
978 		/* release extra ref on protected object */
979 		vm_object_deallocate(protected_object);
980 
981 		if (unprotected_mem_obj == NULL) {
982 			kr = KERN_FAILURE;
983 			goto done;
984 		}
985 
986 		vm_flags = VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE;
987 		/* can overwrite an immutable mapping */
988 		vmk_flags.vmkf_overwrite_immutable = TRUE;
989 #if __arm64__
990 		if (tmp_entry.used_for_jit &&
991 		    (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT ||
992 		    PAGE_SHIFT != FOURK_PAGE_SHIFT) &&
993 		    fourk_binary_compatibility_unsafe &&
994 		    fourk_binary_compatibility_allow_wx) {
995 			printf("** FOURK_COMPAT [%d]: "
996 			    "allowing write+execute at 0x%llx\n",
997 			    proc_selfpid(), tmp_entry.vme_start);
998 			vmk_flags.vmkf_map_jit = TRUE;
999 		}
1000 #endif /* __arm64__ */
1001 
1002 		/* map this memory object in place of the current one */
1003 		map_addr = tmp_entry.vme_start;
1004 		kr = vm_map_enter_mem_object(map,
1005 		    &map_addr,
1006 		    (tmp_entry.vme_end -
1007 		    tmp_entry.vme_start),
1008 		    (mach_vm_offset_t) 0,
1009 		    vm_flags,
1010 		    vmk_flags,
1011 		    VM_KERN_MEMORY_NONE,
1012 		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1013 		    0,
1014 		    TRUE,
1015 		    tmp_entry.protection,
1016 		    tmp_entry.max_protection,
1017 		    tmp_entry.inheritance);
1018 		assertf(kr == KERN_SUCCESS,
1019 		    "kr = 0x%x\n", kr);
1020 		assertf(map_addr == tmp_entry.vme_start,
1021 		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1022 		    (uint64_t)map_addr,
1023 		    (uint64_t) tmp_entry.vme_start,
1024 		    &tmp_entry);
1025 
1026 #if VM_MAP_DEBUG_APPLE_PROTECT
1027 		if (vm_map_debug_apple_protect) {
1028 			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1029 			    " backing:[object:%p,offset:0x%llx,"
1030 			    "crypto_backing_offset:0x%llx,"
1031 			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1032 			    map,
1033 			    (uint64_t) map_addr,
1034 			    (uint64_t) (map_addr + (tmp_entry.vme_end -
1035 			    tmp_entry.vme_start)),
1036 			    unprotected_mem_obj,
1037 			    protected_object,
1038 			    VME_OFFSET(&tmp_entry),
1039 			    crypto_backing_offset,
1040 			    crypto_start,
1041 			    crypto_end);
1042 		}
1043 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1044 
1045 		/*
1046 		 * Release the reference obtained by
1047 		 * apple_protect_pager_setup().
1048 		 * The mapping (if it succeeded) is now holding a reference on
1049 		 * the memory object.
1050 		 */
1051 		memory_object_deallocate(unprotected_mem_obj);
1052 		unprotected_mem_obj = MEMORY_OBJECT_NULL;
1053 
1054 		/* continue with next map entry */
1055 		crypto_backing_offset += (tmp_entry.vme_end -
1056 		    tmp_entry.vme_start);
1057 		crypto_backing_offset -= crypto_start;
1058 	}
1059 	kr = KERN_SUCCESS;
1060 
1061 done:
1062 	if (map_locked) {
1063 		vm_map_unlock(map);
1064 	}
1065 	return kr;
1066 }
1067 #endif  /* CONFIG_CODE_DECRYPTION */
1068 
1069 
/* lock group and attributes shared by all VM map locks */
LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);

/*
 * Whether malloc-tagged regions are mapped without copy-on-write:
 * off by default on macOS, on for the embedded targets.
 * Overridable with the "malloc_no_cow" boot-arg (see vm_map_init()).
 */
#if XNU_TARGET_OS_OSX
int malloc_no_cow = 0;
#else /* XNU_TARGET_OS_OSX */
int malloc_no_cow = 1;
#endif /* XNU_TARGET_OS_OSX */
/* bitmask of VM_MEMORY_* tags subject to no-CoW; populated in vm_map_init() */
uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
#if DEBUG
/* enabled via the "vm_check_map_sanity" boot-arg (see vm_map_init()) */
int vm_check_map_sanity = 0;
#endif
1083 
1084 /*
1085  *	vm_map_init:
1086  *
1087  *	Initialize the vm_map module.  Must be called before
1088  *	any other vm_map routines.
1089  *
1090  *	Map and entry structures are allocated from zones -- we must
1091  *	initialize those zones.
1092  *
1093  *	There are three zones of interest:
1094  *
1095  *	vm_map_zone:		used to allocate maps.
1096  *	vm_map_entry_zone:	used to allocate map entries.
1097  *
1098  *	LP32:
1099  *	vm_map_entry_reserved_zone:     fallback zone for kernel map entries
1100  *
 *	The kernel allocates map entries from a special zone that is initially
 *	"crammed" with memory.  It would be difficult (perhaps impossible) for
 *	the kernel to allocate more memory to an entry zone when it became
 *	empty, since the very act of allocating memory implies the creation
 *	of a new entry.
1106  */
1107 __startup_func
1108 void
vm_map_init(void)1109 vm_map_init(void)
1110 {
1111 
1112 #if MACH_ASSERT
1113 	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1114 	    sizeof(debug4k_filter));
1115 #endif /* MACH_ASSERT */
1116 
1117 	zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1118 	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1119 
1120 	/*
1121 	 * Don't quarantine because we always need elements available
1122 	 * Disallow GC on this zone... to aid the GC.
1123 	 */
1124 	zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1125 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1126 	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1127 		z->z_elems_rsv = (uint16_t)(32 *
1128 		(ml_early_cpu_max_number() + 1));
1129 	});
1130 
1131 	zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1132 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1133 	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1134 		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_size(z));
1135 	});
1136 
1137 	zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1138 	    ZC_NOENCRYPT | ZC_CACHING, ZONE_ID_VM_MAP_COPY, NULL);
1139 
1140 	/*
1141 	 * Add the stolen memory to zones, adjust zone size and stolen counts.
1142 	 */
1143 	zone_cram_early(vm_map_zone, map_data, map_data_size);
1144 	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1145 	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1146 	printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1147 	    vm_map_zone->z_elems_free,
1148 	    vm_map_entry_zone->z_elems_free,
1149 	    vm_map_holes_zone->z_elems_free);
1150 
1151 	/*
1152 	 * Since these are covered by zones, remove them from stolen page accounting.
1153 	 */
1154 	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1155 
1156 #if VM_MAP_DEBUG_APPLE_PROTECT
1157 	PE_parse_boot_argn("vm_map_debug_apple_protect",
1158 	    &vm_map_debug_apple_protect,
1159 	    sizeof(vm_map_debug_apple_protect));
1160 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1161 #if VM_MAP_DEBUG_APPLE_FOURK
1162 	PE_parse_boot_argn("vm_map_debug_fourk",
1163 	    &vm_map_debug_fourk,
1164 	    sizeof(vm_map_debug_fourk));
1165 #endif /* VM_MAP_DEBUG_FOURK */
1166 
1167 	PE_parse_boot_argn("malloc_no_cow",
1168 	    &malloc_no_cow,
1169 	    sizeof(malloc_no_cow));
1170 	if (malloc_no_cow) {
1171 		vm_memory_malloc_no_cow_mask = 0ULL;
1172 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1173 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1174 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1175 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1176 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1177 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1178 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1179 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1180 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1181 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1182 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1183 		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1184 		    &vm_memory_malloc_no_cow_mask,
1185 		    sizeof(vm_memory_malloc_no_cow_mask));
1186 	}
1187 
1188 #if CONFIG_MAP_RANGES
1189 	vm_map_range_map_init();
1190 #endif /* CONFIG_MAP_RANGES */
1191 
1192 #if DEBUG
1193 	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1194 	if (vm_check_map_sanity) {
1195 		kprintf("VM sanity checking enabled\n");
1196 	} else {
1197 		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1198 	}
1199 #endif /* DEBUG */
1200 
1201 #if DEVELOPMENT || DEBUG
1202 	PE_parse_boot_argn("panic_on_unsigned_execute",
1203 	    &panic_on_unsigned_execute,
1204 	    sizeof(panic_on_unsigned_execute));
1205 	PE_parse_boot_argn("panic_on_mlock_failure",
1206 	    &panic_on_mlock_failure,
1207 	    sizeof(panic_on_mlock_failure));
1208 #endif /* DEVELOPMENT || DEBUG */
1209 }
1210 
/*
 *	vm_map_steal_memory:	[ startup ]
 *
 *	Reserve, early in boot, the contiguous memory that vm_map_init()
 *	later crams into the map, map-entry and map-holes zones.
 */
__startup_func
static void
vm_map_steal_memory(void)
{
	/*
	 * We need to reserve enough memory to support bootstrapping VM maps
	 * and the zone subsystem.
	 *
	 * The VM Maps that need to function before zones can support them
	 * are the ones registered with vm_map_will_allocate_early_map(),
	 * which are:
	 * - the kernel map
	 * - the various submaps used by zones (pgz, meta, ...)
	 *
	 * We also need enough entries and holes to support them
	 * until zone_metadata_init() is called, which is when
	 * the zone allocator becomes capable of expanding dynamically.
	 *
	 * We need:
	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
	 * - To allow for 3-4 entries per map, but the kernel map
	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
	 *   to describe the submaps, so double it (and make it 8x too)
	 * - To allow for holes between entries,
	 *   hence needs the same budget as entries
	 */
	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
	    VM_MAP_EARLY_COUNT_MAX);

	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	/*
	 * Steal a contiguous range of memory so that a simple range check
	 * can validate early addresses being freed/crammed to these
	 * zones
	 */
	map_data       = zone_early_mem_init(map_data_size + kentry_data_size +
	    map_holes_data_size);
	kentry_data    = map_data + map_data_size;
	map_holes_data = kentry_data + kentry_data_size;
}
STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1260 
/*
 *	vm_kernel_boostraped:	[ startup; name is a historical typo ]
 *
 *	Log how many of the statically crammed maps, entries and holes
 *	are left once the zone allocator is fully up.
 */
__startup_func
static void
vm_kernel_boostraped(void)
{
	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
	    vm_map_zone->z_elems_free,
	    vm_map_entry_zone->z_elems_free,
	    vm_map_holes_zone->z_elems_free);
}
STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1271 
1272 void
vm_map_disable_hole_optimization(vm_map_t map)1273 vm_map_disable_hole_optimization(vm_map_t map)
1274 {
1275 	vm_map_entry_t  head_entry, hole_entry, next_hole_entry;
1276 
1277 	if (map->holelistenabled) {
1278 		head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1279 
1280 		while (hole_entry != NULL) {
1281 			next_hole_entry = hole_entry->vme_next;
1282 
1283 			hole_entry->vme_next = NULL;
1284 			hole_entry->vme_prev = NULL;
1285 			zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1286 
1287 			if (next_hole_entry == head_entry) {
1288 				hole_entry = NULL;
1289 			} else {
1290 				hole_entry = next_hole_entry;
1291 			}
1292 		}
1293 
1294 		map->holes_list = NULL;
1295 		map->holelistenabled = FALSE;
1296 
1297 		map->first_free = vm_map_first_entry(map);
1298 		SAVE_HINT_HOLE_WRITE(map, NULL);
1299 	}
1300 }
1301 
1302 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1303 vm_kernel_map_is_kernel(vm_map_t map)
1304 {
1305 	return map->pmap == kernel_pmap;
1306 }
1307 
1308 /*
1309  *	vm_map_create:
1310  *
1311  *	Creates and returns a new empty VM map with
1312  *	the given physical map structure, and having
1313  *	the given lower and upper address bounds.
1314  */
1315 
1316 extern vm_map_t vm_map_create_external(
1317 	pmap_t                  pmap,
1318 	vm_map_offset_t         min_off,
1319 	vm_map_offset_t         max_off,
1320 	boolean_t               pageable);
1321 
1322 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1323 vm_map_create_external(
1324 	pmap_t                  pmap,
1325 	vm_map_offset_t         min,
1326 	vm_map_offset_t         max,
1327 	boolean_t               pageable)
1328 {
1329 	vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1330 
1331 	if (pageable) {
1332 		options |= VM_MAP_CREATE_PAGEABLE;
1333 	}
1334 	return vm_map_create_options(pmap, min, max, options);
1335 }
1336 
1337 __startup_func
1338 void
vm_map_will_allocate_early_map(vm_map_t * owner)1339 vm_map_will_allocate_early_map(vm_map_t *owner)
1340 {
1341 	if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1342 		panic("VM_MAP_EARLY_COUNT_MAX is too low");
1343 	}
1344 
1345 	early_map_owners[early_map_count++] = owner;
1346 }
1347 
1348 __startup_func
1349 void
vm_map_relocate_early_maps(vm_offset_t delta)1350 vm_map_relocate_early_maps(vm_offset_t delta)
1351 {
1352 	for (uint32_t i = 0; i < early_map_count; i++) {
1353 		vm_address_t addr = (vm_address_t)*early_map_owners[i];
1354 
1355 		*early_map_owners[i] = (vm_map_t)(addr + delta);
1356 	}
1357 
1358 	early_map_count = ~0u;
1359 }
1360 
1361 /*
1362  *	Routine:	vm_map_relocate_early_elem
1363  *
1364  *	Purpose:
1365  *		Early zone elements are allocated in a temporary part
1366  *		of the address space.
1367  *
1368  *		Once the zones live in their final place, the early
1369  *		VM maps, map entries and map holes need to be relocated.
1370  *
1371  *		It involves rewriting any vm_map_t, vm_map_entry_t or
1372  *		pointers to vm_map_links. Other pointers to other types
1373  *		are fine.
1374  *
1375  *		Fortunately, pointers to those types are self-contained
1376  *		in those zones, _except_ for pointers to VM maps,
1377  *		which are tracked during early boot and fixed with
1378  *		vm_map_relocate_early_maps().
1379  */
1380 __startup_func
1381 void
vm_map_relocate_early_elem(uint32_t zone_id,vm_offset_t new_addr,vm_offset_t delta)1382 vm_map_relocate_early_elem(
1383 	uint32_t                zone_id,
1384 	vm_offset_t             new_addr,
1385 	vm_offset_t             delta)
1386 {
1387 #define relocate(type_t, field)  ({ \
1388 	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field;   \
1389 	if (*__field) {                                                        \
1390 	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta);  \
1391 	}                                                                      \
1392 })
1393 
1394 	switch (zone_id) {
1395 	case ZONE_ID_VM_MAP:
1396 	case ZONE_ID_VM_MAP_ENTRY:
1397 	case ZONE_ID_VM_MAP_HOLES:
1398 		break;
1399 
1400 	default:
1401 		panic("Unexpected zone ID %d", zone_id);
1402 	}
1403 
1404 	if (zone_id == ZONE_ID_VM_MAP) {
1405 		relocate(vm_map_t, hdr.links.prev);
1406 		relocate(vm_map_t, hdr.links.next);
1407 		((vm_map_t)new_addr)->pmap = kernel_pmap;
1408 #ifdef VM_MAP_STORE_USE_RB
1409 		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
1410 #endif /* VM_MAP_STORE_USE_RB */
1411 		relocate(vm_map_t, hint);
1412 		relocate(vm_map_t, hole_hint);
1413 		relocate(vm_map_t, first_free);
1414 		return;
1415 	}
1416 
1417 	relocate(struct vm_map_links *, prev);
1418 	relocate(struct vm_map_links *, next);
1419 
1420 	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
1421 #ifdef VM_MAP_STORE_USE_RB
1422 		relocate(vm_map_entry_t, store.entry.rbe_left);
1423 		relocate(vm_map_entry_t, store.entry.rbe_right);
1424 		relocate(vm_map_entry_t, store.entry.rbe_parent);
1425 #endif /* VM_MAP_STORE_USE_RB */
1426 		if (((vm_map_entry_t)new_addr)->is_sub_map) {
1427 			/* no object to relocate because we haven't made any */
1428 			((vm_map_entry_t)new_addr)->vme_submap +=
1429 			    delta >> VME_SUBMAP_SHIFT;
1430 		}
1431 #if MAP_ENTRY_CREATION_DEBUG
1432 		relocate(vm_map_entry_t, vme_creation_maphdr);
1433 #endif /* MAP_ENTRY_CREATION_DEBUG */
1434 	}
1435 
1436 #undef relocate
1437 }
1438 
/*
 *	vm_map_create_options:
 *
 *	Allocate and initialize an empty VM map for "pmap" spanning
 *	[min, max), configured per "options".  The caller receives
 *	the initial reference on the map.
 */
vm_map_t
vm_map_create_options(
	pmap_t                  pmap,
	vm_map_offset_t         min,
	vm_map_offset_t         max,
	vm_map_create_options_t options)
{
	vm_map_t result;

#if DEBUG || DEVELOPMENT
	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
		/*
		 * Before zalloc is up, every map must have been announced
		 * via vm_map_will_allocate_early_map() and must use the
		 * kernel pmap (see vm_map_relocate_early_elem()).
		 */
		if (early_map_count != ~0u && early_map_count !=
		    zone_count_allocated(vm_map_zone) + 1) {
			panic("allocating %dth early map, owner not known",
			    zone_count_allocated(vm_map_zone) + 1);
		}
		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
			panic("allocating %dth early map for non kernel pmap",
			    early_map_count);
		}
	}
#endif /* DEBUG || DEVELOPMENT */

	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);

	/* empty entry list: both links point back at the map header */
	vm_map_first_entry(result) = vm_map_to_entry(result);
	vm_map_last_entry(result)  = vm_map_to_entry(result);

	vm_map_store_init(&result->hdr);
	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
	vm_map_set_page_shift(result, PAGE_SHIFT);

	result->size_limit = RLIM_INFINITY;             /* default unlimited */
	result->data_limit = RLIM_INFINITY;             /* default unlimited */
	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
	result->pmap = pmap;
	result->min_offset = min;
	result->max_offset = max;
	result->first_free = vm_map_to_entry(result);
	result->hint = vm_map_to_entry(result);

	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
		assert(pmap == kernel_pmap);
		result->never_faults = true;
	}

	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
		result->has_corpse_footprint = true;
	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
		/* seed the hole list with one hole covering the whole map */
		struct vm_map_links *hole_entry;

		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
		hole_entry->start = min;
#if defined(__arm64__)
		hole_entry->end = result->max_offset;
#else
		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
#endif
		result->holes_list = result->hole_hint = hole_entry;
		/* single-element circular list */
		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
		result->holelistenabled = true;
	}

	vm_map_lock_init(result);

	return result;
}
1508 
1509 /*
1510  * Adjusts a submap that was made by kmem_suballoc()
1511  * before it knew where it would be mapped,
1512  * so that it has the right min/max offsets.
1513  *
1514  * We do not need to hold any locks:
1515  * only the caller knows about this map,
1516  * and it is not published on any entry yet.
1517  */
1518 static void
vm_map_adjust_offsets(vm_map_t map,vm_map_offset_t min_off,vm_map_offset_t max_off)1519 vm_map_adjust_offsets(
1520 	vm_map_t                map,
1521 	vm_map_offset_t         min_off,
1522 	vm_map_offset_t         max_off)
1523 {
1524 	assert(map->min_offset == 0);
1525 	assert(map->max_offset == max_off - min_off);
1526 	assert(map->hdr.nentries == 0);
1527 	assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1528 
1529 	map->min_offset = min_off;
1530 	map->max_offset = max_off;
1531 
1532 	if (map->holelistenabled) {
1533 		struct vm_map_links *hole = map->holes_list;
1534 
1535 		hole->start = min_off;
1536 #if defined(__arm64__)
1537 		hole->end = max_off;
1538 #else
1539 		hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1540 #endif
1541 	}
1542 }
1543 
1544 
1545 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1546 vm_map_adjusted_size(vm_map_t map)
1547 {
1548 	struct vm_reserved_region *regions = NULL;
1549 	size_t num_regions = 0;
1550 	mach_vm_size_t  reserved_size = 0, map_size = 0;
1551 
1552 	if (map == NULL || (map->size == 0)) {
1553 		return 0;
1554 	}
1555 
1556 	map_size = map->size;
1557 
1558 	if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1559 		/*
1560 		 * No special reserved regions or not an exotic map or the task
1561 		 * is terminating and these special regions might have already
1562 		 * been deallocated.
1563 		 */
1564 		return map_size;
1565 	}
1566 
1567 	num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1568 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1569 
1570 	while (num_regions) {
1571 		reserved_size += regions[--num_regions].vmrr_size;
1572 	}
1573 
1574 	/*
1575 	 * There are a few places where the map is being switched out due to
1576 	 * 'termination' without that bit being set (e.g. exec and corpse purging).
1577 	 * In those cases, we could have the map's regions being deallocated on
1578 	 * a core while some accounting process is trying to get the map's size.
1579 	 * So this assert can't be enabled till all those places are uniform in
1580 	 * their use of the 'map->terminated' bit.
1581 	 *
1582 	 * assert(map_size >= reserved_size);
1583 	 */
1584 
1585 	return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1586 }
1587 
1588 /*
1589  *	vm_map_entry_create:	[ internal use only ]
1590  *
1591  *	Allocates a VM map entry for insertion in the
1592  *	given map (or map copy).  No fields are filled.
1593  *
1594  *	The VM entry will be zero initialized, except for:
1595  *	- behavior set to VM_BEHAVIOR_DEFAULT
1596  *	- inheritance set to VM_INHERIT_DEFAULT
1597  */
1598 #define vm_map_entry_create(map)    _vm_map_entry_create(&(map)->hdr)
1599 
1600 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1601 
/*
 * Allocate one zeroed VM map entry from its zone.
 * "map_header" is only recorded when MAP_ENTRY_CREATION_DEBUG is on.
 */
static vm_map_entry_t
_vm_map_entry_create(
	struct vm_map_header    *map_header __unused)
{
	vm_map_entry_t entry = NULL;

	/* Z_ZERO: every field of the new entry starts out zeroed */
	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);

	/*
	 * Help the compiler with what we know to be true,
	 * so that the further bitfields inits have good codegen.
	 *
	 * See rdar://87041299
	 */
	__builtin_assume(entry->vme_object_value == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);

	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
	    "VME_ALIAS_MASK covers tags");

	static_assert(VM_BEHAVIOR_DEFAULT == 0,
	    "can skip zeroing of the behavior field");
	entry->inheritance = VM_INHERIT_DEFAULT;

	vm_map_store_update((vm_map_t) NULL, entry, VM_MAP_ENTRY_CREATE);

#if MAP_ENTRY_CREATION_DEBUG
	/* record who allocated this entry, for debugging */
	entry->vme_creation_maphdr = map_header;
	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
	    BTREF_GET_NOWAIT);
#endif
	return entry;
}
1636 
1637 /*
1638  *	vm_map_entry_dispose:	[ internal use only ]
1639  *
1640  *	Inverse of vm_map_entry_create.
1641  *
1642  *      write map lock held so no need to
1643  *	do anything special to insure correctness
1644  *      of the stores
1645  */
1646 static void
vm_map_entry_dispose(vm_map_entry_t entry)1647 vm_map_entry_dispose(
1648 	vm_map_entry_t          entry)
1649 {
1650 #if MAP_ENTRY_CREATION_DEBUG
1651 	btref_put(entry->vme_creation_bt);
1652 #endif
1653 #if MAP_ENTRY_INSERTION_DEBUG
1654 	btref_put(entry->vme_insertion_bt);
1655 #endif
1656 	zfree(vm_map_entry_zone, entry);
1657 }
1658 
1659 #define vm_map_copy_entry_dispose(copy_entry) \
1660 	vm_map_entry_dispose(copy_entry)
1661 
1662 static vm_map_entry_t
vm_map_zap_first_entry(vm_map_zap_t list)1663 vm_map_zap_first_entry(
1664 	vm_map_zap_t            list)
1665 {
1666 	return list->vmz_head;
1667 }
1668 
1669 static vm_map_entry_t
vm_map_zap_last_entry(vm_map_zap_t list)1670 vm_map_zap_last_entry(
1671 	vm_map_zap_t            list)
1672 {
1673 	assert(vm_map_zap_first_entry(list));
1674 	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1675 }
1676 
1677 static void
vm_map_zap_append(vm_map_zap_t list,vm_map_entry_t entry)1678 vm_map_zap_append(
1679 	vm_map_zap_t            list,
1680 	vm_map_entry_t          entry)
1681 {
1682 	entry->vme_next = VM_MAP_ENTRY_NULL;
1683 	*list->vmz_tail = entry;
1684 	list->vmz_tail = &entry->vme_next;
1685 }
1686 
1687 static vm_map_entry_t
vm_map_zap_pop(vm_map_zap_t list)1688 vm_map_zap_pop(
1689 	vm_map_zap_t            list)
1690 {
1691 	vm_map_entry_t head = list->vmz_head;
1692 
1693 	if (head != VM_MAP_ENTRY_NULL &&
1694 	    (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1695 		list->vmz_tail = &list->vmz_head;
1696 	}
1697 
1698 	return head;
1699 }
1700 
1701 static void
vm_map_zap_dispose(vm_map_zap_t list)1702 vm_map_zap_dispose(
1703 	vm_map_zap_t            list)
1704 {
1705 	vm_map_entry_t          entry;
1706 
1707 	while ((entry = vm_map_zap_pop(list))) {
1708 		if (entry->is_sub_map) {
1709 			vm_map_deallocate(VME_SUBMAP(entry));
1710 		} else {
1711 			vm_object_deallocate(VME_OBJECT(entry));
1712 		}
1713 
1714 		vm_map_entry_dispose(entry);
1715 	}
1716 }
1717 
#if MACH_ASSERT
/* off by default; flip to TRUE to validate first_free in the store layer */
static boolean_t first_free_check = FALSE;
boolean_t
first_free_is_valid(
	vm_map_t        map)
{
	if (first_free_check) {
		return first_free_is_valid_store( map );
	}
	return TRUE;
}
#endif /* MACH_ASSERT */
1731 
1732 
/* link/unlink a map entry into/from a vm_map_copy's private header */
#define vm_map_copy_entry_link(copy, after_where, entry)                \
	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))

#define vm_map_copy_entry_unlink(copy, entry)                           \
	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1738 
1739 /*
1740  *	vm_map_destroy:
1741  *
1742  *	Actually destroy a map.
1743  */
1744 void
vm_map_destroy(vm_map_t map)1745 vm_map_destroy(
1746 	vm_map_t        map)
1747 {
1748 	/* final cleanup: this is not allowed to fail */
1749 	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
1750 
1751 	VM_MAP_ZAP_DECLARE(zap);
1752 
1753 	vm_map_lock(map);
1754 
1755 	map->terminated = true;
1756 	/* clean up regular map entries */
1757 	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
1758 	    KMEM_GUARD_NONE, &zap);
1759 	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
1760 	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
1761 	    KMEM_GUARD_NONE, &zap);
1762 
1763 	vm_map_disable_hole_optimization(map);
1764 	vm_map_corpse_footprint_destroy(map);
1765 
1766 	vm_map_unlock(map);
1767 
1768 	vm_map_zap_dispose(&zap);
1769 
1770 	assert(map->hdr.nentries == 0);
1771 
1772 	if (map->pmap) {
1773 		pmap_destroy(map->pmap);
1774 	}
1775 
1776 	lck_rw_destroy(&map->lock, &vm_map_lck_grp);
1777 
1778 	zfree_id(ZONE_ID_VM_MAP, map);
1779 }
1780 
1781 /*
1782  * Returns pid of the task with the largest number of VM map entries.
1783  * Used in the zone-map-exhaustion jetsam path.
1784  */
1785 pid_t
find_largest_process_vm_map_entries(void)1786 find_largest_process_vm_map_entries(void)
1787 {
1788 	pid_t victim_pid = -1;
1789 	int max_vm_map_entries = 0;
1790 	task_t task = TASK_NULL;
1791 	queue_head_t *task_list = &tasks;
1792 
1793 	lck_mtx_lock(&tasks_threads_lock);
1794 	queue_iterate(task_list, task, task_t, tasks) {
1795 		if (task == kernel_task || !task->active) {
1796 			continue;
1797 		}
1798 
1799 		vm_map_t task_map = task->map;
1800 		if (task_map != VM_MAP_NULL) {
1801 			int task_vm_map_entries = task_map->hdr.nentries;
1802 			if (task_vm_map_entries > max_vm_map_entries) {
1803 				max_vm_map_entries = task_vm_map_entries;
1804 				victim_pid = pid_from_task(task);
1805 			}
1806 		}
1807 	}
1808 	lck_mtx_unlock(&tasks_threads_lock);
1809 
1810 	printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1811 	return victim_pid;
1812 }
1813 
1814 
1815 /*
1816  *	vm_map_lookup_entry:	[ internal use only ]
1817  *
1818  *	Calls into the vm map store layer to find the map
1819  *	entry containing (or immediately preceding) the
1820  *	specified address in the given map; the entry is returned
1821  *	in the "entry" parameter.  The boolean
1822  *	result indicates whether the address is
1823  *	actually contained in the map.
1824  */
1825 boolean_t
vm_map_lookup_entry(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)1826 vm_map_lookup_entry(
1827 	vm_map_t        map,
1828 	vm_map_offset_t address,
1829 	vm_map_entry_t  *entry)         /* OUT */
1830 {
1831 #if CONFIG_KERNEL_TBI
1832 	if (VM_KERNEL_ADDRESS(address)) {
1833 		address = VM_KERNEL_STRIP_UPTR(address);
1834 	}
1835 #endif /* CONFIG_KERNEL_TBI */
1836 #if CONFIG_PROB_GZALLOC
1837 	if (map->pmap == kernel_pmap) {
1838 		assertf(!pgz_owned(address),
1839 		    "it is the responsibility of callers to unguard PGZ addresses");
1840 	}
1841 #endif /* CONFIG_PROB_GZALLOC */
1842 	return vm_map_store_lookup_entry( map, address, entry );
1843 }
1844 
1845 boolean_t
vm_map_lookup_entry_or_next(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)1846 vm_map_lookup_entry_or_next(
1847 	vm_map_t        map,
1848 	vm_map_offset_t address,
1849 	vm_map_entry_t  *entry)         /* OUT */
1850 {
1851 	if (vm_map_lookup_entry(map, address, entry)) {
1852 		return true;
1853 	}
1854 
1855 	*entry = (*entry)->vme_next;
1856 	return false;
1857 }
1858 
#if CONFIG_PROB_GZALLOC
/*
 * Same as vm_map_lookup_entry() but without asserting that "address"
 * is not PGZ-guarded: callers of this variant may look up such addresses.
 */
boolean_t
vm_map_lookup_entry_allow_pgz(
	vm_map_t        map,
	vm_map_offset_t address,
	vm_map_entry_t  *entry)         /* OUT */
{
#if CONFIG_KERNEL_TBI
	/* strip tag bits so the comparison against entry bounds is sound */
	if (VM_KERNEL_ADDRESS(address)) {
		address = VM_KERNEL_STRIP_UPTR(address);
	}
#endif /* CONFIG_KERNEL_TBI */
	return vm_map_store_lookup_entry( map, address, entry );
}
#endif /* CONFIG_PROB_GZALLOC */
1874 
1875 #if !ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
1876 /*
1877  *	Routine:	vm_map_adjust_direction
1878  *	Purpose:
1879  *			Overrides direction to reduce fragmentation. Allocate small
1880  *			allocations from the end and large allocations from the right.
1881  */
1882 static void
vm_map_adjust_direction(vm_map_kernel_flags_t * vmk_flags,vm_map_size_t size)1883 vm_map_adjust_direction(
1884 	vm_map_kernel_flags_t *vmk_flags,
1885 	vm_map_size_t          size)
1886 {
1887 	if (size < KMEM_SMALLMAP_THRESHOLD) {
1888 		vmk_flags->vmkf_last_free = true;
1889 	} else {
1890 		vmk_flags->vmkf_last_free = false;
1891 	}
1892 }
1893 #endif /* !ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
1894 
1895 /*
1896  *	Routine:	vm_map_range_invalid_panic
1897  *	Purpose:
1898  *			Panic on detection of an invalid range id.
1899  */
/* __abortlike: this helper never returns (terminates in panic()) */
__abortlike
static void
vm_map_range_invalid_panic(
	vm_map_t                map,
	vm_map_range_id_t       range_id)
{
	panic("invalid range ID (%u) for map %p", range_id, map);
}
1908 
1909 /*
1910  *	Routine:	vm_map_get_range
1911  *	Purpose:
1912  *			Adjust bounds based on security policy.
1913  */
/*
 * Returns the [min_address, max_address) window allocations in "map"
 * must be placed in, based on the map type (kernel / user-ranges / plain)
 * and the caller's vmk_flags.  May zero *address and may rewrite
 * vmk_flags (range id validation, allocation direction).
 */
static struct mach_vm_range
vm_map_get_range(
	vm_map_t                map,
	vm_map_address_t       *address,
	vm_map_kernel_flags_t  *vmk_flags,
	vm_map_size_t           size)
{
	struct mach_vm_range effective_range = {};
	vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;

	if (map == kernel_map) {
		/* kernel VA is carved into per-range-id sub-ranges */
		effective_range = kmem_ranges[range_id];

		if (startup_phase >= STARTUP_SUB_KMEM) {
			/*
			 * Hint provided by caller is zeroed as the range is restricted to a
			 * subset of the entire kernel_map VA, which could put the hint outside
			 * the range, causing vm_map_store_find_space to fail.
			 */
			*address = 0ull;
			/*
			 * Ensure that range_id passed in by the caller is within meaningful
			 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
			 * to fail as the corresponding range is invalid. Range id larger than
			 * KMEM_RANGE_ID_MAX will lead to an OOB access.
			 */
			if ((range_id == KMEM_RANGE_ID_NONE) ||
			    (range_id > KMEM_RANGE_ID_MAX)) {
				vm_map_range_invalid_panic(map, range_id);
			}
#if ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
			/*
			 * Each allocation front looks like [ S | L | S ]
			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
			 * use the entire range. Two small allocations from different fronts
			 * (left and right) can only meet when memory in the that range is
			 * entirely exhausted.
			 */
			if (size >= KMEM_SMALLMAP_THRESHOLD) {
				effective_range = kmem_large_ranges[range_id];
			}
#else /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
			/* no data split: steer direction instead of shrinking the range */
			vm_map_adjust_direction(vmk_flags, size);
#endif /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
		}
#if CONFIG_MAP_RANGES
	} else if (map->uses_user_ranges) {
		/* user maps with ranges enabled: validate id, then use its window */
		if (range_id > UMEM_RANGE_ID_MAX) {
			vm_map_range_invalid_panic(map, range_id);
		}

		effective_range = map->user_range[range_id];
#endif /* CONFIG_MAP_RANGES */
	} else {
		/*
		 * If minimum is 0, bump it up by PAGE_SIZE.  We want to limit
		 * allocations of PAGEZERO to explicit requests since its
		 * normal use is to catch dereferences of NULL and many
		 * applications also treat pointers with a value of 0 as
		 * special and suddenly having address 0 contain useable
		 * memory would tend to confuse those applications.
		 */
		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
		effective_range.max_address = map->max_offset;
	}

	return effective_range;
}
1983 
1984 /*
1985  *	Routine:	vm_map_locate_space
1986  *	Purpose:
1987  *		Finds a range in the specified virtual address map,
1988  *		returning the start of that range,
1989  *		as well as the entry right before it.
1990  */
/*
 * On entry *start_inout is the caller's placement hint; on success it is
 * updated to the chosen start address and *entry_out (if non-NULL) is the
 * entry immediately preceding the found range.  Caller holds the map lock;
 * it is only dropped/retaken transiently on the wait_for_space path.
 */
kern_return_t
vm_map_locate_space(
	vm_map_t                map,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_offset_t        *start_inout,
	vm_map_entry_t         *entry_out)
{
	struct mach_vm_range effective_range = {};
	vm_map_size_t   guard_offset;
	vm_map_offset_t hint, limit;
	vm_map_entry_t  entry;

	/*
	 * Only supported by vm_map_enter() with a fixed address.
	 */
	assert(!vmk_flags.vmkf_beyond_max);

	if (__improbable(map->wait_for_space)) {
		/*
		 * support for "wait_for_space" is minimal,
		 * its only consumer is the ipc_kernel_copy_map.
		 */
		assert(!map->holelistenabled &&
		    !vmk_flags.vmkf_last_free &&
		    !vmk_flags.vmkf_keep_map_locked &&
		    !vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmkf_random_address &&
		    *start_inout <= map->min_offset);
	} else if (vmk_flags.vmkf_last_free) {
		assert(!vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmkf_random_address);
	}

	/* carve a leading guard page out of the requested size, if asked */
	if (vmk_flags.vmkf_guard_before) {
		guard_offset = VM_MAP_PAGE_SIZE(map);
		assert(size > guard_offset);
		size -= guard_offset;
	} else {
		assert(size != 0);
		guard_offset = 0;
	}

	/* security-policy bounds; may zero *start_inout and adjust vmk_flags */
	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size);
#if XNU_TARGET_OS_OSX
	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
		assert(map != kernel_map);
		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
	}
#endif /* XNU_TARGET_OS_OSX */

again:
	if (vmk_flags.vmkf_last_free) {
		/* top-down search: hint is the upper bound, limit the lower */
		hint = *start_inout;

		if (hint == 0 || hint > effective_range.max_address) {
			hint = effective_range.max_address;
		}
		if (hint <= effective_range.min_address) {
			return KERN_NO_SPACE;
		}
		limit = effective_range.min_address;
	} else {
		/* bottom-up search: hint is the lower bound, limit the upper */
		hint = *start_inout;

		if (vmk_flags.vmkf_map_jit) {
			if (map->jit_entry_exists &&
			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
				return KERN_INVALID_ARGUMENT;
			}
			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
				vmk_flags.vmkf_random_address = true;
			}
		}

		if (vmk_flags.vmkf_random_address) {
			kern_return_t kr;

			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
			if (kr != KERN_SUCCESS) {
				return kr;
			}
		}
#if XNU_TARGET_OS_OSX
		else if ((hint == 0 || hint == vm_map_min(map)) &&
		    !map->disable_vmentry_reuse &&
		    map->vmmap_high_start != 0) {
			hint = map->vmmap_high_start;
		}
#endif /* XNU_TARGET_OS_OSX */

		/* clamp the hint into the effective range */
		if (hint < effective_range.min_address) {
			hint = effective_range.min_address;
		}
		if (effective_range.max_address <= hint) {
			return KERN_NO_SPACE;
		}

		limit = effective_range.max_address;
	}
	entry = vm_map_store_find_space(map,
	    hint, limit, vmk_flags.vmkf_last_free,
	    guard_offset, size, mask,
	    start_inout);

	if (__improbable(entry == NULL)) {
		/*
		 * No hole found.  If the map supports waiting for space and
		 * the request could ever fit in the range, sleep until space
		 * may have been freed, then retry.  Note the map lock is
		 * dropped and retaken around the block.
		 */
		if (map->wait_for_space &&
		    guard_offset + size <=
		    effective_range.max_address - effective_range.min_address) {
			assert_wait((event_t)map, THREAD_ABORTSAFE);
			vm_map_unlock(map);
			thread_block(THREAD_CONTINUE_NULL);
			vm_map_lock(map);
			goto again;
		}
		return KERN_NO_SPACE;
	}

	if (entry_out) {
		*entry_out = entry;
	}
	return KERN_SUCCESS;
}
2115 
2116 
2117 /*
2118  *	Routine:	vm_map_find_space
2119  *	Purpose:
2120  *		Allocate a range in the specified virtual address map,
2121  *		returning the entry allocated for that range.
2122  *		Used by kmem_alloc, etc.
2123  *
2124  *		The map must be NOT be locked. It will be returned locked
2125  *		on KERN_SUCCESS, unlocked on failure.
2126  *
2127  *		If an entry is allocated, the object/offset fields
2128  *		are initialized to zero.
2129  */
kern_return_t
vm_map_find_space(
	vm_map_t                map,
	vm_map_offset_t         hint_address,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_entry_t          *o_entry)       /* OUT */
{
	vm_map_entry_t          new_entry, entry;
	kern_return_t           kr;

	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	/* allocate and pre-initialize the entry before taking the map lock */
	new_entry = vm_map_entry_create(map);
	new_entry->use_pmap = true;
	new_entry->protection = VM_PROT_DEFAULT;
	new_entry->max_protection = VM_PROT_ALL;

	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
		new_entry->map_aligned = true;
	}
	if (vmk_flags.vmkf_permanent) {
		new_entry->vme_permanent = true;
	}

	vm_map_lock(map);

	kr = vm_map_locate_space(map, size, mask, vmk_flags,
	    &hint_address, &entry);
	if (kr != KERN_SUCCESS) {
		/* failure contract: return with the map unlocked */
		vm_map_unlock(map);
		vm_map_entry_dispose(new_entry);
		return kr;
	}
	/* hint_address was updated to the start of the located range */
	new_entry->vme_start = hint_address;
	new_entry->vme_end = hint_address + size;

	/*
	 *	At this point,
	 *
	 *	- new_entry's "vme_start" and "vme_end" should define
	 *	  the endpoints of the available new range,
	 *
	 *	- and "entry" should refer to the region before
	 *	  the new range,
	 *
	 *	- and the map should still be locked.
	 */

	assert(page_aligned(new_entry->vme_start));
	assert(page_aligned(new_entry->vme_end));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));

	/*
	 *	Insert the new entry into the list
	 */

	vm_map_store_entry_link(map, entry, new_entry, VM_MAP_KERNEL_FLAGS_NONE);
	map->size += size;

	/*
	 *	Update the lookup hint
	 */
	SAVE_HINT_MAP_WRITE(map, new_entry);

	/* success contract: map is returned locked (see routine header) */
	*o_entry = new_entry;
	return KERN_SUCCESS;
}
2202 
/* Debug knobs for vm_map_pmap_enter() below. */
int vm_map_pmap_enter_print = FALSE;    /* log each pre-entered mapping */
int vm_map_pmap_enter_enable = FALSE;   /* NOTE(review): not read in this chunk — presumably gates the pre-enter path elsewhere; confirm */
2205 
2206 /*
2207  *	Routine:	vm_map_pmap_enter [internal only]
2208  *
2209  *	Description:
2210  *		Force pages from the specified object to be entered into
2211  *		the pmap at the specified address if they are present.
2212  *		As soon as a page not found in the object the scan ends.
2213  *
2214  *	Returns:
2215  *		Nothing.
2216  *
2217  *	In/out conditions:
2218  *		The source map should not be locked on entry.
2219  */
__unused static void
vm_map_pmap_enter(
	vm_map_t                map,
	vm_map_offset_t         addr,
	vm_map_offset_t         end_addr,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_prot_t               protection)
{
	int                     type_of_fault;
	kern_return_t           kr;
	struct vm_object_fault_info fault_info = {};

	/* nothing to enter into if the map has no pmap */
	if (map->pmap == 0) {
		return;
	}

	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);

	/* walk [addr, end_addr) one page at a time */
	while (addr < end_addr) {
		vm_page_t       m;


		/*
		 * TODO:
		 * From vm_map_enter(), we come into this function without the map
		 * lock held or the object lock held.
		 * We haven't taken a reference on the object either.
		 * We should do a proper lookup on the map to make sure
		 * that things are sane before we go locking objects that
		 * could have been deallocated from under us.
		 */

		vm_object_lock(object);

		m = vm_page_lookup(object, offset);

		/*
		 * Stop the scan at the first page that is missing or not
		 * safe to map (busy, fictitious, in error/restart/absent
		 * state) — per the routine header, the scan ends as soon
		 * as a page is not found in the object.
		 */
		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
			vm_object_unlock(object);
			return;
		}

		if (vm_map_pmap_enter_print) {
			printf("vm_map_pmap_enter:");
			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
			    map, (unsigned long long)addr, object, (unsigned long long)offset);
		}
		/* page is resident, so account this as a cache hit */
		type_of_fault = DBG_CACHE_HIT_FAULT;
		kr = vm_fault_enter(m, map->pmap,
		    addr,
		    PAGE_SIZE, 0,
		    protection, protection,
		    VM_PAGE_WIRED(m),
		    FALSE,                 /* change_wiring */
		    VM_KERN_MEMORY_NONE,                 /* tag - not wiring */
		    &fault_info,
		    NULL,                  /* need_retry */
		    &type_of_fault);

		vm_object_unlock(object);

		offset += PAGE_SIZE_64;
		addr += PAGE_SIZE;
	}
}
2286 
#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
/*
 *	Routine:	vm_map_random_address_for_size
 *	Purpose:
 *		Pick a map-page-aligned random address inside the map's
 *		effective range whose following hole can fit "size" bytes;
 *		written to *address on success.  Gives up with KERN_NO_SPACE
 *		after MAX_TRIES_TO_GET_RANDOM_ADDRESS failed attempts, or
 *		immediately if "size" does not fit in the range at all.
 */
kern_return_t
vm_map_random_address_for_size(
	vm_map_t                map,
	vm_map_offset_t        *address,
	vm_map_size_t           size,
	vm_map_kernel_flags_t   vmk_flags)
{
	kern_return_t   kr = KERN_SUCCESS;
	int             tries = 0;
	vm_map_offset_t random_addr = 0;
	vm_map_offset_t hole_end;

	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
	vm_map_size_t   vm_hole_size = 0;
	vm_map_size_t   addr_space_size;
	struct mach_vm_range effective_range = vm_map_get_range(map, address, &vmk_flags, size);

	addr_space_size = effective_range.max_address - effective_range.min_address;
	if (size >= addr_space_size) {
		return KERN_NO_SPACE;
	}
	/* shrink the draw window so addr + size cannot exceed the range max */
	addr_space_size -= size;

	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));

	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		/* the real RNG (random()) is not usable before zalloc is up */
		if (startup_phase < STARTUP_SUB_ZALLOC) {
			random_addr = (vm_map_offset_t)early_random();
		} else {
			random_addr = (vm_map_offset_t)random();
		}
		/* scale to page granularity, then fold into the draw window */
		random_addr <<= VM_MAP_PAGE_SHIFT(map);
		random_addr = vm_map_trunc_page(
			effective_range.min_address + (random_addr % addr_space_size),
			VM_MAP_PAGE_MASK(map));

#if CONFIG_PROB_GZALLOC
		/* re-draw rather than land inside a PGZ guard region */
		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
			continue;
		}
#endif /* CONFIG_PROB_GZALLOC */

		/* candidate must fall in a hole large enough for the request */
		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
			if (prev_entry == vm_map_to_entry(map)) {
				next_entry = vm_map_first_entry(map);
			} else {
				next_entry = prev_entry->vme_next;
			}
			if (next_entry == vm_map_to_entry(map)) {
				/*
				 * NOTE(review): the hole is bounded by vm_map_max(map),
				 * not effective_range.max_address — confirm that is
				 * intentional for range-restricted maps.
				 */
				hole_end = vm_map_max(map);
			} else {
				hole_end = next_entry->vme_start;
			}
			vm_hole_size = hole_end - random_addr;
			if (vm_hole_size >= size) {
				*address = random_addr;
				break;
			}
		}
		tries++;
	}

	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		kr = KERN_NO_SPACE;
	}
	return kr;
}
2356 
2357 static boolean_t
vm_memory_malloc_no_cow(int alias)2358 vm_memory_malloc_no_cow(
2359 	int alias)
2360 {
2361 	uint64_t alias_mask;
2362 
2363 	if (alias > 63) {
2364 		return FALSE;
2365 	}
2366 
2367 	alias_mask = 1ULL << alias;
2368 	if (alias_mask & vm_memory_malloc_no_cow_mask) {
2369 		return TRUE;
2370 	}
2371 	return FALSE;
2372 }
2373 
/*
 * Telemetry counters for vm_map_enter() hitting resource limits.
 * NOTE(review): not incremented within this chunk — presumably bumped on
 * the RLIMIT_AS / RLIMIT_DATA enforcement paths; confirm against callers.
 */
uint64_t vm_map_enter_RLIMIT_AS_count = 0;
uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2376 /*
2377  *	Routine:	vm_map_enter
2378  *
2379  *	Description:
2380  *		Allocate a range in the specified virtual address map.
2381  *		The resulting range will refer to memory defined by
2382  *		the given memory object and offset into that object.
2383  *
2384  *		Arguments are as defined in the vm_map call.
2385  */
/*
 * Counters for restoring previously-zapped mappings when a
 * VM_FLAGS_OVERWRITE-style vm_map_enter() fails.
 * NOTE(review): not incremented in the visible portion of vm_map_enter()
 * — presumably updated on its failure/restore path; confirm.
 */
static unsigned int vm_map_enter_restore_successes = 0;
static unsigned int vm_map_enter_restore_failures = 0;
2388 kern_return_t
vm_map_enter(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t alias,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)2389 vm_map_enter(
2390 	vm_map_t                map,
2391 	vm_map_offset_t         *address,       /* IN/OUT */
2392 	vm_map_size_t           size,
2393 	vm_map_offset_t         mask,
2394 	int                     flags,
2395 	vm_map_kernel_flags_t   vmk_flags,
2396 	vm_tag_t                alias,
2397 	vm_object_t             object,
2398 	vm_object_offset_t      offset,
2399 	boolean_t               needs_copy,
2400 	vm_prot_t               cur_protection,
2401 	vm_prot_t               max_protection,
2402 	vm_inherit_t            inheritance)
2403 {
2404 	vm_map_entry_t          entry, new_entry;
2405 	vm_map_offset_t         start, tmp_start, tmp_offset;
2406 	vm_map_offset_t         end, tmp_end;
2407 	vm_map_offset_t         tmp2_start, tmp2_end;
2408 	vm_map_offset_t         step;
2409 	kern_return_t           result = KERN_SUCCESS;
2410 	boolean_t               map_locked = FALSE;
2411 	boolean_t               pmap_empty = TRUE;
2412 	boolean_t               new_mapping_established = FALSE;
2413 	boolean_t               keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2414 	boolean_t               anywhere = ((flags & VM_FLAGS_ANYWHERE) != 0);
2415 	boolean_t               purgable = ((flags & VM_FLAGS_PURGABLE) != 0);
2416 	boolean_t               overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0);
2417 	boolean_t               no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0);
2418 	const boolean_t         is_submap = vmk_flags.vmkf_submap;
2419 	boolean_t               permanent = (((flags & VM_FLAGS_PERMANENT) != 0) || vmk_flags.vmkf_permanent);
2420 	const boolean_t         no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2421 	const boolean_t         entry_for_jit = vmk_flags.vmkf_map_jit;
2422 	boolean_t               iokit_acct = vmk_flags.vmkf_iokit_acct;
2423 	boolean_t               resilient_codesign = ((flags & VM_FLAGS_RESILIENT_CODESIGN) != 0);
2424 	boolean_t               resilient_media = ((flags & VM_FLAGS_RESILIENT_MEDIA) != 0);
2425 	boolean_t               entry_for_tpro = ((flags & VM_FLAGS_TPRO) != 0);
2426 	unsigned int            superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT);
2427 	vm_tag_t                user_alias;
2428 	kern_return_t           kr;
2429 	boolean_t               clear_map_aligned = FALSE;
2430 	vm_map_size_t           chunk_size = 0;
2431 	vm_object_t             caller_object;
2432 	VM_MAP_ZAP_DECLARE(zap_old_list);
2433 	VM_MAP_ZAP_DECLARE(zap_new_list);
2434 
2435 	caller_object = object;
2436 
2437 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2438 
2439 	if (flags & VM_FLAGS_4GB_CHUNK) {
2440 #if defined(__LP64__)
2441 		chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2442 #else /* __LP64__ */
2443 		chunk_size = ANON_CHUNK_SIZE;
2444 #endif /* __LP64__ */
2445 	} else {
2446 		chunk_size = ANON_CHUNK_SIZE;
2447 	}
2448 
2449 	if (superpage_size) {
2450 		switch (superpage_size) {
2451 			/*
2452 			 * Note that the current implementation only supports
2453 			 * a single size for superpages, SUPERPAGE_SIZE, per
2454 			 * architecture. As soon as more sizes are supposed
2455 			 * to be supported, SUPERPAGE_SIZE has to be replaced
2456 			 * with a lookup of the size depending on superpage_size.
2457 			 */
2458 #ifdef __x86_64__
2459 		case SUPERPAGE_SIZE_ANY:
2460 			/* handle it like 2 MB and round up to page size */
2461 			size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2462 			OS_FALLTHROUGH;
2463 		case SUPERPAGE_SIZE_2MB:
2464 			break;
2465 #endif
2466 		default:
2467 			return KERN_INVALID_ARGUMENT;
2468 		}
2469 		mask = SUPERPAGE_SIZE - 1;
2470 		if (size & (SUPERPAGE_SIZE - 1)) {
2471 			return KERN_INVALID_ARGUMENT;
2472 		}
2473 		inheritance = VM_INHERIT_NONE;  /* fork() children won't inherit superpages */
2474 	}
2475 
2476 
2477 	if ((cur_protection & VM_PROT_WRITE) &&
2478 	    (cur_protection & VM_PROT_EXECUTE) &&
2479 #if XNU_TARGET_OS_OSX
2480 	    map->pmap != kernel_pmap &&
2481 	    (cs_process_global_enforcement() ||
2482 	    (vmk_flags.vmkf_cs_enforcement_override
2483 	    ? vmk_flags.vmkf_cs_enforcement
2484 	    : (vm_map_cs_enforcement(map)
2485 #if __arm64__
2486 	    || !VM_MAP_IS_EXOTIC(map)
2487 #endif /* __arm64__ */
2488 	    ))) &&
2489 #endif /* XNU_TARGET_OS_OSX */
2490 	    (VM_MAP_POLICY_WX_FAIL(map) ||
2491 	    VM_MAP_POLICY_WX_STRIP_X(map)) &&
2492 	    !entry_for_jit) {
2493 		boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2494 
2495 		DTRACE_VM3(cs_wx,
2496 		    uint64_t, 0,
2497 		    uint64_t, 0,
2498 		    vm_prot_t, cur_protection);
2499 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2500 		    proc_selfpid(),
2501 		    (get_bsdtask_info(current_task())
2502 		    ? proc_name_address(get_bsdtask_info(current_task()))
2503 		    : "?"),
2504 		    __FUNCTION__,
2505 		    (vm_protect_wx_fail ? "failing" : "turning off execute"));
2506 		cur_protection &= ~VM_PROT_EXECUTE;
2507 		if (vm_protect_wx_fail) {
2508 			return KERN_PROTECTION_FAILURE;
2509 		}
2510 	}
2511 
2512 	/*
2513 	 * If the task has requested executable lockdown,
2514 	 * deny any new executable mapping.
2515 	 */
2516 	if (map->map_disallow_new_exec == TRUE) {
2517 		if (cur_protection & VM_PROT_EXECUTE) {
2518 			return KERN_PROTECTION_FAILURE;
2519 		}
2520 	}
2521 
2522 	if (resilient_codesign) {
2523 		assert(!is_submap);
2524 		int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2525 		if ((cur_protection | max_protection) & reject_prot) {
2526 			return KERN_PROTECTION_FAILURE;
2527 		}
2528 	}
2529 
2530 	if (resilient_media) {
2531 		assert(!is_submap);
2532 //		assert(!needs_copy);
2533 		if (object != VM_OBJECT_NULL &&
2534 		    !object->internal) {
2535 			/*
2536 			 * This mapping is directly backed by an external
2537 			 * memory manager (e.g. a vnode pager for a file):
2538 			 * we would not have any safe place to inject
2539 			 * a zero-filled page if an actual page is not
2540 			 * available, without possibly impacting the actual
2541 			 * contents of the mapped object (e.g. the file),
2542 			 * so we can't provide any media resiliency here.
2543 			 */
2544 			return KERN_INVALID_ARGUMENT;
2545 		}
2546 	}
2547 
2548 	if (is_submap) {
2549 		vm_map_t submap;
2550 		if (purgable) {
2551 			/* submaps can not be purgeable */
2552 			return KERN_INVALID_ARGUMENT;
2553 		}
2554 		if (object == VM_OBJECT_NULL) {
2555 			/* submaps can not be created lazily */
2556 			return KERN_INVALID_ARGUMENT;
2557 		}
2558 		submap = (vm_map_t) object;
2559 		if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
2560 			/* page size mismatch */
2561 			return KERN_INVALID_ARGUMENT;
2562 		}
2563 	}
2564 	if (vmk_flags.vmkf_already) {
2565 		/*
2566 		 * VM_FLAGS_ALREADY says that it's OK if the same mapping
2567 		 * is already present.  For it to be meaningul, the requested
2568 		 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
2569 		 * we shouldn't try and remove what was mapped there first
2570 		 * (!VM_FLAGS_OVERWRITE).
2571 		 */
2572 		if ((flags & VM_FLAGS_ANYWHERE) ||
2573 		    (flags & VM_FLAGS_OVERWRITE)) {
2574 			return KERN_INVALID_ARGUMENT;
2575 		}
2576 	}
2577 
2578 	if (size == 0 ||
2579 	    (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
2580 		*address = 0;
2581 		return KERN_INVALID_ARGUMENT;
2582 	}
2583 
2584 	if (map->pmap == kernel_pmap) {
2585 		user_alias = VM_KERN_MEMORY_NONE;
2586 	} else {
2587 		user_alias = alias;
2588 	}
2589 
2590 	if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
2591 		chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
2592 	}
2593 
2594 #define RETURN(value)   { result = value; goto BailOut; }
2595 
2596 	assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
2597 	assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
2598 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
2599 		assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
2600 		assertf(page_aligned(size), "0x%llx", (uint64_t)size);
2601 	}
2602 
2603 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2604 	    !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
2605 		/*
2606 		 * In most cases, the caller rounds the size up to the
2607 		 * map's page size.
2608 		 * If we get a size that is explicitly not map-aligned here,
2609 		 * we'll have to respect the caller's wish and mark the
2610 		 * mapping as "not map-aligned" to avoid tripping the
2611 		 * map alignment checks later.
2612 		 */
2613 		clear_map_aligned = TRUE;
2614 	}
2615 	if (!anywhere &&
2616 	    VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2617 	    !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
2618 		/*
2619 		 * We've been asked to map at a fixed address and that
2620 		 * address is not aligned to the map's specific alignment.
2621 		 * The caller should know what it's doing (i.e. most likely
2622 		 * mapping some fragmented copy map, transferring memory from
2623 		 * a VM map with a different alignment), so clear map_aligned
2624 		 * for this new VM map entry and proceed.
2625 		 */
2626 		clear_map_aligned = TRUE;
2627 	}
2628 
2629 	/*
2630 	 * Only zero-fill objects are allowed to be purgable.
2631 	 * LP64todo - limit purgable objects to 32-bits for now
2632 	 */
2633 	if (purgable &&
2634 	    (offset != 0 ||
2635 	    (object != VM_OBJECT_NULL &&
2636 	    (object->vo_size != size ||
2637 	    object->purgable == VM_PURGABLE_DENY))
2638 #if __LP64__
2639 	    || size > ANON_MAX_SIZE
2640 #endif
2641 	    )) {
2642 		return KERN_INVALID_ARGUMENT;
2643 	}
2644 
2645 	start = *address;
2646 
2647 	if (anywhere) {
2648 		vm_map_lock(map);
2649 		map_locked = TRUE;
2650 
2651 		if (flags & VM_FLAGS_RANDOM_ADDR) {
2652 			vmk_flags.vmkf_random_address = true;
2653 		}
2654 
2655 		result = vm_map_locate_space(map, size, mask, vmk_flags,
2656 		    &start, &entry);
2657 		if (result != KERN_SUCCESS) {
2658 			goto BailOut;
2659 		}
2660 
2661 		*address = start;
2662 		end = start + size;
2663 		assert(VM_MAP_PAGE_ALIGNED(*address,
2664 		    VM_MAP_PAGE_MASK(map)));
2665 	} else {
2666 		vm_map_offset_t effective_min_offset, effective_max_offset;
2667 
2668 		effective_min_offset = map->min_offset;
2669 		effective_max_offset = map->max_offset;
2670 
2671 		if (vmk_flags.vmkf_beyond_max) {
2672 			/*
2673 			 * Allow an insertion beyond the map's max offset.
2674 			 */
2675 			effective_max_offset = 0x00000000FFFFF000ULL;
2676 			if (vm_map_is_64bit(map)) {
2677 				effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2678 			}
2679 #if XNU_TARGET_OS_OSX
2680 		} else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2681 			effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2682 #endif /* XNU_TARGET_OS_OSX */
2683 		}
2684 
2685 		if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2686 		    !overwrite &&
2687 		    user_alias == VM_MEMORY_REALLOC) {
2688 			/*
2689 			 * Force realloc() to switch to a new allocation,
2690 			 * to prevent 4k-fragmented virtual ranges.
2691 			 */
2692 //			DEBUG4K_ERROR("no realloc in place");
2693 			return KERN_NO_SPACE;
2694 		}
2695 
2696 		/*
2697 		 *	Verify that:
2698 		 *		the address doesn't itself violate
2699 		 *		the mask requirement.
2700 		 */
2701 
2702 		vm_map_lock(map);
2703 		map_locked = TRUE;
2704 		if ((start & mask) != 0) {
2705 			RETURN(KERN_NO_SPACE);
2706 		}
2707 
2708 		/*
2709 		 *	...	the address is within bounds
2710 		 */
2711 
2712 		end = start + size;
2713 
2714 		if ((start < effective_min_offset) ||
2715 		    (end > effective_max_offset) ||
2716 		    (start >= end)) {
2717 			RETURN(KERN_INVALID_ADDRESS);
2718 		}
2719 
2720 		if (overwrite) {
2721 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
2722 			kern_return_t remove_kr;
2723 
2724 			/*
2725 			 * Fixed mapping and "overwrite" flag: attempt to
2726 			 * remove all existing mappings in the specified
2727 			 * address range, saving them in our "zap_old_list".
2728 			 *
2729 			 * This avoids releasing the VM map lock in
2730 			 * vm_map_entry_delete() and allows atomicity
2731 			 * when we want to replace some mappings with a new one.
2732 			 * It also allows us to restore the old VM mappings if the
2733 			 * new mapping fails.
2734 			 */
2735 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2736 
2737 			if (vmk_flags.vmkf_overwrite_immutable) {
2738 				/* we can overwrite immutable mappings */
2739 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2740 			}
2741 			if (vmk_flags.vmkf_remap_prot_copy) {
2742 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2743 			}
2744 			remove_kr = vm_map_delete(map, start, end, remove_flags,
2745 			    KMEM_GUARD_NONE, &zap_old_list).kmr_return;
2746 			if (remove_kr) {
2747 				/* XXX FBDP restore zap_old_list? */
2748 				RETURN(remove_kr);
2749 			}
2750 		}
2751 
2752 		/*
2753 		 *	...	the starting address isn't allocated
2754 		 */
2755 
2756 		if (vm_map_lookup_entry(map, start, &entry)) {
2757 			if (!(vmk_flags.vmkf_already)) {
2758 				RETURN(KERN_NO_SPACE);
2759 			}
2760 			/*
2761 			 * Check if what's already there is what we want.
2762 			 */
2763 			tmp_start = start;
2764 			tmp_offset = offset;
2765 			if (entry->vme_start < start) {
2766 				tmp_start -= start - entry->vme_start;
2767 				tmp_offset -= start - entry->vme_start;
2768 			}
2769 			for (; entry->vme_start < end;
2770 			    entry = entry->vme_next) {
2771 				/*
2772 				 * Check if the mapping's attributes
2773 				 * match the existing map entry.
2774 				 */
2775 				if (entry == vm_map_to_entry(map) ||
2776 				    entry->vme_start != tmp_start ||
2777 				    entry->is_sub_map != is_submap ||
2778 				    VME_OFFSET(entry) != tmp_offset ||
2779 				    entry->needs_copy != needs_copy ||
2780 				    entry->protection != cur_protection ||
2781 				    entry->max_protection != max_protection ||
2782 				    entry->inheritance != inheritance ||
2783 				    entry->iokit_acct != iokit_acct ||
2784 				    VME_ALIAS(entry) != alias) {
2785 					/* not the same mapping ! */
2786 					RETURN(KERN_NO_SPACE);
2787 				}
2788 				/*
2789 				 * Check if the same object is being mapped.
2790 				 */
2791 				if (is_submap) {
2792 					if (VME_SUBMAP(entry) !=
2793 					    (vm_map_t) object) {
2794 						/* not the same submap */
2795 						RETURN(KERN_NO_SPACE);
2796 					}
2797 				} else {
2798 					if (VME_OBJECT(entry) != object) {
2799 						/* not the same VM object... */
2800 						vm_object_t obj2;
2801 
2802 						obj2 = VME_OBJECT(entry);
2803 						if ((obj2 == VM_OBJECT_NULL ||
2804 						    obj2->internal) &&
2805 						    (object == VM_OBJECT_NULL ||
2806 						    object->internal)) {
2807 							/*
2808 							 * ... but both are
2809 							 * anonymous memory,
2810 							 * so equivalent.
2811 							 */
2812 						} else {
2813 							RETURN(KERN_NO_SPACE);
2814 						}
2815 					}
2816 				}
2817 
2818 				tmp_offset += entry->vme_end - entry->vme_start;
2819 				tmp_start += entry->vme_end - entry->vme_start;
2820 				if (entry->vme_end >= end) {
2821 					/* reached the end of our mapping */
2822 					break;
2823 				}
2824 			}
2825 			/* it all matches:  let's use what's already there ! */
2826 			RETURN(KERN_MEMORY_PRESENT);
2827 		}
2828 
2829 		/*
2830 		 *	...	the next region doesn't overlap the
2831 		 *		end point.
2832 		 */
2833 
2834 		if ((entry->vme_next != vm_map_to_entry(map)) &&
2835 		    (entry->vme_next->vme_start < end)) {
2836 			RETURN(KERN_NO_SPACE);
2837 		}
2838 	}
2839 
2840 	/*
2841 	 *	At this point,
2842 	 *		"start" and "end" should define the endpoints of the
2843 	 *			available new range, and
2844 	 *		"entry" should refer to the region before the new
2845 	 *			range, and
2846 	 *
2847 	 *		the map should be locked.
2848 	 */
2849 
2850 	/*
2851 	 *	See whether we can avoid creating a new entry (and object) by
2852 	 *	extending one of our neighbors.  [So far, we only attempt to
2853 	 *	extend from below.]  Note that we can never extend/join
2854 	 *	purgable objects because they need to remain distinct
2855 	 *	entities in order to implement their "volatile object"
2856 	 *	semantics.
2857 	 */
2858 
2859 	if (purgable ||
2860 	    entry_for_jit ||
2861 	    entry_for_tpro ||
2862 	    vm_memory_malloc_no_cow(user_alias)) {
2863 		if (object == VM_OBJECT_NULL) {
2864 			object = vm_object_allocate(size);
2865 			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2866 			object->true_share = FALSE;
2867 			if (purgable) {
2868 				task_t owner;
2869 				object->purgable = VM_PURGABLE_NONVOLATILE;
2870 				if (map->pmap == kernel_pmap) {
2871 					/*
2872 					 * Purgeable mappings made in a kernel
2873 					 * map are "owned" by the kernel itself
2874 					 * rather than the current user task
2875 					 * because they're likely to be used by
2876 					 * more than this user task (see
2877 					 * execargs_purgeable_allocate(), for
2878 					 * example).
2879 					 */
2880 					owner = kernel_task;
2881 				} else {
2882 					owner = current_task();
2883 				}
2884 				assert(object->vo_owner == NULL);
2885 				assert(object->resident_page_count == 0);
2886 				assert(object->wired_page_count == 0);
2887 				vm_object_lock(object);
2888 				vm_purgeable_nonvolatile_enqueue(object, owner);
2889 				vm_object_unlock(object);
2890 			}
2891 			offset = (vm_object_offset_t)0;
2892 		}
2893 	} else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
2894 		/* no coalescing if address space uses sub-pages */
2895 	} else if ((is_submap == FALSE) &&
2896 	    (object == VM_OBJECT_NULL) &&
2897 	    (entry != vm_map_to_entry(map)) &&
2898 	    (entry->vme_end == start) &&
2899 	    (!entry->is_shared) &&
2900 	    (!entry->is_sub_map) &&
2901 	    (!entry->in_transition) &&
2902 	    (!entry->needs_wakeup) &&
2903 	    (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
2904 	    (entry->protection == cur_protection) &&
2905 	    (entry->max_protection == max_protection) &&
2906 	    (entry->inheritance == inheritance) &&
2907 	    ((user_alias == VM_MEMORY_REALLOC) ||
2908 	    (VME_ALIAS(entry) == alias)) &&
2909 	    (entry->no_cache == no_cache) &&
2910 	    (entry->vme_permanent == permanent) &&
2911 	    /* no coalescing for immutable executable mappings */
2912 	    !((entry->protection & VM_PROT_EXECUTE) &&
2913 	    entry->vme_permanent) &&
2914 	    (!entry->superpage_size && !superpage_size) &&
2915 	    /*
2916 	     * No coalescing if not map-aligned, to avoid propagating
2917 	     * that condition any further than needed:
2918 	     */
2919 	    (!entry->map_aligned || !clear_map_aligned) &&
2920 	    (!entry->zero_wired_pages) &&
2921 	    (!entry->used_for_jit && !entry_for_jit) &&
2922 	    (!entry->pmap_cs_associated) &&
2923 	    (entry->iokit_acct == iokit_acct) &&
2924 	    (!entry->vme_resilient_codesign) &&
2925 	    (!entry->vme_resilient_media) &&
2926 	    (!entry->vme_atomic) &&
2927 	    (entry->vme_no_copy_on_read == no_copy_on_read) &&
2928 
2929 	    ((entry->vme_end - entry->vme_start) + size <=
2930 	    (user_alias == VM_MEMORY_REALLOC ?
2931 	    ANON_CHUNK_SIZE :
2932 	    NO_COALESCE_LIMIT)) &&
2933 
2934 	    (entry->wired_count == 0)) {        /* implies user_wired_count == 0 */
2935 		if (vm_object_coalesce(VME_OBJECT(entry),
2936 		    VM_OBJECT_NULL,
2937 		    VME_OFFSET(entry),
2938 		    (vm_object_offset_t) 0,
2939 		    (vm_map_size_t)(entry->vme_end - entry->vme_start),
2940 		    (vm_map_size_t)(end - entry->vme_end))) {
2941 			/*
2942 			 *	Coalesced the two objects - can extend
2943 			 *	the previous map entry to include the
2944 			 *	new range.
2945 			 */
2946 			map->size += (end - entry->vme_end);
2947 			assert(entry->vme_start < end);
2948 			assert(VM_MAP_PAGE_ALIGNED(end,
2949 			    VM_MAP_PAGE_MASK(map)));
2950 			if (__improbable(vm_debug_events)) {
2951 				DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
2952 			}
2953 			entry->vme_end = end;
2954 			if (map->holelistenabled) {
2955 				vm_map_store_update_first_free(map, entry, TRUE);
2956 			} else {
2957 				vm_map_store_update_first_free(map, map->first_free, TRUE);
2958 			}
2959 			new_mapping_established = TRUE;
2960 			RETURN(KERN_SUCCESS);
2961 		}
2962 	}
2963 
2964 	step = superpage_size ? SUPERPAGE_SIZE : (end - start);
2965 	new_entry = NULL;
2966 
2967 	if (vmk_flags.vmkf_submap_adjust) {
2968 		vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
2969 		offset = start;
2970 	}
2971 
2972 	for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
2973 		tmp2_end = tmp2_start + step;
2974 		/*
2975 		 *	Create a new entry
2976 		 *
2977 		 * XXX FBDP
2978 		 * The reserved "page zero" in each process's address space can
2979 		 * be arbitrarily large.  Splitting it into separate objects and
2980 		 * therefore different VM map entries serves no purpose and just
2981 		 * slows down operations on the VM map, so let's not split the
2982 		 * allocation into chunks if the max protection is NONE.  That
2983 		 * memory should never be accessible, so it will never get to the
2984 		 * default pager.
2985 		 */
2986 		tmp_start = tmp2_start;
2987 		if (!is_submap &&
2988 		    object == VM_OBJECT_NULL &&
2989 		    size > chunk_size &&
2990 		    max_protection != VM_PROT_NONE &&
2991 		    superpage_size == 0) {
2992 			tmp_end = tmp_start + chunk_size;
2993 		} else {
2994 			tmp_end = tmp2_end;
2995 		}
2996 		do {
2997 			if (!is_submap &&
2998 			    object != VM_OBJECT_NULL &&
2999 			    object->internal &&
3000 			    offset + (tmp_end - tmp_start) > object->vo_size) {
3001 //				printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3002 				DTRACE_VM5(vm_map_enter_overmap,
3003 				    vm_map_t, map,
3004 				    vm_map_address_t, tmp_start,
3005 				    vm_map_address_t, tmp_end,
3006 				    vm_object_offset_t, offset,
3007 				    vm_object_size_t, object->vo_size);
3008 			}
3009 			new_entry = vm_map_entry_insert(map,
3010 			    entry, tmp_start, tmp_end,
3011 			    object, offset, vmk_flags,
3012 			    needs_copy,
3013 			    cur_protection, max_protection,
3014 			    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3015 			    VM_INHERIT_NONE : inheritance),
3016 			    no_cache,
3017 			    permanent,
3018 			    superpage_size,
3019 			    clear_map_aligned,
3020 			    alias);
3021 
3022 			assert((object != kernel_object) || (VM_KERN_MEMORY_NONE != alias));
3023 
3024 			if (resilient_codesign) {
3025 				int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3026 				if (!((cur_protection | max_protection) & reject_prot)) {
3027 					new_entry->vme_resilient_codesign = TRUE;
3028 				}
3029 			}
3030 
3031 			if (resilient_media &&
3032 			    (object == VM_OBJECT_NULL ||
3033 			    object->internal)) {
3034 				new_entry->vme_resilient_media = TRUE;
3035 			}
3036 
3037 			assert(!new_entry->iokit_acct);
3038 			if (!is_submap &&
3039 			    object != VM_OBJECT_NULL &&
3040 			    (object->purgable != VM_PURGABLE_DENY ||
3041 			    object->vo_ledger_tag)) {
3042 				assert(new_entry->use_pmap);
3043 				assert(!new_entry->iokit_acct);
3044 				/*
3045 				 * Turn off pmap accounting since
3046 				 * purgeable (or tagged) objects have their
3047 				 * own ledgers.
3048 				 */
3049 				new_entry->use_pmap = FALSE;
3050 			} else if (!is_submap &&
3051 			    iokit_acct &&
3052 			    object != VM_OBJECT_NULL &&
3053 			    object->internal) {
3054 				/* alternate accounting */
3055 				assert(!new_entry->iokit_acct);
3056 				assert(new_entry->use_pmap);
3057 				new_entry->iokit_acct = TRUE;
3058 				new_entry->use_pmap = FALSE;
3059 				DTRACE_VM4(
3060 					vm_map_iokit_mapped_region,
3061 					vm_map_t, map,
3062 					vm_map_offset_t, new_entry->vme_start,
3063 					vm_map_offset_t, new_entry->vme_end,
3064 					int, VME_ALIAS(new_entry));
3065 				vm_map_iokit_mapped_region(
3066 					map,
3067 					(new_entry->vme_end -
3068 					new_entry->vme_start));
3069 			} else if (!is_submap) {
3070 				assert(!new_entry->iokit_acct);
3071 				assert(new_entry->use_pmap);
3072 			}
3073 
3074 			if (is_submap) {
3075 				vm_map_t        submap;
3076 				boolean_t       submap_is_64bit;
3077 				boolean_t       use_pmap;
3078 
3079 				assert(new_entry->is_sub_map);
3080 				assert(!new_entry->use_pmap);
3081 				assert(!new_entry->iokit_acct);
3082 				submap = (vm_map_t) object;
3083 				submap_is_64bit = vm_map_is_64bit(submap);
3084 				use_pmap = vmk_flags.vmkf_nested_pmap;
3085 #ifndef NO_NESTED_PMAP
3086 				if (use_pmap && submap->pmap == NULL) {
3087 					ledger_t ledger = map->pmap->ledger;
3088 					/* we need a sub pmap to nest... */
3089 					submap->pmap = pmap_create_options(ledger, 0,
3090 					    submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3091 					if (submap->pmap == NULL) {
3092 						/* let's proceed without nesting... */
3093 					}
3094 #if defined(__arm64__)
3095 					else {
3096 						pmap_set_nested(submap->pmap);
3097 					}
3098 #endif
3099 				}
3100 				if (use_pmap && submap->pmap != NULL) {
3101 					if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3102 						DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3103 						kr = KERN_FAILURE;
3104 					} else {
3105 						kr = pmap_nest(map->pmap,
3106 						    submap->pmap,
3107 						    tmp_start,
3108 						    tmp_end - tmp_start);
3109 					}
3110 					if (kr != KERN_SUCCESS) {
3111 						printf("vm_map_enter: "
3112 						    "pmap_nest(0x%llx,0x%llx) "
3113 						    "error 0x%x\n",
3114 						    (long long)tmp_start,
3115 						    (long long)tmp_end,
3116 						    kr);
3117 					} else {
3118 						/* we're now nested ! */
3119 						new_entry->use_pmap = TRUE;
3120 						pmap_empty = FALSE;
3121 					}
3122 				}
3123 #endif /* NO_NESTED_PMAP */
3124 			}
3125 			entry = new_entry;
3126 
3127 			if (superpage_size) {
3128 				vm_page_t pages, m;
3129 				vm_object_t sp_object;
3130 				vm_object_offset_t sp_offset;
3131 
3132 				VME_OFFSET_SET(entry, 0);
3133 
3134 				/* allocate one superpage */
3135 				kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3136 				if (kr != KERN_SUCCESS) {
3137 					/* deallocate whole range... */
3138 					new_mapping_established = TRUE;
3139 					/* ... but only up to "tmp_end" */
3140 					size -= end - tmp_end;
3141 					RETURN(kr);
3142 				}
3143 
3144 				/* create one vm_object per superpage */
3145 				sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3146 				sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3147 				sp_object->phys_contiguous = TRUE;
3148 				sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3149 				VME_OBJECT_SET(entry, sp_object, false, 0);
3150 				assert(entry->use_pmap);
3151 
3152 				/* enter the base pages into the object */
3153 				vm_object_lock(sp_object);
3154 				for (sp_offset = 0;
3155 				    sp_offset < SUPERPAGE_SIZE;
3156 				    sp_offset += PAGE_SIZE) {
3157 					m = pages;
3158 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3159 					pages = NEXT_PAGE(m);
3160 					*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3161 					vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3162 				}
3163 				vm_object_unlock(sp_object);
3164 			}
3165 		} while (tmp_end != tmp2_end &&
3166 		    (tmp_start = tmp_end) &&
3167 		    (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3168 		    tmp_end + chunk_size : tmp2_end));
3169 	}
3170 
3171 	new_mapping_established = TRUE;
3172 
3173 BailOut:
3174 	assert(map_locked == TRUE);
3175 
3176 	/*
3177 	 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3178 	 * If we have identified and possibly established the new mapping(s),
3179 	 * make sure we did not go beyond the address space limit.
3180 	 */
3181 	if (result == KERN_SUCCESS) {
3182 		if (map->size_limit != RLIM_INFINITY &&
3183 		    map->size > map->size_limit) {
3184 			/*
3185 			 * Establishing the requested mappings would exceed
3186 			 * the process's RLIMIT_AS limit: fail with
3187 			 * KERN_NO_SPACE.
3188 			 */
3189 			result = KERN_NO_SPACE;
3190 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3191 			    proc_selfpid(),
3192 			    (get_bsdtask_info(current_task())
3193 			    ? proc_name_address(get_bsdtask_info(current_task()))
3194 			    : "?"),
3195 			    __FUNCTION__,
3196 			    (uint64_t) map->size,
3197 			    (uint64_t) map->size_limit);
3198 			DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3199 			    vm_map_size_t, map->size,
3200 			    uint64_t, map->size_limit);
3201 			vm_map_enter_RLIMIT_AS_count++;
3202 		} else if (map->data_limit != RLIM_INFINITY &&
3203 		    map->size > map->data_limit) {
3204 			/*
3205 			 * Establishing the requested mappings would exceed
3206 			 * the process's RLIMIT_DATA limit: fail with
3207 			 * KERN_NO_SPACE.
3208 			 */
3209 			result = KERN_NO_SPACE;
3210 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3211 			    proc_selfpid(),
3212 			    (get_bsdtask_info(current_task())
3213 			    ? proc_name_address(get_bsdtask_info(current_task()))
3214 			    : "?"),
3215 			    __FUNCTION__,
3216 			    (uint64_t) map->size,
3217 			    (uint64_t) map->data_limit);
3218 			DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3219 			    vm_map_size_t, map->size,
3220 			    uint64_t, map->data_limit);
3221 			vm_map_enter_RLIMIT_DATA_count++;
3222 		}
3223 	}
3224 
3225 	if (result == KERN_SUCCESS) {
3226 		vm_prot_t pager_prot;
3227 		memory_object_t pager;
3228 
3229 #if DEBUG
3230 		if (pmap_empty &&
3231 		    !(vmk_flags.vmkf_no_pmap_check)) {
3232 			assert(pmap_is_empty(map->pmap,
3233 			    *address,
3234 			    *address + size));
3235 		}
3236 #endif /* DEBUG */
3237 
3238 		/*
3239 		 * For "named" VM objects, let the pager know that the
3240 		 * memory object is being mapped.  Some pagers need to keep
3241 		 * track of this, to know when they can reclaim the memory
3242 		 * object, for example.
3243 		 * VM calls memory_object_map() for each mapping (specifying
3244 		 * the protection of each mapping) and calls
3245 		 * memory_object_last_unmap() when all the mappings are gone.
3246 		 */
3247 		pager_prot = max_protection;
3248 		if (needs_copy) {
3249 			/*
3250 			 * Copy-On-Write mapping: won't modify
3251 			 * the memory object.
3252 			 */
3253 			pager_prot &= ~VM_PROT_WRITE;
3254 		}
3255 		if (!is_submap &&
3256 		    object != VM_OBJECT_NULL &&
3257 		    object->named &&
3258 		    object->pager != MEMORY_OBJECT_NULL) {
3259 			vm_object_lock(object);
3260 			pager = object->pager;
3261 			if (object->named &&
3262 			    pager != MEMORY_OBJECT_NULL) {
3263 				assert(object->pager_ready);
3264 				vm_object_mapping_wait(object, THREAD_UNINT);
3265 				vm_object_mapping_begin(object);
3266 				vm_object_unlock(object);
3267 
3268 				kr = memory_object_map(pager, pager_prot);
3269 				assert(kr == KERN_SUCCESS);
3270 
3271 				vm_object_lock(object);
3272 				vm_object_mapping_end(object);
3273 			}
3274 			vm_object_unlock(object);
3275 		}
3276 	}
3277 
3278 	assert(map_locked == TRUE);
3279 
3280 	if (new_mapping_established) {
3281 		/*
3282 		 * If we release the map lock for any reason below,
3283 		 * another thread could deallocate our new mapping,
3284 		 * releasing the caller's reference on "caller_object",
3285 		 * which was transferred to the mapping.
3286 		 * If this was the only reference, the object could be
3287 		 * destroyed.
3288 		 *
3289 		 * We need to take an extra reference on "caller_object"
3290 		 * to keep it alive if we need to return the caller's
3291 		 * reference to the caller in case of failure.
3292 		 */
3293 		if (is_submap) {
3294 			vm_map_reference((vm_map_t)caller_object);
3295 		} else {
3296 			vm_object_reference(caller_object);
3297 		}
3298 	}
3299 
3300 	if (!keep_map_locked) {
3301 		vm_map_unlock(map);
3302 		map_locked = FALSE;
3303 		entry = VM_MAP_ENTRY_NULL;
3304 		new_entry = VM_MAP_ENTRY_NULL;
3305 	}
3306 
3307 	/*
3308 	 * We can't hold the map lock if we enter this block.
3309 	 */
3310 
3311 	if (result == KERN_SUCCESS) {
3312 		/*	Wire down the new entry if the user
3313 		 *	requested all new map entries be wired.
3314 		 */
3315 		if ((map->wiring_required) || (superpage_size)) {
3316 			assert(!keep_map_locked);
3317 			pmap_empty = FALSE; /* pmap won't be empty */
3318 			kr = vm_map_wire_kernel(map, start, end,
3319 			    cur_protection, VM_KERN_MEMORY_MLOCK,
3320 			    TRUE);
3321 			result = kr;
3322 		}
3323 
3324 	}
3325 
3326 	if (result != KERN_SUCCESS) {
3327 		if (new_mapping_established) {
3328 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3329 
3330 			/*
3331 			 * We have to get rid of the new mappings since we
3332 			 * won't make them available to the user.
3333 			 * Try and do that atomically, to minimize the risk
3334 			 * that someone else create new mappings that range.
3335 			 */
3336 			if (!map_locked) {
3337 				vm_map_lock(map);
3338 				map_locked = TRUE;
3339 			}
3340 			remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3341 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3342 			if (permanent) {
3343 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3344 			}
3345 			(void) vm_map_delete(map,
3346 			    *address, *address + size,
3347 			    remove_flags,
3348 			    KMEM_GUARD_NONE, &zap_new_list);
3349 		}
3350 
3351 		if (vm_map_zap_first_entry(&zap_old_list)) {
3352 			vm_map_entry_t entry1, entry2;
3353 
3354 			/*
3355 			 * The new mapping failed.  Attempt to restore
3356 			 * the old mappings, saved in the "zap_old_map".
3357 			 */
3358 			if (!map_locked) {
3359 				vm_map_lock(map);
3360 				map_locked = TRUE;
3361 			}
3362 
3363 			/* first check if the coast is still clear */
3364 			start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3365 			end   = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3366 
3367 			if (vm_map_lookup_entry(map, start, &entry1) ||
3368 			    vm_map_lookup_entry(map, end, &entry2) ||
3369 			    entry1 != entry2) {
3370 				/*
3371 				 * Part of that range has already been
3372 				 * re-mapped:  we can't restore the old
3373 				 * mappings...
3374 				 */
3375 				vm_map_enter_restore_failures++;
3376 			} else {
3377 				/*
3378 				 * Transfer the saved map entries from
3379 				 * "zap_old_map" to the original "map",
3380 				 * inserting them all after "entry1".
3381 				 */
3382 				while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3383 					vm_map_size_t entry_size;
3384 
3385 					entry_size = (entry2->vme_end -
3386 					    entry2->vme_start);
3387 					vm_map_store_entry_link(map, entry1, entry2,
3388 					    VM_MAP_KERNEL_FLAGS_NONE);
3389 					map->size += entry_size;
3390 					entry1 = entry2;
3391 				}
3392 				if (map->wiring_required) {
3393 					/*
3394 					 * XXX TODO: we should rewire the
3395 					 * old pages here...
3396 					 */
3397 				}
3398 				vm_map_enter_restore_successes++;
3399 			}
3400 		}
3401 	}
3402 
3403 	/*
3404 	 * The caller is responsible for releasing the lock if it requested to
3405 	 * keep the map locked.
3406 	 */
3407 	if (map_locked && !keep_map_locked) {
3408 		vm_map_unlock(map);
3409 	}
3410 
3411 	vm_map_zap_dispose(&zap_old_list);
3412 	vm_map_zap_dispose(&zap_new_list);
3413 
3414 	if (new_mapping_established) {
3415 		/*
3416 		 * The caller had a reference on "caller_object" and we
3417 		 * transferred that reference to the mapping.
3418 		 * We also took an extra reference on "caller_object" to keep
3419 		 * it alive while the map was unlocked.
3420 		 */
3421 		if (result == KERN_SUCCESS) {
3422 			/*
3423 			 * On success, the caller's reference on the object gets
3424 			 * tranferred to the mapping.
3425 			 * Release our extra reference.
3426 			 */
3427 			if (is_submap) {
3428 				vm_map_deallocate((vm_map_t)caller_object);
3429 			} else {
3430 				vm_object_deallocate(caller_object);
3431 			}
3432 		} else {
3433 			/*
3434 			 * On error, the caller expects to still have a
3435 			 * reference on the object it gave us.
3436 			 * Let's use our extra reference for that.
3437 			 */
3438 		}
3439 	}
3440 
3441 	return result;
3442 
3443 #undef  RETURN
3444 }
3445 
3446 #if __arm64__
3447 extern const struct memory_object_pager_ops fourk_pager_ops;
3448 kern_return_t
vm_map_enter_fourk(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t alias,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)3449 vm_map_enter_fourk(
3450 	vm_map_t                map,
3451 	vm_map_offset_t         *address,       /* IN/OUT */
3452 	vm_map_size_t           size,
3453 	vm_map_offset_t         mask,
3454 	int                     flags,
3455 	vm_map_kernel_flags_t   vmk_flags,
3456 	vm_tag_t                alias,
3457 	vm_object_t             object,
3458 	vm_object_offset_t      offset,
3459 	boolean_t               needs_copy,
3460 	vm_prot_t               cur_protection,
3461 	vm_prot_t               max_protection,
3462 	vm_inherit_t            inheritance)
3463 {
3464 	vm_map_entry_t          entry, new_entry;
3465 	vm_map_offset_t         start, fourk_start;
3466 	vm_map_offset_t         end, fourk_end;
3467 	vm_map_size_t           fourk_size;
3468 	kern_return_t           result = KERN_SUCCESS;
3469 	boolean_t               map_locked = FALSE;
3470 	boolean_t               pmap_empty = TRUE;
3471 	boolean_t               new_mapping_established = FALSE;
3472 	boolean_t               keep_map_locked = vmk_flags.vmkf_keep_map_locked;
3473 	boolean_t               anywhere = ((flags & VM_FLAGS_ANYWHERE) != 0);
3474 	boolean_t               purgable = ((flags & VM_FLAGS_PURGABLE) != 0);
3475 	boolean_t               overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0);
3476 	boolean_t               no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0);
3477 	const boolean_t         is_submap = vmk_flags.vmkf_submap;
3478 	boolean_t               permanent = (((flags & VM_FLAGS_PERMANENT) != 0) || vmk_flags.vmkf_permanent);
3479 	const boolean_t         entry_for_jit = vmk_flags.vmkf_map_jit;
3480 //	boolean_t		iokit_acct = vmk_flags.vmkf_iokit_acct;
3481 	unsigned int            superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT);
3482 	vm_map_offset_t         effective_min_offset, effective_max_offset;
3483 	kern_return_t           kr;
3484 	boolean_t               clear_map_aligned = FALSE;
3485 	memory_object_t         fourk_mem_obj;
3486 	vm_object_t             fourk_object;
3487 	vm_map_offset_t         fourk_pager_offset;
3488 	int                     fourk_pager_index_start, fourk_pager_index_num;
3489 	int                     cur_idx;
3490 	boolean_t               fourk_copy;
3491 	vm_object_t             copy_object;
3492 	vm_object_offset_t      copy_offset;
3493 	VM_MAP_ZAP_DECLARE(zap_list);
3494 
3495 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
3496 		panic("%s:%d", __FUNCTION__, __LINE__);
3497 	}
3498 	fourk_mem_obj = MEMORY_OBJECT_NULL;
3499 	fourk_object = VM_OBJECT_NULL;
3500 
3501 	if (superpage_size) {
3502 		return KERN_NOT_SUPPORTED;
3503 	}
3504 
3505 	if ((cur_protection & VM_PROT_WRITE) &&
3506 	    (cur_protection & VM_PROT_EXECUTE) &&
3507 #if XNU_TARGET_OS_OSX
3508 	    map->pmap != kernel_pmap &&
3509 	    (vm_map_cs_enforcement(map)
3510 #if __arm64__
3511 	    || !VM_MAP_IS_EXOTIC(map)
3512 #endif /* __arm64__ */
3513 	    ) &&
3514 #endif /* XNU_TARGET_OS_OSX */
3515 	    !entry_for_jit) {
3516 		DTRACE_VM3(cs_wx,
3517 		    uint64_t, 0,
3518 		    uint64_t, 0,
3519 		    vm_prot_t, cur_protection);
3520 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
3521 		    "turning off execute\n",
3522 		    proc_selfpid(),
3523 		    (get_bsdtask_info(current_task())
3524 		    ? proc_name_address(get_bsdtask_info(current_task()))
3525 		    : "?"),
3526 		    __FUNCTION__);
3527 		cur_protection &= ~VM_PROT_EXECUTE;
3528 	}
3529 
3530 	/*
3531 	 * If the task has requested executable lockdown,
3532 	 * deny any new executable mapping.
3533 	 */
3534 	if (map->map_disallow_new_exec == TRUE) {
3535 		if (cur_protection & VM_PROT_EXECUTE) {
3536 			return KERN_PROTECTION_FAILURE;
3537 		}
3538 	}
3539 
3540 	if (is_submap) {
3541 		return KERN_NOT_SUPPORTED;
3542 	}
3543 	if (vmk_flags.vmkf_already) {
3544 		return KERN_NOT_SUPPORTED;
3545 	}
3546 	if (purgable || entry_for_jit) {
3547 		return KERN_NOT_SUPPORTED;
3548 	}
3549 
3550 	effective_min_offset = map->min_offset;
3551 
3552 	if (vmk_flags.vmkf_beyond_max) {
3553 		return KERN_NOT_SUPPORTED;
3554 	} else {
3555 		effective_max_offset = map->max_offset;
3556 	}
3557 
3558 	if (size == 0 ||
3559 	    (offset & FOURK_PAGE_MASK) != 0) {
3560 		*address = 0;
3561 		return KERN_INVALID_ARGUMENT;
3562 	}
3563 
3564 #define RETURN(value)   { result = value; goto BailOut; }
3565 
3566 	assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK));
3567 	assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK));
3568 
3569 	if (!anywhere && overwrite) {
3570 		return KERN_NOT_SUPPORTED;
3571 	}
3572 
3573 	fourk_start = *address;
3574 	fourk_size = size;
3575 	fourk_end = fourk_start + fourk_size;
3576 
3577 	start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map));
3578 	end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map));
3579 	size = end - start;
3580 
3581 	if (anywhere) {
3582 		return KERN_NOT_SUPPORTED;
3583 	} else {
3584 		/*
3585 		 *	Verify that:
3586 		 *		the address doesn't itself violate
3587 		 *		the mask requirement.
3588 		 */
3589 
3590 		vm_map_lock(map);
3591 		map_locked = TRUE;
3592 		if ((start & mask) != 0) {
3593 			RETURN(KERN_NO_SPACE);
3594 		}
3595 
3596 		/*
3597 		 *	...	the address is within bounds
3598 		 */
3599 
3600 		end = start + size;
3601 
3602 		if ((start < effective_min_offset) ||
3603 		    (end > effective_max_offset) ||
3604 		    (start >= end)) {
3605 			RETURN(KERN_INVALID_ADDRESS);
3606 		}
3607 
3608 		/*
3609 		 *	...	the starting address isn't allocated
3610 		 */
3611 		if (vm_map_lookup_entry(map, start, &entry)) {
3612 			vm_object_t cur_object, shadow_object;
3613 
3614 			/*
3615 			 * We might already some 4K mappings
3616 			 * in a 16K page here.
3617 			 */
3618 
3619 			if (entry->vme_end - entry->vme_start
3620 			    != SIXTEENK_PAGE_SIZE) {
3621 				RETURN(KERN_NO_SPACE);
3622 			}
3623 			if (entry->is_sub_map) {
3624 				RETURN(KERN_NO_SPACE);
3625 			}
3626 			if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
3627 				RETURN(KERN_NO_SPACE);
3628 			}
3629 
3630 			/* go all the way down the shadow chain */
3631 			cur_object = VME_OBJECT(entry);
3632 			vm_object_lock(cur_object);
3633 			while (cur_object->shadow != VM_OBJECT_NULL) {
3634 				shadow_object = cur_object->shadow;
3635 				vm_object_lock(shadow_object);
3636 				vm_object_unlock(cur_object);
3637 				cur_object = shadow_object;
3638 				shadow_object = VM_OBJECT_NULL;
3639 			}
3640 			if (cur_object->internal ||
3641 			    cur_object->pager == NULL) {
3642 				vm_object_unlock(cur_object);
3643 				RETURN(KERN_NO_SPACE);
3644 			}
3645 			if (cur_object->pager->mo_pager_ops
3646 			    != &fourk_pager_ops) {
3647 				vm_object_unlock(cur_object);
3648 				RETURN(KERN_NO_SPACE);
3649 			}
3650 			fourk_object = cur_object;
3651 			fourk_mem_obj = fourk_object->pager;
3652 
3653 			/* keep the "4K" object alive */
3654 			vm_object_reference_locked(fourk_object);
3655 			memory_object_reference(fourk_mem_obj);
3656 			vm_object_unlock(fourk_object);
3657 
3658 			/* merge permissions */
3659 			entry->protection |= cur_protection;
3660 			entry->max_protection |= max_protection;
3661 
3662 			if ((entry->protection & VM_PROT_WRITE) &&
3663 			    (entry->protection & VM_PROT_ALLEXEC) &&
3664 			    fourk_binary_compatibility_unsafe &&
3665 			    fourk_binary_compatibility_allow_wx) {
3666 				/* write+execute: need to be "jit" */
3667 				entry->used_for_jit = TRUE;
3668 			}
3669 			goto map_in_fourk_pager;
3670 		}
3671 
3672 		/*
3673 		 *	...	the next region doesn't overlap the
3674 		 *		end point.
3675 		 */
3676 
3677 		if ((entry->vme_next != vm_map_to_entry(map)) &&
3678 		    (entry->vme_next->vme_start < end)) {
3679 			RETURN(KERN_NO_SPACE);
3680 		}
3681 	}
3682 
3683 	/*
3684 	 *	At this point,
3685 	 *		"start" and "end" should define the endpoints of the
3686 	 *			available new range, and
3687 	 *		"entry" should refer to the region before the new
3688 	 *			range, and
3689 	 *
3690 	 *		the map should be locked.
3691 	 */
3692 
3693 	/* create a new "4K" pager */
3694 	fourk_mem_obj = fourk_pager_create();
3695 	fourk_object = fourk_pager_to_vm_object(fourk_mem_obj);
3696 	assert(fourk_object);
3697 
3698 	/* keep the "4" object alive */
3699 	vm_object_reference(fourk_object);
3700 
3701 	/* create a "copy" object, to map the "4K" object copy-on-write */
3702 	fourk_copy = TRUE;
3703 	result = vm_object_copy_strategically(fourk_object,
3704 	    0,
3705 	    end - start,
3706 	    &copy_object,
3707 	    &copy_offset,
3708 	    &fourk_copy);
3709 	assert(result == KERN_SUCCESS);
3710 	assert(copy_object != VM_OBJECT_NULL);
3711 	assert(copy_offset == 0);
3712 
3713 	/* map the "4K" pager's copy object */
3714 	new_entry = vm_map_entry_insert(map,
3715 	    entry,
3716 	    vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)),
3717 	    vm_map_round_page(end, VM_MAP_PAGE_MASK(map)),
3718 	    copy_object,
3719 	    0,                      /* offset */
3720 	    vmk_flags,
3721 	    FALSE,                  /* needs_copy */
3722 	    cur_protection, max_protection,
3723 	    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3724 	    VM_INHERIT_NONE : inheritance),
3725 	    no_cache,
3726 	    permanent,
3727 	    superpage_size,
3728 	    clear_map_aligned,
3729 	    alias);
3730 	entry = new_entry;
3731 
3732 #if VM_MAP_DEBUG_FOURK
3733 	if (vm_map_debug_fourk) {
3734 		printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n",
3735 		    map,
3736 		    (uint64_t) entry->vme_start,
3737 		    (uint64_t) entry->vme_end,
3738 		    fourk_mem_obj);
3739 	}
3740 #endif /* VM_MAP_DEBUG_FOURK */
3741 
3742 	new_mapping_established = TRUE;
3743 
3744 map_in_fourk_pager:
3745 	/* "map" the original "object" where it belongs in the "4K" pager */
3746 	fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK);
3747 	fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE);
3748 	if (fourk_size > SIXTEENK_PAGE_SIZE) {
3749 		fourk_pager_index_num = 4;
3750 	} else {
3751 		fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE);
3752 	}
3753 	if (fourk_pager_index_start + fourk_pager_index_num > 4) {
3754 		fourk_pager_index_num = 4 - fourk_pager_index_start;
3755 	}
3756 	for (cur_idx = 0;
3757 	    cur_idx < fourk_pager_index_num;
3758 	    cur_idx++) {
3759 		vm_object_t             old_object;
3760 		vm_object_offset_t      old_offset;
3761 
3762 		kr = fourk_pager_populate(fourk_mem_obj,
3763 		    TRUE,                       /* overwrite */
3764 		    fourk_pager_index_start + cur_idx,
3765 		    object,
3766 		    (object
3767 		    ? (offset +
3768 		    (cur_idx * FOURK_PAGE_SIZE))
3769 		    : 0),
3770 		    &old_object,
3771 		    &old_offset);
3772 #if VM_MAP_DEBUG_FOURK
3773 		if (vm_map_debug_fourk) {
3774 			if (old_object == (vm_object_t) -1 &&
3775 			    old_offset == (vm_object_offset_t) -1) {
3776 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3777 				    "pager [%p:0x%llx] "
3778 				    "populate[%d] "
3779 				    "[object:%p,offset:0x%llx]\n",
3780 				    map,
3781 				    (uint64_t) entry->vme_start,
3782 				    (uint64_t) entry->vme_end,
3783 				    fourk_mem_obj,
3784 				    VME_OFFSET(entry),
3785 				    fourk_pager_index_start + cur_idx,
3786 				    object,
3787 				    (object
3788 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3789 				    : 0));
3790 			} else {
3791 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3792 				    "pager [%p:0x%llx] "
3793 				    "populate[%d] [object:%p,offset:0x%llx] "
3794 				    "old [%p:0x%llx]\n",
3795 				    map,
3796 				    (uint64_t) entry->vme_start,
3797 				    (uint64_t) entry->vme_end,
3798 				    fourk_mem_obj,
3799 				    VME_OFFSET(entry),
3800 				    fourk_pager_index_start + cur_idx,
3801 				    object,
3802 				    (object
3803 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3804 				    : 0),
3805 				    old_object,
3806 				    old_offset);
3807 			}
3808 		}
3809 #endif /* VM_MAP_DEBUG_FOURK */
3810 
3811 		assert(kr == KERN_SUCCESS);
3812 		if (object != old_object &&
3813 		    object != VM_OBJECT_NULL &&
3814 		    object != (vm_object_t) -1) {
3815 			vm_object_reference(object);
3816 		}
3817 		if (object != old_object &&
3818 		    old_object != VM_OBJECT_NULL &&
3819 		    old_object != (vm_object_t) -1) {
3820 			vm_object_deallocate(old_object);
3821 		}
3822 	}
3823 
3824 BailOut:
3825 	assert(map_locked == TRUE);
3826 
3827 	if (result == KERN_SUCCESS) {
3828 		vm_prot_t pager_prot;
3829 		memory_object_t pager;
3830 
3831 #if DEBUG
3832 		if (pmap_empty &&
3833 		    !(vmk_flags.vmkf_no_pmap_check)) {
3834 			assert(pmap_is_empty(map->pmap,
3835 			    *address,
3836 			    *address + size));
3837 		}
3838 #endif /* DEBUG */
3839 
3840 		/*
3841 		 * For "named" VM objects, let the pager know that the
3842 		 * memory object is being mapped.  Some pagers need to keep
3843 		 * track of this, to know when they can reclaim the memory
3844 		 * object, for example.
3845 		 * VM calls memory_object_map() for each mapping (specifying
3846 		 * the protection of each mapping) and calls
3847 		 * memory_object_last_unmap() when all the mappings are gone.
3848 		 */
3849 		pager_prot = max_protection;
3850 		if (needs_copy) {
3851 			/*
3852 			 * Copy-On-Write mapping: won't modify
3853 			 * the memory object.
3854 			 */
3855 			pager_prot &= ~VM_PROT_WRITE;
3856 		}
3857 		if (!is_submap &&
3858 		    object != VM_OBJECT_NULL &&
3859 		    object->named &&
3860 		    object->pager != MEMORY_OBJECT_NULL) {
3861 			vm_object_lock(object);
3862 			pager = object->pager;
3863 			if (object->named &&
3864 			    pager != MEMORY_OBJECT_NULL) {
3865 				assert(object->pager_ready);
3866 				vm_object_mapping_wait(object, THREAD_UNINT);
3867 				vm_object_mapping_begin(object);
3868 				vm_object_unlock(object);
3869 
3870 				kr = memory_object_map(pager, pager_prot);
3871 				assert(kr == KERN_SUCCESS);
3872 
3873 				vm_object_lock(object);
3874 				vm_object_mapping_end(object);
3875 			}
3876 			vm_object_unlock(object);
3877 		}
3878 		if (!is_submap &&
3879 		    fourk_object != VM_OBJECT_NULL &&
3880 		    fourk_object->named &&
3881 		    fourk_object->pager != MEMORY_OBJECT_NULL) {
3882 			vm_object_lock(fourk_object);
3883 			pager = fourk_object->pager;
3884 			if (fourk_object->named &&
3885 			    pager != MEMORY_OBJECT_NULL) {
3886 				assert(fourk_object->pager_ready);
3887 				vm_object_mapping_wait(fourk_object,
3888 				    THREAD_UNINT);
3889 				vm_object_mapping_begin(fourk_object);
3890 				vm_object_unlock(fourk_object);
3891 
3892 				kr = memory_object_map(pager, VM_PROT_READ);
3893 				assert(kr == KERN_SUCCESS);
3894 
3895 				vm_object_lock(fourk_object);
3896 				vm_object_mapping_end(fourk_object);
3897 			}
3898 			vm_object_unlock(fourk_object);
3899 		}
3900 	}
3901 
3902 	if (fourk_object != VM_OBJECT_NULL) {
3903 		vm_object_deallocate(fourk_object);
3904 		fourk_object = VM_OBJECT_NULL;
3905 		memory_object_deallocate(fourk_mem_obj);
3906 		fourk_mem_obj = MEMORY_OBJECT_NULL;
3907 	}
3908 
3909 	assert(map_locked == TRUE);
3910 
3911 	if (!keep_map_locked) {
3912 		vm_map_unlock(map);
3913 		map_locked = FALSE;
3914 	}
3915 
3916 	/*
3917 	 * We can't hold the map lock if we enter this block.
3918 	 */
3919 
3920 	if (result == KERN_SUCCESS) {
3921 		/*	Wire down the new entry if the user
3922 		 *	requested all new map entries be wired.
3923 		 */
3924 		if ((map->wiring_required) || (superpage_size)) {
3925 			assert(!keep_map_locked);
3926 			pmap_empty = FALSE; /* pmap won't be empty */
3927 			kr = vm_map_wire_kernel(map, start, end,
3928 			    new_entry->protection, VM_KERN_MEMORY_MLOCK,
3929 			    TRUE);
3930 			result = kr;
3931 		}
3932 
3933 	}
3934 
3935 	if (result != KERN_SUCCESS) {
3936 		if (new_mapping_established) {
3937 			/*
3938 			 * We have to get rid of the new mappings since we
3939 			 * won't make them available to the user.
3940 			 * Try and do that atomically, to minimize the risk
3941 			 * that someone else create new mappings that range.
3942 			 */
3943 
3944 			if (!map_locked) {
3945 				vm_map_lock(map);
3946 				map_locked = TRUE;
3947 			}
3948 			(void)vm_map_delete(map, *address, *address + size,
3949 			    VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
3950 			    KMEM_GUARD_NONE, &zap_list);
3951 		}
3952 	}
3953 
3954 	/*
3955 	 * The caller is responsible for releasing the lock if it requested to
3956 	 * keep the map locked.
3957 	 */
3958 	if (map_locked && !keep_map_locked) {
3959 		vm_map_unlock(map);
3960 	}
3961 
3962 	vm_map_zap_dispose(&zap_list);
3963 
3964 	return result;
3965 
3966 #undef  RETURN
3967 }
3968 #endif /* __arm64__ */
3969 
3970 /*
3971  * Counters for the prefault optimization.
3972  */
3973 int64_t vm_prefault_nb_pages = 0;
3974 int64_t vm_prefault_nb_bailout = 0;
3975 
3976 static kern_return_t
vm_map_enter_mem_object_helper(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t tag,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,upl_page_list_ptr_t page_list,unsigned int page_list_count)3977 vm_map_enter_mem_object_helper(
3978 	vm_map_t                target_map,
3979 	vm_map_offset_t         *address,
3980 	vm_map_size_t           initial_size,
3981 	vm_map_offset_t         mask,
3982 	int                     flags,
3983 	vm_map_kernel_flags_t   vmk_flags,
3984 	vm_tag_t                tag,
3985 	ipc_port_t              port,
3986 	vm_object_offset_t      offset,
3987 	boolean_t               copy,
3988 	vm_prot_t               cur_protection,
3989 	vm_prot_t               max_protection,
3990 	vm_inherit_t            inheritance,
3991 	upl_page_list_ptr_t     page_list,
3992 	unsigned int            page_list_count)
3993 {
3994 	vm_map_address_t        map_addr;
3995 	vm_map_size_t           map_size;
3996 	vm_object_t             object;
3997 	vm_object_size_t        size;
3998 	kern_return_t           result;
3999 	boolean_t               mask_cur_protection, mask_max_protection;
4000 	boolean_t               kernel_prefault, try_prefault = (page_list_count != 0);
4001 	vm_map_offset_t         offset_in_mapping = 0;
4002 #if __arm64__
4003 	boolean_t               fourk = vmk_flags.vmkf_fourk;
4004 #endif /* __arm64__ */
4005 
4006 	if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4007 		/* XXX TODO4K prefaulting depends on page size... */
4008 		try_prefault = FALSE;
4009 	}
4010 
4011 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
4012 
4013 	mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4014 	mask_max_protection = max_protection & VM_PROT_IS_MASK;
4015 	cur_protection &= ~VM_PROT_IS_MASK;
4016 	max_protection &= ~VM_PROT_IS_MASK;
4017 
4018 	/*
4019 	 * Check arguments for validity
4020 	 */
4021 	if ((target_map == VM_MAP_NULL) ||
4022 	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4023 	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4024 	    (inheritance > VM_INHERIT_LAST_VALID) ||
4025 	    (try_prefault && (copy || !page_list)) ||
4026 	    initial_size == 0) {
4027 		return KERN_INVALID_ARGUMENT;
4028 	}
4029 
4030 #if __arm64__
4031 	if (cur_protection & VM_PROT_EXECUTE) {
4032 		cur_protection |= VM_PROT_READ;
4033 	}
4034 
4035 	if (fourk && VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4036 		/* no "fourk" if map is using a sub-page page size */
4037 		fourk = FALSE;
4038 	}
4039 	if (fourk) {
4040 		map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK);
4041 		map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK);
4042 	} else
4043 #endif /* __arm64__ */
4044 	{
4045 		map_addr = vm_map_trunc_page(*address,
4046 		    VM_MAP_PAGE_MASK(target_map));
4047 		map_size = vm_map_round_page(initial_size,
4048 		    VM_MAP_PAGE_MASK(target_map));
4049 	}
4050 	size = vm_object_round_page(initial_size);
4051 
4052 	/*
4053 	 * Find the vm object (if any) corresponding to this port.
4054 	 */
4055 	if (!IP_VALID(port)) {
4056 		object = VM_OBJECT_NULL;
4057 		offset = 0;
4058 		copy = FALSE;
4059 	} else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4060 		vm_named_entry_t        named_entry;
4061 		vm_object_offset_t      data_offset;
4062 
4063 		named_entry = mach_memory_entry_from_port(port);
4064 
4065 		if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4066 		    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4067 			data_offset = named_entry->data_offset;
4068 			offset += named_entry->data_offset;
4069 		} else {
4070 			data_offset = 0;
4071 		}
4072 
4073 		/* a few checks to make sure user is obeying rules */
4074 		if (size == 0) {
4075 			if (offset >= named_entry->size) {
4076 				return KERN_INVALID_RIGHT;
4077 			}
4078 			size = named_entry->size - offset;
4079 		}
4080 		if (mask_max_protection) {
4081 			max_protection &= named_entry->protection;
4082 		}
4083 		if (mask_cur_protection) {
4084 			cur_protection &= named_entry->protection;
4085 		}
4086 		if ((named_entry->protection & max_protection) !=
4087 		    max_protection) {
4088 			return KERN_INVALID_RIGHT;
4089 		}
4090 		if ((named_entry->protection & cur_protection) !=
4091 		    cur_protection) {
4092 			return KERN_INVALID_RIGHT;
4093 		}
4094 		if (offset + size < offset) {
4095 			/* overflow */
4096 			return KERN_INVALID_ARGUMENT;
4097 		}
4098 		if (named_entry->size < (offset + initial_size)) {
4099 			return KERN_INVALID_ARGUMENT;
4100 		}
4101 
4102 		if (named_entry->is_copy) {
4103 			/* for a vm_map_copy, we can only map it whole */
4104 			if ((size != named_entry->size) &&
4105 			    (vm_map_round_page(size,
4106 			    VM_MAP_PAGE_MASK(target_map)) ==
4107 			    named_entry->size)) {
4108 				/* XXX FBDP use the rounded size... */
4109 				size = vm_map_round_page(
4110 					size,
4111 					VM_MAP_PAGE_MASK(target_map));
4112 			}
4113 		}
4114 
4115 		/* the callers parameter offset is defined to be the */
4116 		/* offset from beginning of named entry offset in object */
4117 		offset = offset + named_entry->offset;
4118 
4119 		if (!VM_MAP_PAGE_ALIGNED(size,
4120 		    VM_MAP_PAGE_MASK(target_map))) {
4121 			/*
4122 			 * Let's not map more than requested;
4123 			 * vm_map_enter() will handle this "not map-aligned"
4124 			 * case.
4125 			 */
4126 			map_size = size;
4127 		}
4128 
4129 		named_entry_lock(named_entry);
4130 		if (named_entry->is_sub_map) {
4131 			vm_map_t                submap;
4132 
4133 			if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4134 			    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4135 				panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4136 			}
4137 
4138 			submap = named_entry->backing.map;
4139 			vm_map_reference(submap);
4140 			named_entry_unlock(named_entry);
4141 
4142 			vmk_flags.vmkf_submap = TRUE;
4143 
4144 			result = vm_map_enter(target_map,
4145 			    &map_addr,
4146 			    map_size,
4147 			    mask,
4148 			    flags,
4149 			    vmk_flags,
4150 			    tag,
4151 			    (vm_object_t)(uintptr_t) submap,
4152 			    offset,
4153 			    copy,
4154 			    cur_protection,
4155 			    max_protection,
4156 			    inheritance);
4157 			if (result != KERN_SUCCESS) {
4158 				vm_map_deallocate(submap);
4159 			} else {
4160 				/*
4161 				 * No need to lock "submap" just to check its
4162 				 * "mapped" flag: that flag is never reset
4163 				 * once it's been set and if we race, we'll
4164 				 * just end up setting it twice, which is OK.
4165 				 */
4166 				if (submap->mapped_in_other_pmaps == FALSE &&
4167 				    vm_map_pmap(submap) != PMAP_NULL &&
4168 				    vm_map_pmap(submap) !=
4169 				    vm_map_pmap(target_map)) {
4170 					/*
4171 					 * This submap is being mapped in a map
4172 					 * that uses a different pmap.
4173 					 * Set its "mapped_in_other_pmaps" flag
4174 					 * to indicate that we now need to
4175 					 * remove mappings from all pmaps rather
4176 					 * than just the submap's pmap.
4177 					 */
4178 					vm_map_lock(submap);
4179 					submap->mapped_in_other_pmaps = TRUE;
4180 					vm_map_unlock(submap);
4181 				}
4182 				*address = map_addr;
4183 			}
4184 			return result;
4185 		} else if (named_entry->is_copy) {
4186 			kern_return_t   kr;
4187 			vm_map_copy_t   copy_map;
4188 			vm_map_entry_t  copy_entry;
4189 			vm_map_offset_t copy_addr;
4190 			vm_map_copy_t   target_copy_map;
4191 			vm_map_offset_t overmap_start, overmap_end;
4192 			vm_map_offset_t trimmed_start;
4193 			vm_map_size_t   target_size;
4194 
4195 			if (flags & ~(VM_FLAGS_FIXED |
4196 			    VM_FLAGS_ANYWHERE |
4197 			    VM_FLAGS_OVERWRITE |
4198 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4199 			    VM_FLAGS_RETURN_DATA_ADDR |
4200 			    VM_FLAGS_ALIAS_MASK)) {
4201 				named_entry_unlock(named_entry);
4202 				return KERN_INVALID_ARGUMENT;
4203 			}
4204 
4205 			copy_map = named_entry->backing.copy;
4206 			assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4207 			if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4208 				/* unsupported type; should not happen */
4209 				printf("vm_map_enter_mem_object: "
4210 				    "memory_entry->backing.copy "
4211 				    "unsupported type 0x%x\n",
4212 				    copy_map->type);
4213 				named_entry_unlock(named_entry);
4214 				return KERN_INVALID_ARGUMENT;
4215 			}
4216 
4217 			if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4218 				DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4219 			}
4220 
4221 			if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4222 			    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4223 				offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map);
4224 				if (flags & VM_FLAGS_RETURN_4K_DATA_ADDR) {
4225 					offset_in_mapping &= ~((signed)(0xFFF));
4226 				}
4227 			}
4228 
4229 			target_copy_map = VM_MAP_COPY_NULL;
4230 			target_size = copy_map->size;
4231 			overmap_start = 0;
4232 			overmap_end = 0;
4233 			trimmed_start = 0;
4234 			if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4235 				DEBUG4K_ADJUST("adjusting...\n");
4236 				kr = vm_map_copy_adjust_to_target(
4237 					copy_map,
4238 					offset /* includes data_offset */,
4239 					initial_size,
4240 					target_map,
4241 					copy,
4242 					&target_copy_map,
4243 					&overmap_start,
4244 					&overmap_end,
4245 					&trimmed_start);
4246 				if (kr != KERN_SUCCESS) {
4247 					named_entry_unlock(named_entry);
4248 					return kr;
4249 				}
4250 				target_size = target_copy_map->size;
4251 				if (trimmed_start >= data_offset) {
4252 					data_offset = offset & VM_MAP_PAGE_MASK(target_map);
4253 				} else {
4254 					data_offset -= trimmed_start;
4255 				}
4256 			} else {
4257 				/*
4258 				 * Assert that the vm_map_copy is coming from the right
4259 				 * zone and hasn't been forged
4260 				 */
4261 				vm_map_copy_require(copy_map);
4262 				target_copy_map = copy_map;
4263 			}
4264 
4265 			/* reserve a contiguous range */
4266 			kr = vm_map_enter(target_map,
4267 			    &map_addr,
4268 			    vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)),
4269 			    mask,
4270 			    flags & (VM_FLAGS_ANYWHERE |
4271 			    VM_FLAGS_OVERWRITE |
4272 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4273 			    VM_FLAGS_RETURN_DATA_ADDR),
4274 			    vmk_flags,
4275 			    tag,
4276 			    VM_OBJECT_NULL,
4277 			    0,
4278 			    FALSE,               /* copy */
4279 			    cur_protection,
4280 			    max_protection,
4281 			    inheritance);
4282 			if (kr != KERN_SUCCESS) {
4283 				DEBUG4K_ERROR("kr 0x%x\n", kr);
4284 				if (target_copy_map != copy_map) {
4285 					vm_map_copy_discard(target_copy_map);
4286 					target_copy_map = VM_MAP_COPY_NULL;
4287 				}
4288 				named_entry_unlock(named_entry);
4289 				return kr;
4290 			}
4291 
4292 			copy_addr = map_addr;
4293 
4294 			for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4295 			    copy_entry != vm_map_copy_to_entry(target_copy_map);
4296 			    copy_entry = copy_entry->vme_next) {
4297 				int                     remap_flags;
4298 				vm_map_kernel_flags_t   vmk_remap_flags;
4299 				vm_map_t                copy_submap = VM_MAP_NULL;
4300 				vm_object_t             copy_object = VM_OBJECT_NULL;
4301 				vm_map_size_t           copy_size;
4302 				vm_object_offset_t      copy_offset;
4303 				int                     copy_vm_alias;
4304 				boolean_t               do_copy;
4305 
4306 				do_copy = FALSE;
4307 				remap_flags = 0;
4308 				vmk_remap_flags = VM_MAP_KERNEL_FLAGS_NONE;
4309 
4310 				if (copy_entry->is_sub_map) {
4311 					copy_submap = VME_SUBMAP(copy_entry);
4312 					copy_object = (vm_object_t)copy_submap;
4313 				} else {
4314 					copy_object = VME_OBJECT(copy_entry);
4315 				}
4316 				copy_offset = VME_OFFSET(copy_entry);
4317 				copy_size = (copy_entry->vme_end -
4318 				    copy_entry->vme_start);
4319 				VM_GET_FLAGS_ALIAS(flags, copy_vm_alias);
4320 				if (copy_vm_alias == 0) {
4321 					/*
4322 					 * Caller does not want a specific
4323 					 * alias for this new mapping:  use
4324 					 * the alias of the original mapping.
4325 					 */
4326 					copy_vm_alias = VME_ALIAS(copy_entry);
4327 				}
4328 
4329 				/* sanity check */
4330 				if ((copy_addr + copy_size) >
4331 				    (map_addr +
4332 				    overmap_start + overmap_end +
4333 				    named_entry->size /* XXX full size */)) {
4334 					/* over-mapping too much !? */
4335 					kr = KERN_INVALID_ARGUMENT;
4336 					DEBUG4K_ERROR("kr 0x%x\n", kr);
4337 					/* abort */
4338 					break;
4339 				}
4340 
4341 				/* take a reference on the object */
4342 				if (copy_entry->is_sub_map) {
4343 					vmk_remap_flags.vmkf_submap = TRUE;
4344 					vm_map_reference(copy_submap);
4345 				} else {
4346 					if (!copy &&
4347 					    copy_object != VM_OBJECT_NULL &&
4348 					    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4349 						/*
4350 						 * We need to resolve our side of this
4351 						 * "symmetric" copy-on-write now; we
4352 						 * need a new object to map and share,
4353 						 * instead of the current one which
4354 						 * might still be shared with the
4355 						 * original mapping.
4356 						 *
4357 						 * Note: A "vm_map_copy_t" does not
4358 						 * have a lock but we're protected by
4359 						 * the named entry's lock here.
4360 						 */
4361 						// assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4362 						VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4363 						assert(copy_object != VME_OBJECT(copy_entry));
4364 						if (!copy_entry->needs_copy &&
4365 						    copy_entry->protection & VM_PROT_WRITE) {
4366 							vm_prot_t prot;
4367 
4368 							prot = copy_entry->protection & ~VM_PROT_WRITE;
4369 							vm_object_pmap_protect(copy_object,
4370 							    copy_offset,
4371 							    copy_size,
4372 							    PMAP_NULL,
4373 							    PAGE_SIZE,
4374 							    0,
4375 							    prot);
4376 						}
4377 						copy_entry->needs_copy = FALSE;
4378 						copy_entry->is_shared = TRUE;
4379 						copy_object = VME_OBJECT(copy_entry);
4380 						copy_offset = VME_OFFSET(copy_entry);
4381 						vm_object_lock(copy_object);
4382 						/* we're about to make a shared mapping of this object */
4383 						copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4384 						copy_object->true_share = TRUE;
4385 						vm_object_unlock(copy_object);
4386 					}
4387 
4388 					if (copy_object != VM_OBJECT_NULL &&
4389 					    copy_object->named &&
4390 					    copy_object->pager != MEMORY_OBJECT_NULL &&
4391 					    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4392 						memory_object_t pager;
4393 						vm_prot_t       pager_prot;
4394 
4395 						/*
4396 						 * For "named" VM objects, let the pager know that the
4397 						 * memory object is being mapped.  Some pagers need to keep
4398 						 * track of this, to know when they can reclaim the memory
4399 						 * object, for example.
4400 						 * VM calls memory_object_map() for each mapping (specifying
4401 						 * the protection of each mapping) and calls
4402 						 * memory_object_last_unmap() when all the mappings are gone.
4403 						 */
4404 						pager_prot = max_protection;
4405 						if (copy) {
4406 							/*
4407 							 * Copy-On-Write mapping: won't modify the
4408 							 * memory object.
4409 							 */
4410 							pager_prot &= ~VM_PROT_WRITE;
4411 						}
4412 						vm_object_lock(copy_object);
4413 						pager = copy_object->pager;
4414 						if (copy_object->named &&
4415 						    pager != MEMORY_OBJECT_NULL &&
4416 						    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4417 							assert(copy_object->pager_ready);
4418 							vm_object_mapping_wait(copy_object, THREAD_UNINT);
4419 							vm_object_mapping_begin(copy_object);
4420 							vm_object_unlock(copy_object);
4421 
4422 							kr = memory_object_map(pager, pager_prot);
4423 							assert(kr == KERN_SUCCESS);
4424 
4425 							vm_object_lock(copy_object);
4426 							vm_object_mapping_end(copy_object);
4427 						}
4428 						vm_object_unlock(copy_object);
4429 					}
4430 
4431 					/*
4432 					 *	Perform the copy if requested
4433 					 */
4434 
4435 					if (copy && copy_object != VM_OBJECT_NULL) {
4436 						vm_object_t             new_object;
4437 						vm_object_offset_t      new_offset;
4438 
4439 						result = vm_object_copy_strategically(copy_object, copy_offset,
4440 						    copy_size,
4441 						    &new_object, &new_offset,
4442 						    &do_copy);
4443 
4444 
4445 						if (result == KERN_MEMORY_RESTART_COPY) {
4446 							boolean_t success;
4447 							boolean_t src_needs_copy;
4448 
4449 							/*
4450 							 * XXX
4451 							 * We currently ignore src_needs_copy.
4452 							 * This really is the issue of how to make
4453 							 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4454 							 * non-kernel users to use. Solution forthcoming.
4455 							 * In the meantime, since we don't allow non-kernel
4456 							 * memory managers to specify symmetric copy,
4457 							 * we won't run into problems here.
4458 							 */
4459 							new_object = copy_object;
4460 							new_offset = copy_offset;
4461 							success = vm_object_copy_quickly(new_object,
4462 							    new_offset,
4463 							    copy_size,
4464 							    &src_needs_copy,
4465 							    &do_copy);
4466 							assert(success);
4467 							result = KERN_SUCCESS;
4468 						}
4469 						if (result != KERN_SUCCESS) {
4470 							kr = result;
4471 							break;
4472 						}
4473 
4474 						copy_object = new_object;
4475 						copy_offset = new_offset;
4476 						/*
4477 						 * No extra object reference for the mapping:
4478 						 * the mapping should be the only thing keeping
4479 						 * this new object alive.
4480 						 */
4481 					} else {
4482 						/*
4483 						 * We already have the right object
4484 						 * to map.
4485 						 */
4486 						copy_object = VME_OBJECT(copy_entry);
4487 						/* take an extra ref for the mapping below */
4488 						vm_object_reference(copy_object);
4489 					}
4490 				}
4491 
4492 				/* over-map the object into destination */
4493 				remap_flags |= flags;
4494 				remap_flags |= VM_FLAGS_FIXED;
4495 				remap_flags |= VM_FLAGS_OVERWRITE;
4496 				remap_flags &= ~VM_FLAGS_ANYWHERE;
4497 				if (!copy && !copy_entry->is_sub_map) {
4498 					/*
4499 					 * copy-on-write should have been
4500 					 * resolved at this point, or we would
4501 					 * end up sharing instead of copying.
4502 					 */
4503 					assert(!copy_entry->needs_copy);
4504 				}
4505 #if XNU_TARGET_OS_OSX
4506 				if (copy_entry->used_for_jit) {
4507 					vmk_remap_flags.vmkf_map_jit = TRUE;
4508 				}
4509 #endif /* XNU_TARGET_OS_OSX */
4510 
4511 				assertf((copy_vm_alias & VME_ALIAS_MASK) == copy_vm_alias,
4512 				    "VM Tag truncated from 0x%x to 0x%x\n", copy_vm_alias, (copy_vm_alias & VME_ALIAS_MASK));
4513 				kr = vm_map_enter(target_map,
4514 				    &copy_addr,
4515 				    copy_size,
4516 				    (vm_map_offset_t) 0,
4517 				    remap_flags,
4518 				    vmk_remap_flags,
4519 				    (vm_tag_t) copy_vm_alias, /* see comment at end of vm_fault_unwire re. cast*/
4520 				    copy_object,
4521 				    copy_offset,
4522 				    ((copy_object == NULL)
4523 				    ? FALSE
4524 				    : (copy || copy_entry->needs_copy)),
4525 				    cur_protection,
4526 				    max_protection,
4527 				    inheritance);
4528 				if (kr != KERN_SUCCESS) {
4529 					DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4530 					if (copy_entry->is_sub_map) {
4531 						vm_map_deallocate(copy_submap);
4532 					} else {
4533 						vm_object_deallocate(copy_object);
4534 					}
4535 					/* abort */
4536 					break;
4537 				}
4538 
4539 				/* next mapping */
4540 				copy_addr += copy_size;
4541 			}
4542 
4543 			if (kr == KERN_SUCCESS) {
4544 				if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4545 				    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4546 					*address = map_addr + offset_in_mapping;
4547 				} else {
4548 					*address = map_addr;
4549 				}
4550 				if (overmap_start) {
4551 					*address += overmap_start;
4552 					DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address);
4553 				}
4554 			}
4555 			named_entry_unlock(named_entry);
4556 			if (target_copy_map != copy_map) {
4557 				vm_map_copy_discard(target_copy_map);
4558 				target_copy_map = VM_MAP_COPY_NULL;
4559 			}
4560 
4561 			if (kr != KERN_SUCCESS) {
4562 				if (!(flags & VM_FLAGS_OVERWRITE)) {
4563 					/* deallocate the contiguous range */
4564 					(void) vm_deallocate(target_map,
4565 					    map_addr,
4566 					    map_size);
4567 				}
4568 			}
4569 
4570 			return kr;
4571 		}
4572 
4573 		if (named_entry->is_object) {
4574 			unsigned int    access;
4575 			vm_prot_t       protections;
4576 			unsigned int    wimg_mode;
4577 
4578 			/* we are mapping a VM object */
4579 
4580 			protections = named_entry->protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
4581 			access = GET_MAP_MEM(named_entry->protection);
4582 
4583 			if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4584 			    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4585 				offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4586 				if (flags & VM_FLAGS_RETURN_4K_DATA_ADDR) {
4587 					offset_in_mapping &= ~((signed)(0xFFF));
4588 				}
4589 				offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4590 				map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map));
4591 			}
4592 
4593 			object = vm_named_entry_to_vm_object(named_entry);
4594 			assert(object != VM_OBJECT_NULL);
4595 			vm_object_lock(object);
4596 			named_entry_unlock(named_entry);
4597 
4598 			vm_object_reference_locked(object);
4599 
4600 			wimg_mode = object->wimg_bits;
4601 			vm_prot_to_wimg(access, &wimg_mode);
4602 			if (object->wimg_bits != wimg_mode) {
4603 				vm_object_change_wimg_mode(object, wimg_mode);
4604 			}
4605 
4606 			vm_object_unlock(object);
4607 		} else {
4608 			panic("invalid VM named entry %p", named_entry);
4609 		}
4610 	} else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4611 		/*
4612 		 * JMM - This is temporary until we unify named entries
4613 		 * and raw memory objects.
4614 		 *
4615 		 * Detected fake ip_kotype for a memory object.  In
4616 		 * this case, the port isn't really a port at all, but
4617 		 * instead is just a raw memory object.
4618 		 */
4619 		if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4620 		    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4621 			panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4622 		}
4623 
4624 		object = memory_object_to_vm_object((memory_object_t)port);
4625 		if (object == VM_OBJECT_NULL) {
4626 			return KERN_INVALID_OBJECT;
4627 		}
4628 		vm_object_reference(object);
4629 
4630 		/* wait for object (if any) to be ready */
4631 		if (object != VM_OBJECT_NULL) {
4632 			if (object == kernel_object) {
4633 				printf("Warning: Attempt to map kernel object"
4634 				    " by a non-private kernel entity\n");
4635 				return KERN_INVALID_OBJECT;
4636 			}
4637 			if (!object->pager_ready) {
4638 				vm_object_lock(object);
4639 
4640 				while (!object->pager_ready) {
4641 					vm_object_wait(object,
4642 					    VM_OBJECT_EVENT_PAGER_READY,
4643 					    THREAD_UNINT);
4644 					vm_object_lock(object);
4645 				}
4646 				vm_object_unlock(object);
4647 			}
4648 		}
4649 	} else {
4650 		return KERN_INVALID_OBJECT;
4651 	}
4652 
4653 	if (object != VM_OBJECT_NULL &&
4654 	    object->named &&
4655 	    object->pager != MEMORY_OBJECT_NULL &&
4656 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4657 		memory_object_t pager;
4658 		vm_prot_t       pager_prot;
4659 		kern_return_t   kr;
4660 
4661 		/*
4662 		 * For "named" VM objects, let the pager know that the
4663 		 * memory object is being mapped.  Some pagers need to keep
4664 		 * track of this, to know when they can reclaim the memory
4665 		 * object, for example.
4666 		 * VM calls memory_object_map() for each mapping (specifying
4667 		 * the protection of each mapping) and calls
4668 		 * memory_object_last_unmap() when all the mappings are gone.
4669 		 */
4670 		pager_prot = max_protection;
4671 		if (copy) {
4672 			/*
4673 			 * Copy-On-Write mapping: won't modify the
4674 			 * memory object.
4675 			 */
4676 			pager_prot &= ~VM_PROT_WRITE;
4677 		}
4678 		vm_object_lock(object);
4679 		pager = object->pager;
4680 		if (object->named &&
4681 		    pager != MEMORY_OBJECT_NULL &&
4682 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4683 			assert(object->pager_ready);
4684 			vm_object_mapping_wait(object, THREAD_UNINT);
4685 			vm_object_mapping_begin(object);
4686 			vm_object_unlock(object);
4687 
4688 			kr = memory_object_map(pager, pager_prot);
4689 			assert(kr == KERN_SUCCESS);
4690 
4691 			vm_object_lock(object);
4692 			vm_object_mapping_end(object);
4693 		}
4694 		vm_object_unlock(object);
4695 	}
4696 
4697 	/*
4698 	 *	Perform the copy if requested
4699 	 */
4700 
4701 	if (copy) {
4702 		vm_object_t             new_object;
4703 		vm_object_offset_t      new_offset;
4704 
4705 		result = vm_object_copy_strategically(object, offset,
4706 		    map_size,
4707 		    &new_object, &new_offset,
4708 		    &copy);
4709 
4710 
4711 		if (result == KERN_MEMORY_RESTART_COPY) {
4712 			boolean_t success;
4713 			boolean_t src_needs_copy;
4714 
4715 			/*
4716 			 * XXX
4717 			 * We currently ignore src_needs_copy.
4718 			 * This really is the issue of how to make
4719 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4720 			 * non-kernel users to use. Solution forthcoming.
4721 			 * In the meantime, since we don't allow non-kernel
4722 			 * memory managers to specify symmetric copy,
4723 			 * we won't run into problems here.
4724 			 */
4725 			new_object = object;
4726 			new_offset = offset;
4727 			success = vm_object_copy_quickly(new_object,
4728 			    new_offset,
4729 			    map_size,
4730 			    &src_needs_copy,
4731 			    &copy);
4732 			assert(success);
4733 			result = KERN_SUCCESS;
4734 		}
4735 		/*
4736 		 *	Throw away the reference to the
4737 		 *	original object, as it won't be mapped.
4738 		 */
4739 
4740 		vm_object_deallocate(object);
4741 
4742 		if (result != KERN_SUCCESS) {
4743 			return result;
4744 		}
4745 
4746 		object = new_object;
4747 		offset = new_offset;
4748 	}
4749 
4750 	/*
4751 	 * If non-kernel users want to try to prefault pages, the mapping and prefault
4752 	 * needs to be atomic.
4753 	 */
4754 	kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4755 	vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4756 
4757 #if __arm64__
4758 	if (fourk) {
4759 		/* map this object in a "4K" pager */
4760 		result = vm_map_enter_fourk(target_map,
4761 		    &map_addr,
4762 		    map_size,
4763 		    (vm_map_offset_t) mask,
4764 		    flags,
4765 		    vmk_flags,
4766 		    tag,
4767 		    object,
4768 		    offset,
4769 		    copy,
4770 		    cur_protection,
4771 		    max_protection,
4772 		    inheritance);
4773 	} else
4774 #endif /* __arm64__ */
4775 	{
4776 		result = vm_map_enter(target_map,
4777 		    &map_addr, map_size,
4778 		    (vm_map_offset_t)mask,
4779 		    flags,
4780 		    vmk_flags,
4781 		    tag,
4782 		    object, offset,
4783 		    copy,
4784 		    cur_protection, max_protection,
4785 		    inheritance);
4786 	}
4787 	if (result != KERN_SUCCESS) {
4788 		vm_object_deallocate(object);
4789 	}
4790 
4791 	/*
4792 	 * Try to prefault, and do not forget to release the vm map lock.
4793 	 */
4794 	if (result == KERN_SUCCESS && try_prefault) {
4795 		mach_vm_address_t va = map_addr;
4796 		kern_return_t kr = KERN_SUCCESS;
4797 		unsigned int i = 0;
4798 		int pmap_options;
4799 
4800 		pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4801 		if (object->internal) {
4802 			pmap_options |= PMAP_OPTIONS_INTERNAL;
4803 		}
4804 
4805 		for (i = 0; i < page_list_count; ++i) {
4806 			if (!UPL_VALID_PAGE(page_list, i)) {
4807 				if (kernel_prefault) {
4808 					assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4809 					result = KERN_MEMORY_ERROR;
4810 					break;
4811 				}
4812 			} else {
4813 				/*
4814 				 * If this function call failed, we should stop
4815 				 * trying to optimize, other calls are likely
4816 				 * going to fail too.
4817 				 *
4818 				 * We are not gonna report an error for such
4819 				 * failure though. That's an optimization, not
4820 				 * something critical.
4821 				 */
4822 				kr = pmap_enter_options(target_map->pmap,
4823 				    va, UPL_PHYS_PAGE(page_list, i),
4824 				    cur_protection, VM_PROT_NONE,
4825 				    0, TRUE, pmap_options, NULL);
4826 				if (kr != KERN_SUCCESS) {
4827 					OSIncrementAtomic64(&vm_prefault_nb_bailout);
4828 					if (kernel_prefault) {
4829 						result = kr;
4830 					}
4831 					break;
4832 				}
4833 				OSIncrementAtomic64(&vm_prefault_nb_pages);
4834 			}
4835 
4836 			/* Next virtual address */
4837 			va += PAGE_SIZE;
4838 		}
4839 		if (vmk_flags.vmkf_keep_map_locked) {
4840 			vm_map_unlock(target_map);
4841 		}
4842 	}
4843 
4844 	if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4845 	    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4846 		*address = map_addr + offset_in_mapping;
4847 	} else {
4848 		*address = map_addr;
4849 	}
4850 	return result;
4851 }
4852 
4853 kern_return_t
vm_map_enter_mem_object(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t tag,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)4854 vm_map_enter_mem_object(
4855 	vm_map_t                target_map,
4856 	vm_map_offset_t         *address,
4857 	vm_map_size_t           initial_size,
4858 	vm_map_offset_t         mask,
4859 	int                     flags,
4860 	vm_map_kernel_flags_t   vmk_flags,
4861 	vm_tag_t                tag,
4862 	ipc_port_t              port,
4863 	vm_object_offset_t      offset,
4864 	boolean_t               copy,
4865 	vm_prot_t               cur_protection,
4866 	vm_prot_t               max_protection,
4867 	vm_inherit_t            inheritance)
4868 {
4869 	kern_return_t ret;
4870 
4871 	ret = vm_map_enter_mem_object_helper(target_map,
4872 	    address,
4873 	    initial_size,
4874 	    mask,
4875 	    flags,
4876 	    vmk_flags,
4877 	    tag,
4878 	    port,
4879 	    offset,
4880 	    copy,
4881 	    cur_protection,
4882 	    max_protection,
4883 	    inheritance,
4884 	    NULL,
4885 	    0);
4886 
4887 #if KASAN
4888 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4889 		kasan_notify_address(*address, initial_size);
4890 	}
4891 #endif
4892 
4893 	return ret;
4894 }
4895 
4896 kern_return_t
vm_map_enter_mem_object_prefault(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t tag,ipc_port_t port,vm_object_offset_t offset,vm_prot_t cur_protection,vm_prot_t max_protection,upl_page_list_ptr_t page_list,unsigned int page_list_count)4897 vm_map_enter_mem_object_prefault(
4898 	vm_map_t                target_map,
4899 	vm_map_offset_t         *address,
4900 	vm_map_size_t           initial_size,
4901 	vm_map_offset_t         mask,
4902 	int                     flags,
4903 	vm_map_kernel_flags_t   vmk_flags,
4904 	vm_tag_t                tag,
4905 	ipc_port_t              port,
4906 	vm_object_offset_t      offset,
4907 	vm_prot_t               cur_protection,
4908 	vm_prot_t               max_protection,
4909 	upl_page_list_ptr_t     page_list,
4910 	unsigned int            page_list_count)
4911 {
4912 	kern_return_t ret;
4913 
4914 	ret = vm_map_enter_mem_object_helper(target_map,
4915 	    address,
4916 	    initial_size,
4917 	    mask,
4918 	    flags,
4919 	    vmk_flags,
4920 	    tag,
4921 	    port,
4922 	    offset,
4923 	    FALSE,
4924 	    cur_protection,
4925 	    max_protection,
4926 	    VM_INHERIT_DEFAULT,
4927 	    page_list,
4928 	    page_list_count);
4929 
4930 #if KASAN
4931 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4932 		kasan_notify_address(*address, initial_size);
4933 	}
4934 #endif
4935 
4936 	return ret;
4937 }
4938 
4939 
/*
 * vm_map_enter_mem_object_control:
 *
 * Map the VM object backing the memory object "control" into
 * "target_map".  This is the memory_object_control_t flavor of
 * vm_map_enter_mem_object(): the object is obtained directly from the
 * control instead of being resolved from a port.
 *
 * On success, *address is set to the actual mapping address and
 * KERN_SUCCESS is returned.  On failure, the object reference taken
 * here is dropped and an error is returned.
 */
kern_return_t
vm_map_enter_mem_object_control(
	vm_map_t                target_map,
	vm_map_offset_t         *address,
	vm_map_size_t           initial_size,
	vm_map_offset_t         mask,
	int                     flags,
	vm_map_kernel_flags_t   vmk_flags,
	vm_tag_t                tag,
	memory_object_control_t control,
	vm_object_offset_t      offset,
	boolean_t               copy,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance)
{
	vm_map_address_t        map_addr;
	vm_map_size_t           map_size;
	vm_object_t             object;
	vm_object_size_t        size;
	kern_return_t           result;
	memory_object_t         pager;
	vm_prot_t               pager_prot;
	kern_return_t           kr;
#if __arm64__
	boolean_t               fourk = vmk_flags.vmkf_fourk;
#endif /* __arm64__ */

	/*
	 * Check arguments for validity
	 */
	if ((target_map == VM_MAP_NULL) ||
	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
	    (inheritance > VM_INHERIT_LAST_VALID) ||
	    initial_size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

#if __arm64__
	/* "fourk" is pointless if the map already uses sub-PAGE_SIZE pages */
	if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) {
		fourk = FALSE;
	}

	if (fourk) {
		map_addr = vm_map_trunc_page(*address,
		    FOURK_PAGE_MASK);
		map_size = vm_map_round_page(initial_size,
		    FOURK_PAGE_MASK);
	} else
#endif /* __arm64__ */
	{
		map_addr = vm_map_trunc_page(*address,
		    VM_MAP_PAGE_MASK(target_map));
		map_size = vm_map_round_page(initial_size,
		    VM_MAP_PAGE_MASK(target_map));
	}
	/* object-page-rounded size, used for the copy operations below */
	size = vm_object_round_page(initial_size);

	object = memory_object_control_to_vm_object(control);

	if (object == VM_OBJECT_NULL) {
		return KERN_INVALID_OBJECT;
	}

	if (object == kernel_object) {
		printf("Warning: Attempt to map kernel object"
		    " by a non-private kernel entity\n");
		return KERN_INVALID_OBJECT;
	}

	vm_object_lock(object);
	/*
	 * Take a reference on the object while holding its lock.
	 * NOTE(review): raw ref_count++ rather than
	 * vm_object_reference_locked() -- presumably equivalent here;
	 * the reference is consumed by the mapping on success and
	 * explicitly dropped on every failure path below.  Confirm.
	 */
	object->ref_count++;

	/*
	 * For "named" VM objects, let the pager know that the
	 * memory object is being mapped.  Some pagers need to keep
	 * track of this, to know when they can reclaim the memory
	 * object, for example.
	 * VM calls memory_object_map() for each mapping (specifying
	 * the protection of each mapping) and calls
	 * memory_object_last_unmap() when all the mappings are gone.
	 */
	pager_prot = max_protection;
	if (copy) {
		/* copy-on-write mapping: the pager will never see writes */
		pager_prot &= ~VM_PROT_WRITE;
	}
	pager = object->pager;
	if (object->named &&
	    pager != MEMORY_OBJECT_NULL &&
	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
		assert(object->pager_ready);
		vm_object_mapping_wait(object, THREAD_UNINT);
		vm_object_mapping_begin(object);
		vm_object_unlock(object);

		/* notify the pager outside the object lock */
		kr = memory_object_map(pager, pager_prot);
		assert(kr == KERN_SUCCESS);

		vm_object_lock(object);
		vm_object_mapping_end(object);
	}
	vm_object_unlock(object);

	/*
	 *	Perform the copy if requested
	 */

	if (copy) {
		vm_object_t             new_object;
		vm_object_offset_t      new_offset;

		result = vm_object_copy_strategically(object, offset, size,
		    &new_object, &new_offset,
		    &copy);


		if (result == KERN_MEMORY_RESTART_COPY) {
			boolean_t success;
			boolean_t src_needs_copy;

			/*
			 * XXX
			 * We currently ignore src_needs_copy.
			 * This really is the issue of how to make
			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
			 * non-kernel users to use. Solution forthcoming.
			 * In the meantime, since we don't allow non-kernel
			 * memory managers to specify symmetric copy,
			 * we won't run into problems here.
			 */
			new_object = object;
			new_offset = offset;
			success = vm_object_copy_quickly(new_object,
			    new_offset, size,
			    &src_needs_copy,
			    &copy);
			assert(success);
			result = KERN_SUCCESS;
		}
		/*
		 *	Throw away the reference to the
		 *	original object, as it won't be mapped.
		 */

		vm_object_deallocate(object);

		if (result != KERN_SUCCESS) {
			return result;
		}

		/* map the copy instead of the original */
		object = new_object;
		offset = new_offset;
	}

#if __arm64__
	if (fourk) {
		result = vm_map_enter_fourk(target_map,
		    &map_addr,
		    map_size,
		    (vm_map_offset_t)mask,
		    flags,
		    vmk_flags,
		    tag,
		    object, offset,
		    copy,
		    cur_protection, max_protection,
		    inheritance);
	} else
#endif /* __arm64__ */
	{
		result = vm_map_enter(target_map,
		    &map_addr, map_size,
		    (vm_map_offset_t)mask,
		    flags,
		    vmk_flags,
		    tag,
		    object, offset,
		    copy,
		    cur_protection, max_protection,
		    inheritance);
	}
	if (result != KERN_SUCCESS) {
		/* drop the reference taken above: the object won't be mapped */
		vm_object_deallocate(object);
	}
	*address = map_addr;

	return result;
}
5129 
5130 
5131 #if     VM_CPM
5132 
5133 #ifdef MACH_ASSERT
5134 extern pmap_paddr_t     avail_start, avail_end;
5135 #endif
5136 
5137 /*
5138  *	Allocate memory in the specified map, with the caveat that
5139  *	the memory is physically contiguous.  This call may fail
5140  *	if the system can't find sufficient contiguous memory.
5141  *	This call may cause or lead to heart-stopping amounts of
5142  *	paging activity.
5143  *
5144  *	Memory obtained from this call should be freed in the
5145  *	normal way, viz., via vm_deallocate.
5146  */
5147 kern_return_t
vm_map_enter_cpm(vm_map_t map,vm_map_offset_t * addr,vm_map_size_t size,int flags,vm_map_kernel_flags_t vmk_flags)5148 vm_map_enter_cpm(
5149 	vm_map_t                map,
5150 	vm_map_offset_t        *addr,
5151 	vm_map_size_t           size,
5152 	int                     flags,
5153 	vm_map_kernel_flags_t   vmk_flags)
5154 {
5155 	vm_object_t             cpm_obj;
5156 	pmap_t                  pmap;
5157 	vm_page_t               m, pages;
5158 	kern_return_t           kr;
5159 	vm_map_offset_t         va, start, end, offset;
5160 #if     MACH_ASSERT
5161 	vm_map_offset_t         prev_addr = 0;
5162 #endif  /* MACH_ASSERT */
5163 
5164 	boolean_t               anywhere = ((VM_FLAGS_ANYWHERE & flags) != 0);
5165 	vm_tag_t tag;
5166 
5167 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
5168 		/* XXX TODO4K do we need to support this? */
5169 		*addr = 0;
5170 		return KERN_NOT_SUPPORTED;
5171 	}
5172 
5173 	VM_GET_FLAGS_ALIAS(flags, tag);
5174 
5175 	if (size == 0) {
5176 		*addr = 0;
5177 		return KERN_SUCCESS;
5178 	}
5179 	if (anywhere) {
5180 		*addr = vm_map_min(map);
5181 	} else {
5182 		*addr = vm_map_trunc_page(*addr,
5183 		    VM_MAP_PAGE_MASK(map));
5184 	}
5185 	size = vm_map_round_page(size,
5186 	    VM_MAP_PAGE_MASK(map));
5187 
5188 	/*
5189 	 * LP64todo - cpm_allocate should probably allow
5190 	 * allocations of >4GB, but not with the current
5191 	 * algorithm, so just cast down the size for now.
5192 	 */
5193 	if (size > VM_MAX_ADDRESS) {
5194 		return KERN_RESOURCE_SHORTAGE;
5195 	}
5196 	if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size),
5197 	    &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) {
5198 		return kr;
5199 	}
5200 
5201 	cpm_obj = vm_object_allocate((vm_object_size_t)size);
5202 	assert(cpm_obj != VM_OBJECT_NULL);
5203 	assert(cpm_obj->internal);
5204 	assert(cpm_obj->vo_size == (vm_object_size_t)size);
5205 	assert(cpm_obj->can_persist == FALSE);
5206 	assert(cpm_obj->pager_created == FALSE);
5207 	assert(cpm_obj->pageout == FALSE);
5208 	assert(cpm_obj->shadow == VM_OBJECT_NULL);
5209 
5210 	/*
5211 	 *	Insert pages into object.
5212 	 */
5213 
5214 	vm_object_lock(cpm_obj);
5215 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
5216 		m = pages;
5217 		pages = NEXT_PAGE(m);
5218 		*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
5219 
5220 		assert(!m->vmp_gobbled);
5221 		assert(!m->vmp_wanted);
5222 		assert(!m->vmp_pageout);
5223 		assert(!m->vmp_tabled);
5224 		assert(VM_PAGE_WIRED(m));
5225 		assert(m->vmp_busy);
5226 		assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT));
5227 
5228 		m->vmp_busy = FALSE;
5229 		vm_page_insert(m, cpm_obj, offset);
5230 	}
5231 	assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
5232 	vm_object_unlock(cpm_obj);
5233 
5234 	/*
5235 	 *	Hang onto a reference on the object in case a
5236 	 *	multi-threaded application for some reason decides
5237 	 *	to deallocate the portion of the address space into
5238 	 *	which we will insert this object.
5239 	 *
5240 	 *	Unfortunately, we must insert the object now before
5241 	 *	we can talk to the pmap module about which addresses
5242 	 *	must be wired down.  Hence, the race with a multi-
5243 	 *	threaded app.
5244 	 */
5245 	vm_object_reference(cpm_obj);
5246 
5247 	/*
5248 	 *	Insert object into map.
5249 	 */
5250 
5251 	kr = vm_map_enter(
5252 		map,
5253 		addr,
5254 		size,
5255 		(vm_map_offset_t)0,
5256 		flags,
5257 		vmk_flags,
5258 		cpm_obj,
5259 		(vm_object_offset_t)0,
5260 		FALSE,
5261 		VM_PROT_ALL,
5262 		VM_PROT_ALL,
5263 		VM_INHERIT_DEFAULT);
5264 
5265 	if (kr != KERN_SUCCESS) {
5266 		/*
5267 		 *	A CPM object doesn't have can_persist set,
5268 		 *	so all we have to do is deallocate it to
5269 		 *	free up these pages.
5270 		 */
5271 		assert(cpm_obj->pager_created == FALSE);
5272 		assert(cpm_obj->can_persist == FALSE);
5273 		assert(cpm_obj->pageout == FALSE);
5274 		assert(cpm_obj->shadow == VM_OBJECT_NULL);
5275 		vm_object_deallocate(cpm_obj); /* kill acquired ref */
5276 		vm_object_deallocate(cpm_obj); /* kill creation ref */
5277 	}
5278 
5279 	/*
5280 	 *	Inform the physical mapping system that the
5281 	 *	range of addresses may not fault, so that
5282 	 *	page tables and such can be locked down as well.
5283 	 */
5284 	start = *addr;
5285 	end = start + size;
5286 	pmap = vm_map_pmap(map);
5287 	pmap_pageable(pmap, start, end, FALSE);
5288 
5289 	/*
5290 	 *	Enter each page into the pmap, to avoid faults.
5291 	 *	Note that this loop could be coded more efficiently,
5292 	 *	if the need arose, rather than looking up each page
5293 	 *	again.
5294 	 */
5295 	for (offset = 0, va = start; offset < size;
5296 	    va += PAGE_SIZE, offset += PAGE_SIZE) {
5297 		int type_of_fault;
5298 
5299 		vm_object_lock(cpm_obj);
5300 		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5301 		assert(m != VM_PAGE_NULL);
5302 
5303 		vm_page_zero_fill(m);
5304 
5305 		type_of_fault = DBG_ZERO_FILL_FAULT;
5306 
5307 		vm_fault_enter(m, pmap, va,
5308 		    PAGE_SIZE, 0,
5309 		    VM_PROT_ALL, VM_PROT_WRITE,
5310 		    VM_PAGE_WIRED(m),
5311 		    FALSE,                             /* change_wiring */
5312 		    VM_KERN_MEMORY_NONE,                             /* tag - not wiring */
5313 		    FALSE,                             /* no_cache */
5314 		    FALSE,                             /* cs_bypass */
5315 		    0,                                 /* user_tag */
5316 		    0,                             /* pmap_options */
5317 		    NULL,                              /* need_retry */
5318 		    &type_of_fault);
5319 
5320 		vm_object_unlock(cpm_obj);
5321 	}
5322 
5323 #if     MACH_ASSERT
5324 	/*
5325 	 *	Verify ordering in address space.
5326 	 */
5327 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
5328 		vm_object_lock(cpm_obj);
5329 		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5330 		vm_object_unlock(cpm_obj);
5331 		if (m == VM_PAGE_NULL) {
5332 			panic("vm_allocate_cpm:  obj %p off 0x%llx no page",
5333 			    cpm_obj, (uint64_t)offset);
5334 		}
5335 		assert(m->vmp_tabled);
5336 		assert(!m->vmp_busy);
5337 		assert(!m->vmp_wanted);
5338 		assert(!m->vmp_fictitious);
5339 		assert(!m->vmp_private);
5340 		assert(!m->vmp_absent);
5341 		assert(!m->vmp_cleaning);
5342 		assert(!m->vmp_laundry);
5343 		assert(!m->vmp_precious);
5344 		assert(!m->vmp_clustered);
5345 		if (offset != 0) {
5346 			if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
5347 				printf("start 0x%llx end 0x%llx va 0x%llx\n",
5348 				    (uint64_t)start, (uint64_t)end, (uint64_t)va);
5349 				printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset);
5350 				printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr);
5351 				panic("vm_allocate_cpm:  pages not contig!");
5352 			}
5353 		}
5354 		prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
5355 	}
5356 #endif  /* MACH_ASSERT */
5357 
5358 	vm_object_deallocate(cpm_obj); /* kill extra ref */
5359 
5360 	return kr;
5361 }
5362 
5363 
5364 #else   /* VM_CPM */
5365 
5366 /*
5367  *	Interface is defined in all cases, but unless the kernel
5368  *	is built explicitly for this option, the interface does
5369  *	nothing.
5370  */
5371 
kern_return_t
vm_map_enter_cpm(
	__unused vm_map_t                map,
	__unused vm_map_offset_t        *addr,
	__unused vm_map_size_t           size,
	__unused int                     flags,
	__unused vm_map_kernel_flags_t   vmk_flags)
{
	/* VM_CPM not configured: contiguous allocations are unsupported */
	return KERN_FAILURE;
}
5382 #endif /* VM_CPM */
5383 
5384 /* Not used without nested pmaps */
5385 #ifndef NO_NESTED_PMAP
5386 /*
5387  * Clip and unnest a portion of a nested submap mapping.
5388  */
5389 
5390 
/*
 * Clip "entry" (a nested submap mapping, with use_pmap set) so that it
 * covers exactly [start_unnest, end_unnest), then undo the pmap-level
 * nesting for that range.  The map must be locked by the caller.
 */
static void
vm_map_clip_unnest(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t start_unnest,
	vm_map_offset_t end_unnest)
{
	vm_map_offset_t old_start_unnest = start_unnest;
	vm_map_offset_t old_end_unnest = end_unnest;

	assert(entry->is_sub_map);
	assert(VME_SUBMAP(entry) != NULL);
	assert(entry->use_pmap);

	/*
	 * Query the platform for the optimal unnest range.
	 * DRK: There's some duplication of effort here, since
	 * callers may have adjusted the range to some extent. This
	 * routine was introduced to support 1GiB subtree nesting
	 * for x86 platforms, which can also nest on 2MiB boundaries
	 * depending on size/alignment.
	 */
	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
		assert(VME_SUBMAP(entry)->is_nested_map);
		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
		/* record that the platform had to widen the requested range */
		log_unnest_badness(map,
		    old_start_unnest,
		    old_end_unnest,
		    VME_SUBMAP(entry)->is_nested_map,
		    (entry->vme_start +
		    VME_SUBMAP(entry)->lowest_unnestable_start -
		    VME_OFFSET(entry)));
	}

	if (entry->vme_start > start_unnest ||
	    entry->vme_end < end_unnest) {
		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
		    "bad nested entry: start=0x%llx end=0x%llx\n",
		    (long long)start_unnest, (long long)end_unnest,
		    (long long)entry->vme_start, (long long)entry->vme_end);
	}

	/* trim the entry so it covers exactly the unnest range */
	if (start_unnest > entry->vme_start) {
		_vm_map_clip_start(&map->hdr,
		    entry,
		    start_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
	if (entry->vme_end > end_unnest) {
		_vm_map_clip_end(&map->hdr,
		    entry,
		    end_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}

	/* drop the shared (nested) translations for this range */
	pmap_unnest(map->pmap,
	    entry->vme_start,
	    entry->vme_end - entry->vme_start);
	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
		/* clean up parent map/maps */
		vm_map_submap_pmap_clean(
			map, entry->vme_start,
			entry->vme_end,
			VME_SUBMAP(entry),
			VME_OFFSET(entry));
	}
	/* the entry no longer shares the submap's page tables */
	entry->use_pmap = FALSE;
	if ((map->pmap != kernel_pmap) &&
	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
		/* reflect the loss of pmap sharing in the entry's alias tag */
		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
	}
}
5471 #endif  /* NO_NESTED_PMAP */
5472 
/*
 * Panic path for attempts to clip a VM map entry marked vme_atomic.
 * Atomic entries must never be split; the clipping routines check
 * vme_atomic and land here when the invariant is violated.
 */
__abortlike
static void
__vm_map_clip_atomic_entry_panic(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t where)
{
	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end,
	    (uint64_t)where);
}
5486 
5487 /*
5488  *	vm_map_clip_start:	[ internal use only ]
5489  *
5490  *	Asserts that the given entry begins at or after
5491  *	the specified address; if necessary,
5492  *	it splits the entry into two.
5493  */
void
vm_map_clip_start(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t startaddr)
{
#ifndef NO_NESTED_PMAP
	if (entry->is_sub_map &&
	    entry->use_pmap &&
	    startaddr >= entry->vme_start) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure "startaddr" is no longer in a nested range
		 * before we clip.  Unnest only the minimum range the platform
		 * can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (startaddr > entry->vme_start) {
		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    VME_OBJECT(entry)->phys_contiguous) {
			/*
			 * NOTE(review): phys_contiguous mappings are removed
			 * from the pmap over the entire entry range before
			 * splitting -- presumably they cannot exist as a
			 * partial mapping; confirm against the pmap layer.
			 */
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		if (entry->vme_atomic) {
			/* atomic entries must never be split */
			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
		}

		DTRACE_VM5(
			vm_map_clip_start,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, startaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_start(&map->hdr, entry, startaddr);
		/* refresh the first-free / hole-list bookkeeping after the split */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5546 
5547 
/*
 * Clip the start of "entry" in a vm_map_copy's entry list to "startaddr",
 * splitting the entry only when the address falls strictly inside it.
 */
#define vm_map_copy_clip_start(copy, entry, startaddr) \
	MACRO_BEGIN \
	if ((startaddr) > (entry)->vme_start) \
	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
	MACRO_END
5553 
5554 /*
5555  *	This routine is called only when it is known that
5556  *	the entry must be split.
5557  */
static void
_vm_map_clip_start(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         start)
{
	vm_map_entry_t  new_entry;

	/*
	 *	Split off the front portion --
	 *	note that we must insert the new
	 *	entry BEFORE this one, so that
	 *	this entry has the specified starting
	 *	address.
	 */

	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(start,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	/* new entry keeps [vme_start, start); old entry becomes [start, vme_end) */
	new_entry->vme_end = start;
	assert(new_entry->vme_start < new_entry->vme_end);
	/* advance the old entry's object offset to match its new start */
	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
	assert(start < entry->vme_end);
	entry->vme_start = start;

	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);

	/* both halves now reference the same backing submap/object */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5596 
5597 
5598 /*
5599  *	vm_map_clip_end:	[ internal use only ]
5600  *
5601  *	Asserts that the given entry ends at or before
5602  *	the specified address; if necessary,
5603  *	it splits the entry into two.
5604  */
void
vm_map_clip_end(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t endaddr)
{
	if (endaddr > entry->vme_end) {
		/*
		 * Within the scope of this clipping, limit "endaddr" to
		 * the end of this map entry...
		 */
		endaddr = entry->vme_end;
	}
#ifndef NO_NESTED_PMAP
	if (entry->is_sub_map && entry->use_pmap) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure the range between the start of this entry and
		 * the new "endaddr" is no longer nested before we clip.
		 * Unnest only the minimum range the platform can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = entry->vme_start;
		/* round "endaddr" up to a shared-region boundary */
		end_unnest =
		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
		    ~(pmap_shared_region_size_min(map->pmap) - 1);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (endaddr < entry->vme_end) {
		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    VME_OBJECT(entry)->phys_contiguous) {
			/*
			 * NOTE(review): phys_contiguous mappings are removed
			 * from the pmap over the entire entry range before
			 * splitting -- presumably they cannot exist as a
			 * partial mapping; confirm against the pmap layer.
			 */
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		if (entry->vme_atomic) {
			/* atomic entries must never be split */
			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
		}
		DTRACE_VM5(
			vm_map_clip_end,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, endaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_end(&map->hdr, entry, endaddr);
		/* refresh the first-free / hole-list bookkeeping after the split */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5663 
5664 
/*
 * Clip the end of "entry" in a vm_map_copy's entry list to "endaddr",
 * splitting the entry only when the address falls strictly inside it.
 */
#define vm_map_copy_clip_end(copy, entry, endaddr) \
	MACRO_BEGIN \
	if ((endaddr) < (entry)->vme_end) \
	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
	MACRO_END
5670 
5671 /*
5672  *	This routine is called only when it is known that
5673  *	the entry must be split.
5674  */
5675 static void
_vm_map_clip_end(struct vm_map_header * map_header,vm_map_entry_t entry,vm_map_offset_t end)5676 _vm_map_clip_end(
5677 	struct vm_map_header    *map_header,
5678 	vm_map_entry_t          entry,
5679 	vm_map_offset_t         end)
5680 {
5681 	vm_map_entry_t  new_entry;
5682 
5683 	/*
5684 	 *	Create a new entry and insert it
5685 	 *	AFTER the specified entry
5686 	 */
5687 
5688 	if (entry->map_aligned) {
5689 		assert(VM_MAP_PAGE_ALIGNED(end,
5690 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5691 	}
5692 
5693 	new_entry = _vm_map_entry_create(map_header);
5694 	vm_map_entry_copy_full(new_entry, entry);
5695 
5696 	assert(entry->vme_start < end);
5697 	new_entry->vme_start = entry->vme_end = end;
5698 	VME_OFFSET_SET(new_entry,
5699 	    VME_OFFSET(new_entry) + (end - entry->vme_start));
5700 	assert(new_entry->vme_start < new_entry->vme_end);
5701 
5702 	_vm_map_store_entry_link(map_header, entry, new_entry);
5703 
5704 	if (entry->is_sub_map) {
5705 		vm_map_reference(VME_SUBMAP(new_entry));
5706 	} else {
5707 		vm_object_reference(VME_OBJECT(new_entry));
5708 	}
5709 }
5710 
5711 
5712 /*
5713  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
5714  *
5715  *	Asserts that the starting and ending region
5716  *	addresses fall within the valid range of the map.
5717  */
5718 #define VM_MAP_RANGE_CHECK(map, start, end)     \
5719 	MACRO_BEGIN                             \
5720 	if (start < vm_map_min(map))            \
5721 	        start = vm_map_min(map);        \
5722 	if (end > vm_map_max(map))              \
5723 	        end = vm_map_max(map);          \
5724 	if (start > end)                        \
5725 	        start = end;                    \
5726 	MACRO_END
5727 
5728 /*
5729  *	vm_map_range_check:	[ internal use only ]
5730  *
5731  *	Check that the region defined by the specified start and
5732  *	end addresses are wholly contained within a single map
5733  *	entry or set of adjacent map entries of the spacified map,
5734  *	i.e. the specified region contains no unmapped space.
5735  *	If any or all of the region is unmapped, FALSE is returned.
5736  *	Otherwise, TRUE is returned and if the output argument 'entry'
5737  *	is not NULL it points to the map entry containing the start
5738  *	of the region.
5739  *
5740  *	The map is locked for reading on entry and is left locked.
5741  */
5742 static boolean_t
vm_map_range_check(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t * entry)5743 vm_map_range_check(
5744 	vm_map_t                map,
5745 	vm_map_offset_t         start,
5746 	vm_map_offset_t         end,
5747 	vm_map_entry_t          *entry)
5748 {
5749 	vm_map_entry_t          cur;
5750 	vm_map_offset_t         prev;
5751 
5752 	/*
5753 	 *      Basic sanity checks first
5754 	 */
5755 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5756 		return FALSE;
5757 	}
5758 
5759 	/*
5760 	 *      Check first if the region starts within a valid
5761 	 *	mapping for the map.
5762 	 */
5763 	if (!vm_map_lookup_entry(map, start, &cur)) {
5764 		return FALSE;
5765 	}
5766 
5767 	/*
5768 	 *	Optimize for the case that the region is contained
5769 	 *	in a single map entry.
5770 	 */
5771 	if (entry != (vm_map_entry_t *) NULL) {
5772 		*entry = cur;
5773 	}
5774 	if (end <= cur->vme_end) {
5775 		return TRUE;
5776 	}
5777 
5778 	/*
5779 	 *      If the region is not wholly contained within a
5780 	 *      single entry, walk the entries looking for holes.
5781 	 */
5782 	prev = cur->vme_end;
5783 	cur = cur->vme_next;
5784 	while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5785 		if (end <= cur->vme_end) {
5786 			return TRUE;
5787 		}
5788 		prev = cur->vme_end;
5789 		cur = cur->vme_next;
5790 	}
5791 	return FALSE;
5792 }
5793 
5794 /*
5795  *	vm_map_protect:
5796  *
5797  *	Sets the protection of the specified address
5798  *	region in the target map.  If "set_max" is
5799  *	specified, the maximum protection is to be set;
5800  *	otherwise, only the current protection is affected.
5801  */
5802 kern_return_t
vm_map_protect(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t new_prot,boolean_t set_max)5803 vm_map_protect(
5804 	vm_map_t        map,
5805 	vm_map_offset_t start,
5806 	vm_map_offset_t end,
5807 	vm_prot_t       new_prot,
5808 	boolean_t       set_max)
5809 {
5810 	vm_map_entry_t                  current;
5811 	vm_map_offset_t                 prev;
5812 	vm_map_entry_t                  entry;
5813 	vm_prot_t                       new_max;
5814 	int                             pmap_options = 0;
5815 	kern_return_t                   kr;
5816 
5817 	if (new_prot & VM_PROT_COPY) {
5818 		vm_map_offset_t         new_start;
5819 		vm_prot_t               cur_prot, max_prot;
5820 		vm_map_kernel_flags_t   kflags;
5821 
5822 		/* LP64todo - see below */
5823 		if (start >= map->max_offset) {
5824 			return KERN_INVALID_ADDRESS;
5825 		}
5826 
5827 		if ((new_prot & VM_PROT_ALLEXEC) &&
5828 		    map->pmap != kernel_pmap &&
5829 		    (vm_map_cs_enforcement(map)
5830 #if XNU_TARGET_OS_OSX && __arm64__
5831 		    || !VM_MAP_IS_EXOTIC(map)
5832 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
5833 		    ) &&
5834 		    VM_MAP_POLICY_WX_FAIL(map)) {
5835 			DTRACE_VM3(cs_wx,
5836 			    uint64_t, (uint64_t) start,
5837 			    uint64_t, (uint64_t) end,
5838 			    vm_prot_t, new_prot);
5839 			printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
5840 			    proc_selfpid(),
5841 			    (get_bsdtask_info(current_task())
5842 			    ? proc_name_address(get_bsdtask_info(current_task()))
5843 			    : "?"),
5844 			    __FUNCTION__);
5845 			return KERN_PROTECTION_FAILURE;
5846 		}
5847 
5848 		/*
5849 		 * Let vm_map_remap_extract() know that it will need to:
5850 		 * + make a copy of the mapping
5851 		 * + add VM_PROT_WRITE to the max protections
5852 		 * + remove any protections that are no longer allowed from the
5853 		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
5854 		 *   example).
5855 		 * Note that "max_prot" is an IN/OUT parameter only for this
5856 		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
5857 		 * only.
5858 		 */
5859 		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
5860 		cur_prot = VM_PROT_NONE;
5861 		kflags = VM_MAP_KERNEL_FLAGS_NONE;
5862 		kflags.vmkf_remap_prot_copy = TRUE;
5863 		new_start = start;
5864 		kr = vm_map_remap(map,
5865 		    &new_start,
5866 		    end - start,
5867 		    0, /* mask */
5868 		    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
5869 		    kflags,
5870 		    0,
5871 		    map,
5872 		    start,
5873 		    TRUE, /* copy-on-write remapping! */
5874 		    &cur_prot, /* IN/OUT */
5875 		    &max_prot, /* IN/OUT */
5876 		    VM_INHERIT_DEFAULT);
5877 		if (kr != KERN_SUCCESS) {
5878 			return kr;
5879 		}
5880 		new_prot &= ~VM_PROT_COPY;
5881 	}
5882 
5883 	vm_map_lock(map);
5884 
5885 	/* LP64todo - remove this check when vm_map_commpage64()
5886 	 * no longer has to stuff in a map_entry for the commpage
5887 	 * above the map's max_offset.
5888 	 */
5889 	if (start >= map->max_offset) {
5890 		vm_map_unlock(map);
5891 		return KERN_INVALID_ADDRESS;
5892 	}
5893 
5894 	while (1) {
5895 		/*
5896 		 *      Lookup the entry.  If it doesn't start in a valid
5897 		 *	entry, return an error.
5898 		 */
5899 		if (!vm_map_lookup_entry(map, start, &entry)) {
5900 			vm_map_unlock(map);
5901 			return KERN_INVALID_ADDRESS;
5902 		}
5903 
5904 		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
5905 			start = SUPERPAGE_ROUND_DOWN(start);
5906 			continue;
5907 		}
5908 		break;
5909 	}
5910 	if (entry->superpage_size) {
5911 		end = SUPERPAGE_ROUND_UP(end);
5912 	}
5913 
5914 	/*
5915 	 *	Make a first pass to check for protection and address
5916 	 *	violations.
5917 	 */
5918 
5919 	current = entry;
5920 	prev = current->vme_start;
5921 	while ((current != vm_map_to_entry(map)) &&
5922 	    (current->vme_start < end)) {
5923 		/*
5924 		 * If there is a hole, return an error.
5925 		 */
5926 		if (current->vme_start != prev) {
5927 			vm_map_unlock(map);
5928 			return KERN_INVALID_ADDRESS;
5929 		}
5930 
5931 		new_max = current->max_protection;
5932 
5933 #if defined(__x86_64__)
5934 		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
5935 		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
5936 			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
5937 		}
5938 #endif
5939 		if ((new_prot & new_max) != new_prot) {
5940 			vm_map_unlock(map);
5941 			return KERN_PROTECTION_FAILURE;
5942 		}
5943 
5944 		if (current->used_for_jit &&
5945 		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
5946 			vm_map_unlock(map);
5947 			return KERN_PROTECTION_FAILURE;
5948 		}
5949 
5950 #if __arm64e__
5951 		/* Disallow remapping hw assisted TPRO mappings */
5952 		if (current->used_for_tpro) {
5953 			vm_map_unlock(map);
5954 			return KERN_PROTECTION_FAILURE;
5955 		}
5956 #endif /* __arm64e__ */
5957 
5958 
5959 		if ((new_prot & VM_PROT_WRITE) &&
5960 		    (new_prot & VM_PROT_ALLEXEC) &&
5961 #if XNU_TARGET_OS_OSX
5962 		    map->pmap != kernel_pmap &&
5963 		    (vm_map_cs_enforcement(map)
5964 #if __arm64__
5965 		    || !VM_MAP_IS_EXOTIC(map)
5966 #endif /* __arm64__ */
5967 		    ) &&
5968 #endif /* XNU_TARGET_OS_OSX */
5969 		    !(current->used_for_jit)) {
5970 			DTRACE_VM3(cs_wx,
5971 			    uint64_t, (uint64_t) current->vme_start,
5972 			    uint64_t, (uint64_t) current->vme_end,
5973 			    vm_prot_t, new_prot);
5974 			printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
5975 			    proc_selfpid(),
5976 			    (get_bsdtask_info(current_task())
5977 			    ? proc_name_address(get_bsdtask_info(current_task()))
5978 			    : "?"),
5979 			    __FUNCTION__);
5980 			new_prot &= ~VM_PROT_ALLEXEC;
5981 			if (VM_MAP_POLICY_WX_FAIL(map)) {
5982 				vm_map_unlock(map);
5983 				return KERN_PROTECTION_FAILURE;
5984 			}
5985 		}
5986 
5987 		/*
5988 		 * If the task has requested executable lockdown,
5989 		 * deny both:
5990 		 * - adding executable protections OR
5991 		 * - adding write protections to an existing executable mapping.
5992 		 */
5993 		if (map->map_disallow_new_exec == TRUE) {
5994 			if ((new_prot & VM_PROT_ALLEXEC) ||
5995 			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
5996 				vm_map_unlock(map);
5997 				return KERN_PROTECTION_FAILURE;
5998 			}
5999 		}
6000 
6001 		prev = current->vme_end;
6002 		current = current->vme_next;
6003 	}
6004 
6005 #if __arm64__
6006 	if (end > prev &&
6007 	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
6008 		vm_map_entry_t prev_entry;
6009 
6010 		prev_entry = current->vme_prev;
6011 		if (prev_entry != vm_map_to_entry(map) &&
6012 		    !prev_entry->map_aligned &&
6013 		    (vm_map_round_page(prev_entry->vme_end,
6014 		    VM_MAP_PAGE_MASK(map))
6015 		    == end)) {
6016 			/*
6017 			 * The last entry in our range is not "map-aligned"
6018 			 * but it would have reached all the way to "end"
6019 			 * if it had been map-aligned, so this is not really
6020 			 * a hole in the range and we can proceed.
6021 			 */
6022 			prev = end;
6023 		}
6024 	}
6025 #endif /* __arm64__ */
6026 
6027 	if (end > prev) {
6028 		vm_map_unlock(map);
6029 		return KERN_INVALID_ADDRESS;
6030 	}
6031 
6032 	/*
6033 	 *	Go back and fix up protections.
6034 	 *	Clip to start here if the range starts within
6035 	 *	the entry.
6036 	 */
6037 
6038 	current = entry;
6039 	if (current != vm_map_to_entry(map)) {
6040 		/* clip and unnest if necessary */
6041 		vm_map_clip_start(map, current, start);
6042 	}
6043 
6044 	while ((current != vm_map_to_entry(map)) &&
6045 	    (current->vme_start < end)) {
6046 		vm_prot_t       old_prot;
6047 
6048 		vm_map_clip_end(map, current, end);
6049 
6050 		if (current->is_sub_map) {
6051 			/* clipping did unnest if needed */
6052 			assert(!current->use_pmap);
6053 		}
6054 
6055 		old_prot = current->protection;
6056 
6057 		if (set_max) {
6058 			current->max_protection = new_prot;
6059 			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
6060 			current->protection = (new_prot & old_prot);
6061 		} else {
6062 			current->protection = new_prot;
6063 		}
6064 
6065 		/*
6066 		 *	Update physical map if necessary.
6067 		 *	If the request is to turn off write protection,
6068 		 *	we won't do it for real (in pmap). This is because
6069 		 *	it would cause copy-on-write to fail.  We've already
6070 		 *	set, the new protection in the map, so if a
6071 		 *	write-protect fault occurred, it will be fixed up
6072 		 *	properly, COW or not.
6073 		 */
6074 		if (current->protection != old_prot) {
6075 			/* Look one level in we support nested pmaps */
6076 			/* from mapped submaps which are direct entries */
6077 			/* in our map */
6078 
6079 			vm_prot_t prot;
6080 
6081 			prot = current->protection;
6082 			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6083 				prot &= ~VM_PROT_WRITE;
6084 			} else {
6085 				assert(!VME_OBJECT(current)->code_signed);
6086 				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6087 				if (prot & VM_PROT_WRITE) {
6088 					/*
6089 					 * For write requests on the
6090 					 * compressor, we wil ask the
6091 					 * pmap layer to prevent us from
6092 					 * taking a write fault when we
6093 					 * attempt to access the mapping
6094 					 * next.
6095 					 */
6096 					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6097 				}
6098 			}
6099 
6100 			if (override_nx(map, VME_ALIAS(current)) && prot) {
6101 				prot |= VM_PROT_EXECUTE;
6102 			}
6103 
6104 #if DEVELOPMENT || DEBUG
6105 			if (!(old_prot & VM_PROT_EXECUTE) &&
6106 			    (prot & VM_PROT_EXECUTE) &&
6107 			    panic_on_unsigned_execute &&
6108 			    (proc_selfcsflags() & CS_KILL)) {
6109 				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6110 			}
6111 #endif /* DEVELOPMENT || DEBUG */
6112 
6113 			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6114 				if (current->wired_count) {
6115 					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6116 					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6117 				}
6118 
6119 				/* If the pmap layer cares about this
6120 				 * protection type, force a fault for
6121 				 * each page so that vm_fault will
6122 				 * repopulate the page with the full
6123 				 * set of protections.
6124 				 */
6125 				/*
6126 				 * TODO: We don't seem to need this,
6127 				 * but this is due to an internal
6128 				 * implementation detail of
6129 				 * pmap_protect.  Do we want to rely
6130 				 * on this?
6131 				 */
6132 				prot = VM_PROT_NONE;
6133 			}
6134 
6135 			if (current->is_sub_map && current->use_pmap) {
6136 				pmap_protect(VME_SUBMAP(current)->pmap,
6137 				    current->vme_start,
6138 				    current->vme_end,
6139 				    prot);
6140 			} else {
6141 				pmap_protect_options(map->pmap,
6142 				    current->vme_start,
6143 				    current->vme_end,
6144 				    prot,
6145 				    pmap_options,
6146 				    NULL);
6147 			}
6148 		}
6149 		current = current->vme_next;
6150 	}
6151 
6152 	current = entry;
6153 	while ((current != vm_map_to_entry(map)) &&
6154 	    (current->vme_start <= end)) {
6155 		vm_map_simplify_entry(map, current);
6156 		current = current->vme_next;
6157 	}
6158 
6159 	vm_map_unlock(map);
6160 	return KERN_SUCCESS;
6161 }
6162 
6163 /*
6164  *	vm_map_inherit:
6165  *
6166  *	Sets the inheritance of the specified address
6167  *	range in the target map.  Inheritance
6168  *	affects how the map will be shared with
6169  *	child maps at the time of vm_map_fork.
6170  */
6171 kern_return_t
vm_map_inherit(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_inherit_t new_inheritance)6172 vm_map_inherit(
6173 	vm_map_t        map,
6174 	vm_map_offset_t start,
6175 	vm_map_offset_t end,
6176 	vm_inherit_t    new_inheritance)
6177 {
6178 	vm_map_entry_t  entry;
6179 	vm_map_entry_t  temp_entry;
6180 
6181 	vm_map_lock(map);
6182 
6183 	VM_MAP_RANGE_CHECK(map, start, end);
6184 
6185 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
6186 		entry = temp_entry;
6187 	} else {
6188 		temp_entry = temp_entry->vme_next;
6189 		entry = temp_entry;
6190 	}
6191 
6192 	/* first check entire range for submaps which can't support the */
6193 	/* given inheritance. */
6194 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6195 		if (entry->is_sub_map) {
6196 			if (new_inheritance == VM_INHERIT_COPY) {
6197 				vm_map_unlock(map);
6198 				return KERN_INVALID_ARGUMENT;
6199 			}
6200 		}
6201 
6202 		entry = entry->vme_next;
6203 	}
6204 
6205 	entry = temp_entry;
6206 	if (entry != vm_map_to_entry(map)) {
6207 		/* clip and unnest if necessary */
6208 		vm_map_clip_start(map, entry, start);
6209 	}
6210 
6211 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6212 		vm_map_clip_end(map, entry, end);
6213 		if (entry->is_sub_map) {
6214 			/* clip did unnest if needed */
6215 			assert(!entry->use_pmap);
6216 		}
6217 
6218 		entry->inheritance = new_inheritance;
6219 
6220 		entry = entry->vme_next;
6221 	}
6222 
6223 	vm_map_unlock(map);
6224 	return KERN_SUCCESS;
6225 }
6226 
6227 /*
6228  * Update the accounting for the amount of wired memory in this map.  If the user has
6229  * exceeded the defined limits, then we fail.  Wiring on behalf of the kernel never fails.
6230  */
6231 
6232 static kern_return_t
add_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6233 add_wire_counts(
6234 	vm_map_t        map,
6235 	vm_map_entry_t  entry,
6236 	boolean_t       user_wire)
6237 {
6238 	vm_map_size_t   size;
6239 
6240 	if (user_wire) {
6241 		unsigned int total_wire_count =  vm_page_wire_count + vm_lopage_free_count;
6242 
6243 		/*
6244 		 * We're wiring memory at the request of the user.  Check if this is the first time the user is wiring
6245 		 * this map entry.
6246 		 */
6247 
6248 		if (entry->user_wired_count == 0) {
6249 			size = entry->vme_end - entry->vme_start;
6250 
6251 			/*
6252 			 * Since this is the first time the user is wiring this map entry, check to see if we're
6253 			 * exceeding the user wire limits.  There is a per map limit which is the smaller of either
6254 			 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value.  There is also
6255 			 * a system-wide limit on the amount of memory all users can wire.  If the user is over either
6256 			 * limit, then we fail.
6257 			 */
6258 
6259 			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6260 			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6261 				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6262 #if DEVELOPMENT || DEBUG
6263 					if (panic_on_mlock_failure) {
6264 						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6265 					}
6266 #endif /* DEVELOPMENT || DEBUG */
6267 					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6268 				} else {
6269 					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6270 #if DEVELOPMENT || DEBUG
6271 					if (panic_on_mlock_failure) {
6272 						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6273 					}
6274 #endif /* DEVELOPMENT || DEBUG */
6275 				}
6276 				return KERN_RESOURCE_SHORTAGE;
6277 			}
6278 
6279 			/*
6280 			 * The first time the user wires an entry, we also increment the wired_count and add this to
6281 			 * the total that has been wired in the map.
6282 			 */
6283 
6284 			if (entry->wired_count >= MAX_WIRE_COUNT) {
6285 				return KERN_FAILURE;
6286 			}
6287 
6288 			entry->wired_count++;
6289 			map->user_wire_size += size;
6290 		}
6291 
6292 		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6293 			return KERN_FAILURE;
6294 		}
6295 
6296 		entry->user_wired_count++;
6297 	} else {
6298 		/*
6299 		 * The kernel's wiring the memory.  Just bump the count and continue.
6300 		 */
6301 
6302 		if (entry->wired_count >= MAX_WIRE_COUNT) {
6303 			panic("vm_map_wire: too many wirings");
6304 		}
6305 
6306 		entry->wired_count++;
6307 	}
6308 
6309 	return KERN_SUCCESS;
6310 }
6311 
6312 /*
6313  * Update the memory wiring accounting now that the given map entry is being unwired.
6314  */
6315 
6316 static void
subtract_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6317 subtract_wire_counts(
6318 	vm_map_t        map,
6319 	vm_map_entry_t  entry,
6320 	boolean_t       user_wire)
6321 {
6322 	if (user_wire) {
6323 		/*
6324 		 * We're unwiring memory at the request of the user.  See if we're removing the last user wire reference.
6325 		 */
6326 
6327 		if (entry->user_wired_count == 1) {
6328 			/*
6329 			 * We're removing the last user wire reference.  Decrement the wired_count and the total
6330 			 * user wired memory for this map.
6331 			 */
6332 
6333 			assert(entry->wired_count >= 1);
6334 			entry->wired_count--;
6335 			map->user_wire_size -= entry->vme_end - entry->vme_start;
6336 		}
6337 
6338 		assert(entry->user_wired_count >= 1);
6339 		entry->user_wired_count--;
6340 	} else {
6341 		/*
6342 		 * The kernel is unwiring the memory.   Just update the count.
6343 		 */
6344 
6345 		assert(entry->wired_count >= 1);
6346 		entry->wired_count--;
6347 	}
6348 }
6349 
/*
 * NOTE(review): debug/telemetry counter — appears related to wiring of
 * executable (code-signed) memory; its readers/writers are not visible
 * in this chunk, confirm against the rest of vm_map.c.
 */
int cs_executable_wire = 0;
6351 
6352 /*
6353  *	vm_map_wire:
6354  *
6355  *	Sets the pageability of the specified address range in the
6356  *	target map as wired.  Regions specified as not pageable require
6357  *	locked-down physical memory and physical page maps.  The
6358  *	access_type variable indicates types of accesses that must not
6359  *	generate page faults.  This is checked against protection of
6360  *	memory being locked-down.
6361  *
6362  *	The map must not be locked, but a reference must remain to the
6363  *	map throughout the call.
6364  */
6365 static kern_return_t
vm_map_wire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr,ppnum_t * physpage_p)6366 vm_map_wire_nested(
6367 	vm_map_t                map,
6368 	vm_map_offset_t         start,
6369 	vm_map_offset_t         end,
6370 	vm_prot_t               caller_prot,
6371 	vm_tag_t                tag,
6372 	boolean_t               user_wire,
6373 	pmap_t                  map_pmap,
6374 	vm_map_offset_t         pmap_addr,
6375 	ppnum_t                 *physpage_p)
6376 {
6377 	vm_map_entry_t          entry;
6378 	vm_prot_t               access_type;
6379 	struct vm_map_entry     *first_entry, tmp_entry;
6380 	vm_map_t                real_map;
6381 	vm_map_offset_t         s, e;
6382 	kern_return_t           rc;
6383 	boolean_t               need_wakeup;
6384 	boolean_t               main_map = FALSE;
6385 	wait_interrupt_t        interruptible_state;
6386 	thread_t                cur_thread;
6387 	unsigned int            last_timestamp;
6388 	vm_map_size_t           size;
6389 	boolean_t               wire_and_extract;
6390 	vm_prot_t               extra_prots;
6391 
6392 	extra_prots = VM_PROT_COPY;
6393 	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6394 #if XNU_TARGET_OS_OSX
6395 	if (map->pmap == kernel_pmap ||
6396 	    !vm_map_cs_enforcement(map)) {
6397 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6398 	}
6399 #endif /* XNU_TARGET_OS_OSX */
6400 
6401 	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6402 
6403 	wire_and_extract = FALSE;
6404 	if (physpage_p != NULL) {
6405 		/*
6406 		 * The caller wants the physical page number of the
6407 		 * wired page.  We return only one physical page number
6408 		 * so this works for only one page at a time.
6409 		 */
6410 		if ((end - start) != PAGE_SIZE) {
6411 			return KERN_INVALID_ARGUMENT;
6412 		}
6413 		wire_and_extract = TRUE;
6414 		*physpage_p = 0;
6415 	}
6416 
6417 	vm_map_lock(map);
6418 	if (map_pmap == NULL) {
6419 		main_map = TRUE;
6420 	}
6421 	last_timestamp = map->timestamp;
6422 
6423 	VM_MAP_RANGE_CHECK(map, start, end);
6424 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6425 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6426 
6427 	if (start == end) {
6428 		/* We wired what the caller asked for, zero pages */
6429 		vm_map_unlock(map);
6430 		return KERN_SUCCESS;
6431 	}
6432 
6433 	need_wakeup = FALSE;
6434 	cur_thread = current_thread();
6435 
6436 	s = start;
6437 	rc = KERN_SUCCESS;
6438 
6439 	if (vm_map_lookup_entry(map, s, &first_entry)) {
6440 		entry = first_entry;
6441 		/*
6442 		 * vm_map_clip_start will be done later.
6443 		 * We don't want to unnest any nested submaps here !
6444 		 */
6445 	} else {
6446 		/* Start address is not in map */
6447 		rc = KERN_INVALID_ADDRESS;
6448 		goto done;
6449 	}
6450 
6451 	while ((entry != vm_map_to_entry(map)) && (s < end)) {
6452 		/*
6453 		 * At this point, we have wired from "start" to "s".
6454 		 * We still need to wire from "s" to "end".
6455 		 *
6456 		 * "entry" hasn't been clipped, so it could start before "s"
6457 		 * and/or end after "end".
6458 		 */
6459 
6460 		/* "e" is how far we want to wire in this entry */
6461 		e = entry->vme_end;
6462 		if (e > end) {
6463 			e = end;
6464 		}
6465 
6466 		/*
6467 		 * If another thread is wiring/unwiring this entry then
6468 		 * block after informing other thread to wake us up.
6469 		 */
6470 		if (entry->in_transition) {
6471 			wait_result_t wait_result;
6472 
6473 			/*
6474 			 * We have not clipped the entry.  Make sure that
6475 			 * the start address is in range so that the lookup
6476 			 * below will succeed.
6477 			 * "s" is the current starting point: we've already
6478 			 * wired from "start" to "s" and we still have
6479 			 * to wire from "s" to "end".
6480 			 */
6481 
6482 			entry->needs_wakeup = TRUE;
6483 
6484 			/*
6485 			 * wake up anybody waiting on entries that we have
6486 			 * already wired.
6487 			 */
6488 			if (need_wakeup) {
6489 				vm_map_entry_wakeup(map);
6490 				need_wakeup = FALSE;
6491 			}
6492 			/*
6493 			 * User wiring is interruptible
6494 			 */
6495 			wait_result = vm_map_entry_wait(map,
6496 			    (user_wire) ? THREAD_ABORTSAFE :
6497 			    THREAD_UNINT);
6498 			if (user_wire && wait_result == THREAD_INTERRUPTED) {
6499 				/*
6500 				 * undo the wirings we have done so far
6501 				 * We do not clear the needs_wakeup flag,
6502 				 * because we cannot tell if we were the
6503 				 * only one waiting.
6504 				 */
6505 				rc = KERN_FAILURE;
6506 				goto done;
6507 			}
6508 
6509 			/*
6510 			 * Cannot avoid a lookup here. reset timestamp.
6511 			 */
6512 			last_timestamp = map->timestamp;
6513 
6514 			/*
6515 			 * The entry could have been clipped, look it up again.
6516 			 * Worse that can happen is, it may not exist anymore.
6517 			 */
6518 			if (!vm_map_lookup_entry(map, s, &first_entry)) {
6519 				/*
6520 				 * User: undo everything upto the previous
6521 				 * entry.  let vm_map_unwire worry about
6522 				 * checking the validity of the range.
6523 				 */
6524 				rc = KERN_FAILURE;
6525 				goto done;
6526 			}
6527 			entry = first_entry;
6528 			continue;
6529 		}
6530 
6531 		if (entry->is_sub_map) {
6532 			vm_map_offset_t sub_start;
6533 			vm_map_offset_t sub_end;
6534 			vm_map_offset_t local_start;
6535 			vm_map_offset_t local_end;
6536 			pmap_t          pmap;
6537 
6538 			if (wire_and_extract) {
6539 				/*
6540 				 * Wiring would result in copy-on-write
6541 				 * which would not be compatible with
6542 				 * the sharing we have with the original
6543 				 * provider of this memory.
6544 				 */
6545 				rc = KERN_INVALID_ARGUMENT;
6546 				goto done;
6547 			}
6548 
6549 			vm_map_clip_start(map, entry, s);
6550 			vm_map_clip_end(map, entry, end);
6551 
6552 			sub_start = VME_OFFSET(entry);
6553 			sub_end = entry->vme_end;
6554 			sub_end += VME_OFFSET(entry) - entry->vme_start;
6555 
6556 			local_end = entry->vme_end;
6557 			if (map_pmap == NULL) {
6558 				vm_object_t             object;
6559 				vm_object_offset_t      offset;
6560 				vm_prot_t               prot;
6561 				boolean_t               wired;
6562 				vm_map_entry_t          local_entry;
6563 				vm_map_version_t         version;
6564 				vm_map_t                lookup_map;
6565 
6566 				if (entry->use_pmap) {
6567 					pmap = VME_SUBMAP(entry)->pmap;
6568 					/* ppc implementation requires that */
6569 					/* submaps pmap address ranges line */
6570 					/* up with parent map */
6571 #ifdef notdef
6572 					pmap_addr = sub_start;
6573 #endif
6574 					pmap_addr = s;
6575 				} else {
6576 					pmap = map->pmap;
6577 					pmap_addr = s;
6578 				}
6579 
6580 				if (entry->wired_count) {
6581 					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6582 						goto done;
6583 					}
6584 
6585 					/*
6586 					 * The map was not unlocked:
6587 					 * no need to goto re-lookup.
6588 					 * Just go directly to next entry.
6589 					 */
6590 					entry = entry->vme_next;
6591 					s = entry->vme_start;
6592 					continue;
6593 				}
6594 
6595 				/* call vm_map_lookup_and_lock_object to */
6596 				/* cause any needs copy to be   */
6597 				/* evaluated */
6598 				local_start = entry->vme_start;
6599 				lookup_map = map;
6600 				vm_map_lock_write_to_read(map);
6601 				rc = vm_map_lookup_and_lock_object(
6602 					&lookup_map, local_start,
6603 					(access_type | extra_prots),
6604 					OBJECT_LOCK_EXCLUSIVE,
6605 					&version, &object,
6606 					&offset, &prot, &wired,
6607 					NULL,
6608 					&real_map, NULL);
6609 				if (rc != KERN_SUCCESS) {
6610 					vm_map_unlock_read(lookup_map);
6611 					assert(map_pmap == NULL);
6612 					vm_map_unwire(map, start,
6613 					    s, user_wire);
6614 					return rc;
6615 				}
6616 				vm_object_unlock(object);
6617 				if (real_map != lookup_map) {
6618 					vm_map_unlock(real_map);
6619 				}
6620 				vm_map_unlock_read(lookup_map);
6621 				vm_map_lock(map);
6622 
6623 				/* we unlocked, so must re-lookup */
6624 				if (!vm_map_lookup_entry(map,
6625 				    local_start,
6626 				    &local_entry)) {
6627 					rc = KERN_FAILURE;
6628 					goto done;
6629 				}
6630 
6631 				/*
6632 				 * entry could have been "simplified",
6633 				 * so re-clip
6634 				 */
6635 				entry = local_entry;
6636 				assert(s == local_start);
6637 				vm_map_clip_start(map, entry, s);
6638 				vm_map_clip_end(map, entry, end);
6639 				/* re-compute "e" */
6640 				e = entry->vme_end;
6641 				if (e > end) {
6642 					e = end;
6643 				}
6644 
6645 				/* did we have a change of type? */
6646 				if (!entry->is_sub_map) {
6647 					last_timestamp = map->timestamp;
6648 					continue;
6649 				}
6650 			} else {
6651 				local_start = entry->vme_start;
6652 				pmap = map_pmap;
6653 			}
6654 
6655 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6656 				goto done;
6657 			}
6658 
6659 			entry->in_transition = TRUE;
6660 
6661 			vm_map_unlock(map);
6662 			rc = vm_map_wire_nested(VME_SUBMAP(entry),
6663 			    sub_start, sub_end,
6664 			    caller_prot, tag,
6665 			    user_wire, pmap, pmap_addr,
6666 			    NULL);
6667 			vm_map_lock(map);
6668 
6669 			/*
6670 			 * Find the entry again.  It could have been clipped
6671 			 * after we unlocked the map.
6672 			 */
6673 			if (!vm_map_lookup_entry(map, local_start,
6674 			    &first_entry)) {
6675 				panic("vm_map_wire: re-lookup failed");
6676 			}
6677 			entry = first_entry;
6678 
6679 			assert(local_start == s);
6680 			/* re-compute "e" */
6681 			e = entry->vme_end;
6682 			if (e > end) {
6683 				e = end;
6684 			}
6685 
6686 			last_timestamp = map->timestamp;
6687 			while ((entry != vm_map_to_entry(map)) &&
6688 			    (entry->vme_start < e)) {
6689 				assert(entry->in_transition);
6690 				entry->in_transition = FALSE;
6691 				if (entry->needs_wakeup) {
6692 					entry->needs_wakeup = FALSE;
6693 					need_wakeup = TRUE;
6694 				}
6695 				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6696 					subtract_wire_counts(map, entry, user_wire);
6697 				}
6698 				entry = entry->vme_next;
6699 			}
6700 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6701 				goto done;
6702 			}
6703 
6704 			/* no need to relookup again */
6705 			s = entry->vme_start;
6706 			continue;
6707 		}
6708 
6709 		/*
6710 		 * If this entry is already wired then increment
6711 		 * the appropriate wire reference count.
6712 		 */
6713 		if (entry->wired_count) {
6714 			if ((entry->protection & access_type) != access_type) {
6715 				/* found a protection problem */
6716 
6717 				/*
6718 				 * XXX FBDP
6719 				 * We should always return an error
6720 				 * in this case but since we didn't
6721 				 * enforce it before, let's do
6722 				 * it only for the new "wire_and_extract"
6723 				 * code path for now...
6724 				 */
6725 				if (wire_and_extract) {
6726 					rc = KERN_PROTECTION_FAILURE;
6727 					goto done;
6728 				}
6729 			}
6730 
6731 			/*
6732 			 * entry is already wired down, get our reference
6733 			 * after clipping to our range.
6734 			 */
6735 			vm_map_clip_start(map, entry, s);
6736 			vm_map_clip_end(map, entry, end);
6737 
6738 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6739 				goto done;
6740 			}
6741 
6742 			if (wire_and_extract) {
6743 				vm_object_t             object;
6744 				vm_object_offset_t      offset;
6745 				vm_page_t               m;
6746 
6747 				/*
6748 				 * We don't have to "wire" the page again
6749 				 * bit we still have to "extract" its
6750 				 * physical page number, after some sanity
6751 				 * checks.
6752 				 */
6753 				assert((entry->vme_end - entry->vme_start)
6754 				    == PAGE_SIZE);
6755 				assert(!entry->needs_copy);
6756 				assert(!entry->is_sub_map);
6757 				assert(VME_OBJECT(entry));
6758 				if (((entry->vme_end - entry->vme_start)
6759 				    != PAGE_SIZE) ||
6760 				    entry->needs_copy ||
6761 				    entry->is_sub_map ||
6762 				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
6763 					rc = KERN_INVALID_ARGUMENT;
6764 					goto done;
6765 				}
6766 
6767 				object = VME_OBJECT(entry);
6768 				offset = VME_OFFSET(entry);
6769 				/* need exclusive lock to update m->dirty */
6770 				if (entry->protection & VM_PROT_WRITE) {
6771 					vm_object_lock(object);
6772 				} else {
6773 					vm_object_lock_shared(object);
6774 				}
6775 				m = vm_page_lookup(object, offset);
6776 				assert(m != VM_PAGE_NULL);
6777 				assert(VM_PAGE_WIRED(m));
6778 				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6779 					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6780 					if (entry->protection & VM_PROT_WRITE) {
6781 						vm_object_lock_assert_exclusive(
6782 							object);
6783 						m->vmp_dirty = TRUE;
6784 					}
6785 				} else {
6786 					/* not already wired !? */
6787 					*physpage_p = 0;
6788 				}
6789 				vm_object_unlock(object);
6790 			}
6791 
6792 			/* map was not unlocked: no need to relookup */
6793 			entry = entry->vme_next;
6794 			s = entry->vme_start;
6795 			continue;
6796 		}
6797 
6798 		/*
6799 		 * Unwired entry or wire request transmitted via submap
6800 		 */
6801 
6802 		/*
6803 		 * Wiring would copy the pages to the shadow object.
6804 		 * The shadow object would not be code-signed so
6805 		 * attempting to execute code from these copied pages
6806 		 * would trigger a code-signing violation.
6807 		 */
6808 
6809 		if ((entry->protection & VM_PROT_EXECUTE)
6810 #if XNU_TARGET_OS_OSX
6811 		    &&
6812 		    map->pmap != kernel_pmap &&
6813 		    (vm_map_cs_enforcement(map)
6814 #if __arm64__
6815 		    || !VM_MAP_IS_EXOTIC(map)
6816 #endif /* __arm64__ */
6817 		    )
6818 #endif /* XNU_TARGET_OS_OSX */
6819 		    ) {
6820 #if MACH_ASSERT
6821 			printf("pid %d[%s] wiring executable range from "
6822 			    "0x%llx to 0x%llx: rejected to preserve "
6823 			    "code-signing\n",
6824 			    proc_selfpid(),
6825 			    (get_bsdtask_info(current_task())
6826 			    ? proc_name_address(get_bsdtask_info(current_task()))
6827 			    : "?"),
6828 			    (uint64_t) entry->vme_start,
6829 			    (uint64_t) entry->vme_end);
6830 #endif /* MACH_ASSERT */
6831 			DTRACE_VM2(cs_executable_wire,
6832 			    uint64_t, (uint64_t)entry->vme_start,
6833 			    uint64_t, (uint64_t)entry->vme_end);
6834 			cs_executable_wire++;
6835 			rc = KERN_PROTECTION_FAILURE;
6836 			goto done;
6837 		}
6838 
6839 		/*
6840 		 * Perform actions of vm_map_lookup that need the write
6841 		 * lock on the map: create a shadow object for a
6842 		 * copy-on-write region, or an object for a zero-fill
6843 		 * region.
6844 		 */
6845 		size = entry->vme_end - entry->vme_start;
6846 		/*
6847 		 * If wiring a copy-on-write page, we need to copy it now
6848 		 * even if we're only (currently) requesting read access.
6849 		 * This is aggressive, but once it's wired we can't move it.
6850 		 */
6851 		if (entry->needs_copy) {
6852 			if (wire_and_extract) {
6853 				/*
6854 				 * We're supposed to share with the original
6855 				 * provider so should not be "needs_copy"
6856 				 */
6857 				rc = KERN_INVALID_ARGUMENT;
6858 				goto done;
6859 			}
6860 
6861 			VME_OBJECT_SHADOW(entry, size,
6862 			    vm_map_always_shadow(map));
6863 			entry->needs_copy = FALSE;
6864 		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6865 			if (wire_and_extract) {
6866 				/*
6867 				 * We're supposed to share with the original
6868 				 * provider so should already have an object.
6869 				 */
6870 				rc = KERN_INVALID_ARGUMENT;
6871 				goto done;
6872 			}
6873 			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
6874 			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6875 			assert(entry->use_pmap);
6876 		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6877 			if (wire_and_extract) {
6878 				/*
6879 				 * We're supposed to share with the original
6880 				 * provider so should not be COPY_SYMMETRIC.
6881 				 */
6882 				rc = KERN_INVALID_ARGUMENT;
6883 				goto done;
6884 			}
6885 			/*
6886 			 * Force an unrequested "copy-on-write" but only for
6887 			 * the range we're wiring.
6888 			 */
6889 //			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
6890 			vm_map_clip_start(map, entry, s);
6891 			vm_map_clip_end(map, entry, end);
6892 			/* recompute "size" */
6893 			size = entry->vme_end - entry->vme_start;
6894 			/* make a shadow object */
6895 			vm_object_t orig_object;
6896 			vm_object_offset_t orig_offset;
6897 			orig_object = VME_OBJECT(entry);
6898 			orig_offset = VME_OFFSET(entry);
6899 			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
6900 			if (VME_OBJECT(entry) != orig_object) {
6901 				/*
6902 				 * This mapping has not been shared (or it would be
6903 				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
6904 				 * not been copied-on-write (or it would be marked
6905 				 * as "needs_copy" and would have been handled above
6906 				 * and also already write-protected).
6907 				 * We still need to write-protect here to prevent
6908 				 * other threads from modifying these pages while
6909 				 * we're in the process of copying and wiring
6910 				 * the copied pages.
6911 				 * Since the mapping is neither shared nor COWed,
6912 				 * we only need to write-protect the PTEs for this
6913 				 * mapping.
6914 				 */
6915 				vm_object_pmap_protect(orig_object,
6916 				    orig_offset,
6917 				    size,
6918 				    map->pmap,
6919 				    VM_MAP_PAGE_SIZE(map),
6920 				    entry->vme_start,
6921 				    entry->protection & ~VM_PROT_WRITE);
6922 			}
6923 		}
6924 		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6925 			/*
6926 			 * Make the object COPY_DELAY to get a stable object
6927 			 * to wire.
6928 			 * That should avoid creating long shadow chains while
6929 			 * wiring/unwiring the same range repeatedly.
6930 			 * That also prevents part of the object from being
6931 			 * wired while another part is "needs_copy", which
6932 			 * could result in conflicting rules wrt copy-on-write.
6933 			 */
6934 			vm_object_t object;
6935 
6936 			object = VME_OBJECT(entry);
6937 			vm_object_lock(object);
6938 			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6939 				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
6940 				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
6941 				    object, (uint64_t)object->vo_size,
6942 				    entry,
6943 				    (uint64_t)entry->vme_start,
6944 				    (uint64_t)entry->vme_end,
6945 				    (uint64_t)VME_OFFSET(entry),
6946 				    (uint64_t)size);
6947 				assertf(object->ref_count == 1,
6948 				    "object %p ref_count %d\n",
6949 				    object, object->ref_count);
6950 				assertf(!entry->needs_copy,
6951 				    "entry %p\n", entry);
6952 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
6953 				object->true_share = TRUE;
6954 			}
6955 			vm_object_unlock(object);
6956 		}
6957 
6958 		vm_map_clip_start(map, entry, s);
6959 		vm_map_clip_end(map, entry, end);
6960 
6961 		/* re-compute "e" */
6962 		e = entry->vme_end;
6963 		if (e > end) {
6964 			e = end;
6965 		}
6966 
6967 		/*
6968 		 * Check for holes and protection mismatch.
6969 		 * Holes: Next entry should be contiguous unless this
6970 		 *	  is the end of the region.
6971 		 * Protection: Access requested must be allowed, unless
6972 		 *	wiring is by protection class
6973 		 */
6974 		if ((entry->vme_end < end) &&
6975 		    ((entry->vme_next == vm_map_to_entry(map)) ||
6976 		    (entry->vme_next->vme_start > entry->vme_end))) {
6977 			/* found a hole */
6978 			rc = KERN_INVALID_ADDRESS;
6979 			goto done;
6980 		}
6981 		if ((entry->protection & access_type) != access_type) {
6982 			/* found a protection problem */
6983 			rc = KERN_PROTECTION_FAILURE;
6984 			goto done;
6985 		}
6986 
6987 		assert(entry->wired_count == 0 && entry->user_wired_count == 0);
6988 
6989 		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6990 			goto done;
6991 		}
6992 
6993 		entry->in_transition = TRUE;
6994 
6995 		/*
6996 		 * This entry might get split once we unlock the map.
6997 		 * In vm_fault_wire(), we need the current range as
6998 		 * defined by this entry.  In order for this to work
6999 		 * along with a simultaneous clip operation, we make a
7000 		 * temporary copy of this entry and use that for the
7001 		 * wiring.  Note that the underlying objects do not
7002 		 * change during a clip.
7003 		 */
7004 		tmp_entry = *entry;
7005 
7006 		/*
7007 		 * The in_transition state guarentees that the entry
7008 		 * (or entries for this range, if split occured) will be
7009 		 * there when the map lock is acquired for the second time.
7010 		 */
7011 		vm_map_unlock(map);
7012 
7013 		if (!user_wire && cur_thread != THREAD_NULL) {
7014 			interruptible_state = thread_interrupt_level(THREAD_UNINT);
7015 		} else {
7016 			interruptible_state = THREAD_UNINT;
7017 		}
7018 
7019 		if (map_pmap) {
7020 			rc = vm_fault_wire(map,
7021 			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7022 			    physpage_p);
7023 		} else {
7024 			rc = vm_fault_wire(map,
7025 			    &tmp_entry, caller_prot, tag, map->pmap,
7026 			    tmp_entry.vme_start,
7027 			    physpage_p);
7028 		}
7029 
7030 		if (!user_wire && cur_thread != THREAD_NULL) {
7031 			thread_interrupt_level(interruptible_state);
7032 		}
7033 
7034 		vm_map_lock(map);
7035 
7036 		if (last_timestamp + 1 != map->timestamp) {
7037 			/*
7038 			 * Find the entry again.  It could have been clipped
7039 			 * after we unlocked the map.
7040 			 */
7041 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7042 			    &first_entry)) {
7043 				panic("vm_map_wire: re-lookup failed");
7044 			}
7045 
7046 			entry = first_entry;
7047 		}
7048 
7049 		last_timestamp = map->timestamp;
7050 
7051 		while ((entry != vm_map_to_entry(map)) &&
7052 		    (entry->vme_start < tmp_entry.vme_end)) {
7053 			assert(entry->in_transition);
7054 			entry->in_transition = FALSE;
7055 			if (entry->needs_wakeup) {
7056 				entry->needs_wakeup = FALSE;
7057 				need_wakeup = TRUE;
7058 			}
7059 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
7060 				subtract_wire_counts(map, entry, user_wire);
7061 			}
7062 			entry = entry->vme_next;
7063 		}
7064 
7065 		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
7066 			goto done;
7067 		}
7068 
7069 		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7070 		    (tmp_entry.vme_end != end) &&    /* AND, we are not at the end of the requested range */
7071 		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7072 			/* found a "new" hole */
7073 			s = tmp_entry.vme_end;
7074 			rc = KERN_INVALID_ADDRESS;
7075 			goto done;
7076 		}
7077 
7078 		s = entry->vme_start;
7079 	} /* end while loop through map entries */
7080 
7081 done:
7082 	if (rc == KERN_SUCCESS) {
7083 		/* repair any damage we may have made to the VM map */
7084 		vm_map_simplify_range(map, start, end);
7085 	}
7086 
7087 	vm_map_unlock(map);
7088 
7089 	/*
7090 	 * wake up anybody waiting on entries we wired.
7091 	 */
7092 	if (need_wakeup) {
7093 		vm_map_entry_wakeup(map);
7094 	}
7095 
7096 	if (rc != KERN_SUCCESS) {
7097 		/* undo what has been wired so far */
7098 		vm_map_unwire_nested(map, start, s, user_wire,
7099 		    map_pmap, pmap_addr);
7100 		if (physpage_p) {
7101 			*physpage_p = 0;
7102 		}
7103 	}
7104 
7105 	return rc;
7106 }
7107 
7108 kern_return_t
vm_map_wire_external(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,boolean_t user_wire)7109 vm_map_wire_external(
7110 	vm_map_t                map,
7111 	vm_map_offset_t         start,
7112 	vm_map_offset_t         end,
7113 	vm_prot_t               caller_prot,
7114 	boolean_t               user_wire)
7115 {
7116 	kern_return_t   kret;
7117 
7118 	kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(),
7119 	    user_wire, (pmap_t)NULL, 0, NULL);
7120 	return kret;
7121 }
7122 
7123 kern_return_t
vm_map_wire_kernel(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire)7124 vm_map_wire_kernel(
7125 	vm_map_t                map,
7126 	vm_map_offset_t         start,
7127 	vm_map_offset_t         end,
7128 	vm_prot_t               caller_prot,
7129 	vm_tag_t                tag,
7130 	boolean_t               user_wire)
7131 {
7132 	kern_return_t   kret;
7133 
7134 	kret = vm_map_wire_nested(map, start, end, caller_prot, tag,
7135 	    user_wire, (pmap_t)NULL, 0, NULL);
7136 	return kret;
7137 }
7138 
7139 kern_return_t
vm_map_wire_and_extract_external(vm_map_t map,vm_map_offset_t start,vm_prot_t caller_prot,boolean_t user_wire,ppnum_t * physpage_p)7140 vm_map_wire_and_extract_external(
7141 	vm_map_t        map,
7142 	vm_map_offset_t start,
7143 	vm_prot_t       caller_prot,
7144 	boolean_t       user_wire,
7145 	ppnum_t         *physpage_p)
7146 {
7147 	kern_return_t   kret;
7148 
7149 	kret = vm_map_wire_nested(map,
7150 	    start,
7151 	    start + VM_MAP_PAGE_SIZE(map),
7152 	    caller_prot,
7153 	    vm_tag_bt(),
7154 	    user_wire,
7155 	    (pmap_t)NULL,
7156 	    0,
7157 	    physpage_p);
7158 	if (kret != KERN_SUCCESS &&
7159 	    physpage_p != NULL) {
7160 		*physpage_p = 0;
7161 	}
7162 	return kret;
7163 }
7164 
7165 kern_return_t
vm_map_wire_and_extract_kernel(vm_map_t map,vm_map_offset_t start,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire,ppnum_t * physpage_p)7166 vm_map_wire_and_extract_kernel(
7167 	vm_map_t        map,
7168 	vm_map_offset_t start,
7169 	vm_prot_t       caller_prot,
7170 	vm_tag_t        tag,
7171 	boolean_t       user_wire,
7172 	ppnum_t         *physpage_p)
7173 {
7174 	kern_return_t   kret;
7175 
7176 	kret = vm_map_wire_nested(map,
7177 	    start,
7178 	    start + VM_MAP_PAGE_SIZE(map),
7179 	    caller_prot,
7180 	    tag,
7181 	    user_wire,
7182 	    (pmap_t)NULL,
7183 	    0,
7184 	    physpage_p);
7185 	if (kret != KERN_SUCCESS &&
7186 	    physpage_p != NULL) {
7187 		*physpage_p = 0;
7188 	}
7189 	return kret;
7190 }
7191 
/*
 *	vm_map_unwire:
 *
 *	Sets the pageability of the specified address range in the target
 *	map as pageable.  Regions specified must have been wired previously.
 *
 *	The map must not be locked, but a reference must remain to the map
 *	throughout the call.
 *
 *	Kernel will panic on failures.  User unwire ignores holes and
 *	unwired or in-transition entries to avoid losing memory by leaving
 *	it unwired.
 */
/*
 * Unwire [start, end) in "map", recursing into submaps as needed.
 *
 * map_pmap / pmap_addr: when non-NULL, physical unwiring is performed
 * against that pmap (used when called for a nested submap) instead of
 * the map's own pmap.
 * user_wire: TRUE for user-requested unwire (tolerates holes, unwired
 * and in-transition entries); FALSE for kernel unwire (panics on them).
 */
static kern_return_t
vm_map_unwire_nested(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	boolean_t               user_wire,
	pmap_t                  map_pmap,
	vm_map_offset_t         pmap_addr)
{
	vm_map_entry_t          entry;
	struct vm_map_entry     *first_entry, tmp_entry;
	boolean_t               need_wakeup;
	boolean_t               main_map = FALSE;
	unsigned int            last_timestamp;

	vm_map_lock(map);
	if (map_pmap == NULL) {
		main_map = TRUE;
	}
	/*
	 * Snapshot the map's timestamp so we can tell whether the map
	 * changed while we had it unlocked and must re-lookup the entry.
	 */
	last_timestamp = map->timestamp;

	VM_MAP_RANGE_CHECK(map, start, end);
	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));

	if (start == end) {
		/* We unwired what the caller asked for: zero pages */
		vm_map_unlock(map);
		return KERN_SUCCESS;
	}

	if (vm_map_lookup_entry(map, start, &first_entry)) {
		entry = first_entry;
		/*
		 * vm_map_clip_start will be done later.
		 * We don't want to unnest any nested sub maps here !
		 */
	} else {
		if (!user_wire) {
			panic("vm_map_unwire: start not found");
		}
		/*	Start address is not in map. */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	if (entry->superpage_size) {
		/* superpages are always wired */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	need_wakeup = FALSE;
	/* walk every entry overlapping [start, end) */
	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
		if (entry->in_transition) {
			/*
			 * 1)
			 * Another thread is wiring down this entry. Note
			 * that if it is not for the other thread we would
			 * be unwiring an unwired entry.  This is not
			 * permitted.  If we wait, we will be unwiring memory
			 * we did not wire.
			 *
			 * 2)
			 * Another thread is unwiring this entry.  We did not
			 * have a reference to it, because if we did, this
			 * entry will not be getting unwired now.
			 */
			if (!user_wire) {
				/*
				 * XXX FBDP
				 * This could happen:  there could be some
				 * overlapping vslock/vsunlock operations
				 * going on.
				 * We should probably just wait and retry,
				 * but then we have to be careful that this
				 * entry could get "simplified" after
				 * "in_transition" gets unset and before
				 * we re-lookup the entry, so we would
				 * have to re-clip the entry to avoid
				 * re-unwiring what we have already unwired...
				 * See vm_map_wire_nested().
				 *
				 * Or we could just ignore "in_transition"
				 * here and proceed to decrement the wired
				 * count(s) on this entry.  That should be fine
				 * as long as "wired_count" doesn't drop all
				 * the way to 0 (and we should panic if THAT
				 * happens).
				 */
				panic("vm_map_unwire: in_transition entry");
			}

			/* user unwire: skip in-transition entries */
			entry = entry->vme_next;
			continue;
		}

		if (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;
			pmap_t          pmap;

			vm_map_clip_start(map, entry, start);
			vm_map_clip_end(map, entry, end);

			/* compute the submap range backing this entry */
			sub_start = VME_OFFSET(entry);
			sub_end = entry->vme_end - entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			if (map_pmap == NULL) {
				/*
				 * Top-level call: pick the pmap that holds the
				 * physical mappings for this submap entry.
				 */
				if (entry->use_pmap) {
					pmap = VME_SUBMAP(entry)->pmap;
					pmap_addr = sub_start;
				} else {
					pmap = map->pmap;
					pmap_addr = start;
				}
				if (entry->wired_count == 0 ||
				    (user_wire && entry->user_wired_count == 0)) {
					if (!user_wire) {
						panic("vm_map_unwire: entry is unwired");
					}
					entry = entry->vme_next;
					continue;
				}

				/*
				 * Check for holes
				 * Holes: Next entry should be contiguous unless
				 * this is the end of the region.
				 */
				if (((entry->vme_end < end) &&
				    ((entry->vme_next == vm_map_to_entry(map)) ||
				    (entry->vme_next->vme_start
				    > entry->vme_end)))) {
					if (!user_wire) {
						panic("vm_map_unwire: non-contiguous region");
					}
/*
 *                                       entry = entry->vme_next;
 *                                       continue;
 */
				}

				subtract_wire_counts(map, entry, user_wire);

				if (entry->wired_count != 0) {
					/* still wired by someone else: keep mappings */
					entry = entry->vme_next;
					continue;
				}

				entry->in_transition = TRUE;
				tmp_entry = *entry;/* see comment in vm_map_wire() */

				/*
				 * We can unlock the map now. The in_transition state
				 * guarantees existence of the entry.
				 */
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, pmap, pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;

				/*
				 * clear transition bit for all constituent entries
				 * that were in the original entry (saved in
				 * tmp_entry).  Also check for waiters.
				 */
				while ((entry != vm_map_to_entry(map)) &&
				    (entry->vme_start < tmp_entry.vme_end)) {
					assert(entry->in_transition);
					entry->in_transition = FALSE;
					if (entry->needs_wakeup) {
						entry->needs_wakeup = FALSE;
						need_wakeup = TRUE;
					}
					entry = entry->vme_next;
				}
				continue;
			} else {
				/*
				 * Nested call: propagate the caller's pmap into
				 * the submap; wire counts are managed by the
				 * top-level map in this case.
				 */
				tmp_entry = *entry;
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, map_pmap,
				    pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;
			}
		}


		if ((entry->wired_count == 0) ||
		    (user_wire && entry->user_wired_count == 0)) {
			if (!user_wire) {
				panic("vm_map_unwire: entry is unwired");
			}

			entry = entry->vme_next;
			continue;
		}

		assert(entry->wired_count > 0 &&
		    (!user_wire || entry->user_wired_count > 0));

		vm_map_clip_start(map, entry, start);
		vm_map_clip_end(map, entry, end);

		/*
		 * Check for holes
		 * Holes: Next entry should be contiguous unless
		 *	  this is the end of the region.
		 */
		if (((entry->vme_end < end) &&
		    ((entry->vme_next == vm_map_to_entry(map)) ||
		    (entry->vme_next->vme_start > entry->vme_end)))) {
			if (!user_wire) {
				panic("vm_map_unwire: non-contiguous region");
			}
			entry = entry->vme_next;
			continue;
		}

		subtract_wire_counts(map, entry, user_wire);

		if (entry->wired_count != 0) {
			/* still wired (other references): leave pages wired */
			entry = entry->vme_next;
			continue;
		}

		if (entry->zero_wired_pages) {
			entry->zero_wired_pages = FALSE;
		}

		entry->in_transition = TRUE;
		tmp_entry = *entry;     /* see comment in vm_map_wire() */

		/*
		 * We can unlock the map now. The in_transition state
		 * guarantees existence of the entry.
		 */
		vm_map_unlock(map);
		if (map_pmap) {
			vm_fault_unwire(map,
			    &tmp_entry, FALSE, map_pmap, pmap_addr);
		} else {
			vm_fault_unwire(map,
			    &tmp_entry, FALSE, map->pmap,
			    tmp_entry.vme_start);
		}
		vm_map_lock(map);

		if (last_timestamp + 1 != map->timestamp) {
			/*
			 * Find the entry again.  It could have been clipped
			 * or deleted after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
			    &first_entry)) {
				if (!user_wire) {
					panic("vm_map_unwire: re-lookup failed");
				}
				entry = first_entry->vme_next;
			} else {
				entry = first_entry;
			}
		}
		last_timestamp = map->timestamp;

		/*
		 * clear transition bit for all constituent entries that
		 * were in the original entry (saved in tmp_entry).  Also
		 * check for waiters.
		 */
		while ((entry != vm_map_to_entry(map)) &&
		    (entry->vme_start < tmp_entry.vme_end)) {
			assert(entry->in_transition);
			entry->in_transition = FALSE;
			if (entry->needs_wakeup) {
				entry->needs_wakeup = FALSE;
				need_wakeup = TRUE;
			}
			entry = entry->vme_next;
		}
	}

	/*
	 * We might have fragmented the address space when we wired this
	 * range of addresses.  Attempt to re-coalesce these VM map entries
	 * with their neighbors now that they're no longer wired.
	 * Under some circumstances, address space fragmentation can
	 * prevent VM object shadow chain collapsing, which can cause
	 * swap space leaks.
	 */
	vm_map_simplify_range(map, start, end);

	vm_map_unlock(map);
	/*
	 * wake up anybody waiting on entries that we have unwired.
	 */
	if (need_wakeup) {
		vm_map_entry_wakeup(map);
	}
	return KERN_SUCCESS;
}
7545 
7546 kern_return_t
vm_map_unwire(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,boolean_t user_wire)7547 vm_map_unwire(
7548 	vm_map_t                map,
7549 	vm_map_offset_t         start,
7550 	vm_map_offset_t         end,
7551 	boolean_t               user_wire)
7552 {
7553 	return vm_map_unwire_nested(map, start, end,
7554 	           user_wire, (pmap_t)NULL, 0);
7555 }
7556 
7557 
7558 /*
7559  *	vm_map_entry_zap:	[ internal use only ]
7560  *
7561  *	Remove the entry from the target map
7562  *	and put it on a zap list.
7563  */
7564 static void
vm_map_entry_zap(vm_map_t map,vm_map_entry_t entry,vm_map_zap_t zap)7565 vm_map_entry_zap(
7566 	vm_map_t                map,
7567 	vm_map_entry_t          entry,
7568 	vm_map_zap_t            zap)
7569 {
7570 	vm_map_offset_t s, e;
7571 
7572 	s = entry->vme_start;
7573 	e = entry->vme_end;
7574 	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7575 	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7576 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7577 		assert(page_aligned(s));
7578 		assert(page_aligned(e));
7579 	}
7580 	if (entry->map_aligned == TRUE) {
7581 		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7582 		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7583 	}
7584 	assert(entry->wired_count == 0);
7585 	assert(entry->user_wired_count == 0);
7586 	assert(!entry->vme_permanent);
7587 
7588 	vm_map_store_entry_unlink(map, entry, false);
7589 	map->size -= e - s;
7590 
7591 	vm_map_zap_append(zap, entry);
7592 }
7593 
/*
 * Remove (or protect away) the physical mappings that back the portion
 * of "sub_map" reached through [start, end) of "map", where "offset" is
 * the submap offset corresponding to "start".  Recurses for submap
 * entries nested inside "sub_map".
 */
static void
vm_map_submap_pmap_clean(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_t        sub_map,
	vm_map_offset_t offset)
{
	vm_map_offset_t submap_start;
	vm_map_offset_t submap_end;
	vm_map_size_t   remove_size;
	vm_map_entry_t  entry;

	/* translate the parent-map range into submap address space */
	submap_end = offset + (end - start);
	submap_start = offset;

	vm_map_lock_read(sub_map);
	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
		/* clip the first entry's size to the requested range */
		remove_size = (entry->vme_end - entry->vme_start);
		if (offset > entry->vme_start) {
			remove_size -= offset - entry->vme_start;
		}


		if (submap_end < entry->vme_end) {
			remove_size -=
			    entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			/* nested submap: recurse one level deeper */
			vm_map_submap_pmap_clean(
				sub_map,
				start,
				start + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				/*
				 * The parent map may be mapped by other pmaps:
				 * remove mappings through the object so every
				 * pmap referencing these pages is cleaned.
				 */
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					(VME_OFFSET(entry) +
					offset -
					entry->vme_start),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				/* only our pmap maps this: remove directly */
				pmap_remove(map->pmap,
				    (addr64_t)start,
				    (addr64_t)(start + remove_size));
			}
		}
	}

	/*
	 * NOTE(review): if the lookup above failed, this relies on
	 * vm_map_lookup_entry() having left "entry" at the preceding
	 * entry (or the map header), so vme_next is the first entry at
	 * or after "offset" — confirm against vm_map_lookup_entry().
	 */
	entry = entry->vme_next;

	/* process the remaining entries overlapping the submap range */
	while ((entry != vm_map_to_entry(sub_map))
	    && (entry->vme_start < submap_end)) {
		remove_size = (entry->vme_end - entry->vme_start);
		if (submap_end < entry->vme_end) {
			remove_size -= entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			vm_map_submap_pmap_clean(
				sub_map,
				(start + entry->vme_start) - offset,
				((start + entry->vme_start) - offset) + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					VME_OFFSET(entry),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				pmap_remove(map->pmap,
				    (addr64_t)((start + entry->vme_start)
				    - offset),
				    (addr64_t)(((start + entry->vme_start)
				    - offset) + remove_size));
			}
		}
		entry = entry->vme_next;
	}
	vm_map_unlock_read(sub_map);
	return;
}
7693 
7694 /*
7695  *     virt_memory_guard_ast:
7696  *
7697  *     Handle the AST callout for a virtual memory guard.
7698  *	   raise an EXC_GUARD exception and terminate the task
7699  *     if configured to do so.
7700  */
void
virt_memory_guard_ast(
	thread_t thread,
	mach_exception_data_type_t code,
	mach_exception_data_type_t subcode)
{
	task_t task = get_threadtask(thread);
	assert(task != kernel_task);
	assert(task == current_task());
	kern_return_t sync_exception_result;
	uint32_t behavior;

	behavior = task->task_exc_guard;

	/* Is delivery enabled */
	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
		return;
	}

	/*
	 * If only once, make sure we're that once: atomically clear the
	 * DELIVER bit so concurrent ASTs race for a single delivery.
	 * Only the thread whose CAS succeeds proceeds; a loser re-reads
	 * and bails if another thread already consumed the delivery.
	 */
	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;

		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
			break;
		}
		behavior = task->task_exc_guard;
		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
			return;
		}
	}

	/* Raise exception synchronously and see if handler claimed it */
	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode);

	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
		/*
		 * If Synchronous EXC_GUARD delivery was successful then
		 * kill the process and return, else kill the process
		 * and deliver the exception via EXC_CORPSE_NOTIFY.
		 */
		if (sync_exception_result == KERN_SUCCESS) {
			task_bsdtask_kill(current_task());
		} else {
			exit_with_guard_exception(current_proc(), code, subcode);
		}
	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
		/*
		 * If the synchronous EXC_GUARD delivery was not successful,
		 * raise a simulated crash.
		 */
		if (sync_exception_result != KERN_SUCCESS) {
			task_violated_guard(code, subcode, NULL, FALSE);
		}
	}
}
7757 
7758 /*
7759  *     vm_map_guard_exception:
7760  *
7761  *     Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7762  *
7763  *     Right now, we do this when we find nothing mapped, or a
7764  *     gap in the mapping when a user address space deallocate
7765  *     was requested. We report the address of the first gap found.
7766  */
7767 static void
vm_map_guard_exception(vm_map_offset_t gap_start,unsigned reason)7768 vm_map_guard_exception(
7769 	vm_map_offset_t gap_start,
7770 	unsigned reason)
7771 {
7772 	mach_exception_code_t code = 0;
7773 	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7774 	unsigned int target = 0; /* should we pass in pid associated with map? */
7775 	mach_exception_data_type_t subcode = (uint64_t)gap_start;
7776 	boolean_t fatal = FALSE;
7777 
7778 	task_t task = current_task_early();
7779 
7780 	/* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7781 	if (task == NULL || task == kernel_task) {
7782 		return;
7783 	}
7784 
7785 	EXC_GUARD_ENCODE_TYPE(code, guard_type);
7786 	EXC_GUARD_ENCODE_FLAVOR(code, reason);
7787 	EXC_GUARD_ENCODE_TARGET(code, target);
7788 
7789 	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7790 		fatal = TRUE;
7791 	}
7792 	thread_guard_violation(current_thread(), code, subcode, fatal);
7793 }
7794 
7795 static kern_return_t
vm_map_delete_submap_recurse(vm_map_t submap,vm_map_offset_t submap_start,vm_map_offset_t submap_end)7796 vm_map_delete_submap_recurse(
7797 	vm_map_t submap,
7798 	vm_map_offset_t submap_start,
7799 	vm_map_offset_t submap_end)
7800 {
7801 	vm_map_entry_t submap_entry;
7802 
7803 	/*
7804 	 * Verify that the submap does not contain any "permanent" entries
7805 	 * within the specified range.
7806 	 * We do not care about gaps.
7807 	 */
7808 
7809 	vm_map_lock(submap);
7810 
7811 	if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7812 		submap_entry = submap_entry->vme_next;
7813 	}
7814 
7815 	for (;
7816 	    submap_entry != vm_map_to_entry(submap) &&
7817 	    submap_entry->vme_start < submap_end;
7818 	    submap_entry = submap_entry->vme_next) {
7819 		if (submap_entry->vme_permanent) {
7820 			/* "permanent" entry -> fail */
7821 			vm_map_unlock(submap);
7822 			return KERN_PROTECTION_FAILURE;
7823 		}
7824 	}
7825 	/* no "permanent" entries in the range -> success */
7826 	vm_map_unlock(submap);
7827 	return KERN_SUCCESS;
7828 }
7829 
/* Panic helper: vm_map_delete() was given a "start" that is not aligned
 * to the map's page size (and VM_MAP_REMOVE_NO_MAP_ALIGN was not set). */
__abortlike
static void
__vm_map_delete_misaligned_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
}
7840 
/* Panic helper: a vm_map_delete() on a kernel map failed with "kr",
 * which kernel callers are not expected to tolerate. */
__abortlike
static void
__vm_map_delete_failed_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	kern_return_t           kr)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
	    map, (uint64_t)start, (uint64_t)end, kr);
}
7852 
/* Panic helper: vm_map_delete() on the kernel pmap found no map entry
 * at address "where" within the requested [start, end) range. */
__abortlike
static void
__vm_map_delete_gap_panic(
	vm_map_t                map,
	vm_map_offset_t         where,
	vm_map_offset_t         start,
	vm_map_offset_t         end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
}
7864 
/* Panic helper: an attempt was made to remove a "permanent" VM map
 * entry from the kernel map or one of its submaps, which is never
 * allowed (even with VM_MAP_REMOVE_IMMUTABLE). */
__abortlike
static void
__vm_map_delete_permanent_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_map_entry_t          entry)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): "
	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
	    map, (uint64_t)start, (uint64_t)end, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end);
}
7879 
/*
 * State flags threaded through vm_map_delete()'s main loop.
 */
__options_decl(vm_map_delete_state_t, uint32_t, {
	VMDS_NONE               = 0x0000,

	/* a hole was found in the requested range (reported via EXC_GUARD) */
	VMDS_FOUND_GAP          = 0x0001,
	/* gaps are tolerated: map is terminated or has no remaining refs */
	VMDS_GAPS_OK            = 0x0002,

	/* map's pmap is the kernel pmap: errors panic instead of returning */
	VMDS_KERNEL_PMAP        = 0x0004,
	/* the map lock was dropped; re-lookup the entry at "s" next pass */
	VMDS_NEEDS_LOOKUP       = 0x0008,
	/* we cleared some entry's needs_wakeup; wake waiters before sleeping/returning */
	VMDS_NEEDS_WAKEUP       = 0x0010,
});
7890 
7891 /*
7892  *	vm_map_delete:	[ internal use only ]
7893  *
7894  *	Deallocates the given address range from the target map.
7895  *	Removes all user wirings. Unwires one kernel wiring if
7896  *	VM_MAP_REMOVE_KUNWIRE is set.  Waits for kernel wirings to go
7897  *	away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set.  Sleeps
7898  *	interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
7899  *
7900  *
7901  *	When the map is a kernel map, then any error in removing mappings
7902  *	will lead to a panic so that clients do not have to repeat the panic
7903  *	code at each call site.  If VM_MAP_REMOVE_INTERRUPTIBLE
7904  *	is also passed, then KERN_ABORTED will not lead to a panic.
7905  *
7906  *	This routine is called with map locked and leaves map locked.
7907  */
7908 static kmem_return_t
vm_map_delete(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard,vm_map_zap_t zap_list)7909 vm_map_delete(
7910 	vm_map_t                map,
7911 	vm_map_offset_t         start,
7912 	vm_map_offset_t         end,
7913 	vmr_flags_t             flags,
7914 	kmem_guard_t            guard,
7915 	vm_map_zap_t            zap_list)
7916 {
7917 	vm_map_entry_t          entry, next;
7918 	int                     interruptible;
7919 	vm_map_offset_t         gap_start = 0;
7920 	vm_map_offset_t         clear_in_transition_end = 0;
7921 	__unused vm_map_offset_t save_start = start;
7922 	__unused vm_map_offset_t save_end = end;
7923 	vm_map_delete_state_t   state = VMDS_NONE;
7924 	kmem_return_t           ret = { };
7925 
7926 	if (vm_map_pmap(map) == kernel_pmap) {
7927 		state |= VMDS_KERNEL_PMAP;
7928 	}
7929 
7930 	if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
7931 		state |= VMDS_GAPS_OK;
7932 	}
7933 
7934 	interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
7935 	    THREAD_ABORTSAFE : THREAD_UNINT;
7936 
7937 	if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
7938 	    (start & VM_MAP_PAGE_MASK(map))) {
7939 		__vm_map_delete_misaligned_panic(map, start, end);
7940 	}
7941 
7942 	if ((state & VMDS_GAPS_OK) == 0) {
7943 		/*
7944 		 * If the map isn't terminated then all deletions must have
7945 		 * no gaps, and be within the [min, max) of the map.
7946 		 *
7947 		 * We got here without VM_MAP_RANGE_CHECK() being called,
7948 		 * and hence must validate bounds manually.
7949 		 *
7950 		 * It is worth noting that because vm_deallocate() will
7951 		 * round_page() the deallocation size, it's possible for "end"
7952 		 * to be 0 here due to overflow. We hence must treat it as being
7953 		 * beyond vm_map_max(map).
7954 		 *
7955 		 * Similarly, end < start means some wrap around happend,
7956 		 * which should cause an error or panic.
7957 		 */
7958 		if (end == 0 || end > vm_map_max(map)) {
7959 			state |= VMDS_FOUND_GAP;
7960 			gap_start = vm_map_max(map);
7961 			if (state & VMDS_KERNEL_PMAP) {
7962 				__vm_map_delete_gap_panic(map,
7963 				    gap_start, start, end);
7964 			}
7965 			goto out;
7966 		}
7967 
7968 		if (end < start) {
7969 			if (state & VMDS_KERNEL_PMAP) {
7970 				__vm_map_delete_gap_panic(map,
7971 				    vm_map_max(map), start, end);
7972 			}
7973 			ret.kmr_return = KERN_INVALID_ARGUMENT;
7974 			goto out;
7975 		}
7976 
7977 		if (start < vm_map_min(map)) {
7978 			state |= VMDS_FOUND_GAP;
7979 			gap_start = start;
7980 			if (state & VMDS_KERNEL_PMAP) {
7981 				__vm_map_delete_gap_panic(map,
7982 				    gap_start, start, end);
7983 			}
7984 			goto out;
7985 		}
7986 	} else {
7987 		/*
7988 		 * If the map is terminated, we must accept start/end
7989 		 * being beyond the boundaries of the map as this is
7990 		 * how some of the mappings like commpage mappings
7991 		 * can be destroyed (they're outside of those bounds).
7992 		 *
7993 		 * end < start is still something we can't cope with,
7994 		 * so just bail.
7995 		 */
7996 		if (end < start) {
7997 			goto out;
7998 		}
7999 	}
8000 
8001 
8002 	/*
8003 	 *	Find the start of the region.
8004 	 *
8005 	 *	If in a superpage, extend the range
8006 	 *	to include the start of the mapping.
8007 	 */
8008 	while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8009 		if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8010 			start = SUPERPAGE_ROUND_DOWN(start);
8011 		} else {
8012 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8013 			break;
8014 		}
8015 	}
8016 
8017 	if (entry->superpage_size) {
8018 		end = SUPERPAGE_ROUND_UP(end);
8019 	}
8020 
8021 	/*
8022 	 *	Step through all entries in this region
8023 	 */
8024 	for (vm_map_offset_t s = start; s < end;) {
8025 		/*
8026 		 * At this point, we have deleted all the memory entries
8027 		 * in [start, s) and are proceeding with the [s, end) range.
8028 		 *
8029 		 * This loop might drop the map lock, and it is possible that
8030 		 * some memory was already reallocated within [start, s)
8031 		 * and we don't want to mess with those entries.
8032 		 *
8033 		 * Some of those entries could even have been re-assembled
8034 		 * with an entry after "s" (in vm_map_simplify_entry()), so
8035 		 * we may have to vm_map_clip_start() again.
8036 		 *
8037 		 * When clear_in_transition_end is set, the we had marked
8038 		 * [start, clear_in_transition_end) as "in_transition"
8039 		 * during a previous iteration and we need to clear it.
8040 		 */
8041 
8042 		/*
8043 		 * Step 1: If needed (because we dropped locks),
8044 		 *         lookup the entry again.
8045 		 *
8046 		 *         If we're coming back from unwiring (Step 5),
8047 		 *         we also need to mark the entries as no longer
8048 		 *         in transition after that.
8049 		 */
8050 
8051 		if (state & VMDS_NEEDS_LOOKUP) {
8052 			state &= ~VMDS_NEEDS_LOOKUP;
8053 
8054 			if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8055 				SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8056 			}
8057 		}
8058 
8059 		if (clear_in_transition_end) {
8060 			for (vm_map_entry_t it = entry;
8061 			    it != vm_map_to_entry(map) &&
8062 			    it->vme_start < clear_in_transition_end;
8063 			    it = it->vme_next) {
8064 				assert(it->in_transition);
8065 				it->in_transition = FALSE;
8066 				if (it->needs_wakeup) {
8067 					it->needs_wakeup = FALSE;
8068 					state |= VMDS_NEEDS_WAKEUP;
8069 				}
8070 			}
8071 
8072 			clear_in_transition_end = 0;
8073 		}
8074 
8075 
8076 		/*
8077 		 * Step 2: Perform various policy checks
8078 		 *         before we do _anything_ to this entry.
8079 		 */
8080 
8081 		if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8082 			if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8083 				/*
8084 				 * Either we found a gap already,
8085 				 * or we are tearing down a map,
8086 				 * keep going.
8087 				 */
8088 			} else if (state & VMDS_KERNEL_PMAP) {
8089 				__vm_map_delete_gap_panic(map, s, start, end);
8090 			} else if (vm_map_round_page(s, VM_MAP_PAGE_MASK(map)) < end) {
8091 				/*
8092 				 * The vm_map_round_page() is needed since an entry
8093 				 * can be less than VM_MAP_PAGE_MASK() sized.
8094 				 *
8095 				 * For example, devices which have h/w 4K pages,
8096 				 * but entry sizes are all now 16K.
8097 				 */
8098 				state |= VMDS_FOUND_GAP;
8099 				gap_start = s;
8100 			}
8101 
8102 			if (entry == vm_map_to_entry(map) ||
8103 			    end <= entry->vme_start) {
8104 				break;
8105 			}
8106 
8107 			s = entry->vme_start;
8108 		}
8109 
8110 		if (state & VMDS_KERNEL_PMAP) {
8111 			/*
8112 			 * In the kernel map and its submaps,
8113 			 * permanent entries never die, even
8114 			 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8115 			 */
8116 			if (entry->vme_permanent) {
8117 				__vm_map_delete_permanent_panic(map, start, end, entry);
8118 			}
8119 
8120 			if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8121 				end = entry->vme_end;
8122 				flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8123 			}
8124 
8125 			/*
8126 			 * In the kernel map and its submaps,
8127 			 * the removal of an atomic/guarded entry is strict.
8128 			 *
8129 			 * An atomic entry is processed only if it was
8130 			 * specifically targeted.
8131 			 *
8132 			 * We might have deleted non-atomic entries before
8133 			 * we reach this this point however...
8134 			 */
8135 			kmem_entry_validate_guard(map, entry,
8136 			    start, end - start, guard);
8137 		}
8138 
8139 		/*
8140 		 * Step 2.1: handle "permanent" and "submap" entries
8141 		 * *before* clipping to avoid triggering some unnecessary
8142 		 * un-nesting of the shared region.
8143 		 */
8144 		if (entry->vme_permanent && entry->is_sub_map) {
8145 //			printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8146 			/*
8147 			 * Un-mapping a "permanent" mapping of a user-space
8148 			 * submap is not allowed unless...
8149 			 */
8150 			if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8151 				/*
8152 				 * a. explicitly requested by the kernel caller.
8153 				 */
8154 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8155 			} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8156 			    developer_mode_state()) {
8157 				/*
8158 				 * b. we're in "developer" mode (for
8159 				 *    breakpoints, dtrace probes, ...).
8160 				 */
8161 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8162 			} else if (map->terminated) {
8163 				/*
8164 				 * c. this is the final address space cleanup.
8165 				 */
8166 //				printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8167 			} else {
8168 				vm_map_offset_t submap_start, submap_end;
8169 				kern_return_t submap_kr;
8170 
8171 				/*
8172 				 * Check if there are any "permanent" mappings
8173 				 * in this range in the submap.
8174 				 */
8175 				if (entry->in_transition) {
8176 					/* can that even happen ? */
8177 					goto in_transition;
8178 				}
8179 				/* compute the clipped range in the submap */
8180 				submap_start = s - entry->vme_start;
8181 				submap_start += VME_OFFSET(entry);
8182 				submap_end = end - entry->vme_start;
8183 				submap_end += VME_OFFSET(entry);
8184 				submap_kr = vm_map_delete_submap_recurse(
8185 					VME_SUBMAP(entry),
8186 					submap_start,
8187 					submap_end);
8188 				if (submap_kr != KERN_SUCCESS) {
8189 					/*
8190 					 * There are some "permanent" mappings
8191 					 * in the submap: we are not allowed
8192 					 * to remove this range.
8193 					 */
8194 					printf("%d[%s] removing permanent submap entry "
8195 					    "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8196 					    proc_selfpid(),
8197 					    (get_bsdtask_info(current_task())
8198 					    ? proc_name_address(get_bsdtask_info(current_task()))
8199 					    : "?"), entry,
8200 					    (uint64_t)entry->vme_start,
8201 					    (uint64_t)entry->vme_end,
8202 					    entry->protection,
8203 					    entry->max_protection);
8204 					DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8205 					    vm_map_entry_t, entry,
8206 					    vm_map_offset_t, entry->vme_start,
8207 					    vm_map_offset_t, entry->vme_end,
8208 					    vm_prot_t, entry->protection,
8209 					    vm_prot_t, entry->max_protection,
8210 					    int, VME_ALIAS(entry));
8211 					ret.kmr_return = KERN_PROTECTION_FAILURE;
8212 					goto out;
8213 				}
8214 				/* no permanent mappings: proceed */
8215 			}
8216 		}
8217 
8218 		/*
8219 		 * Step 3: Perform any clipping needed.
8220 		 *
8221 		 *         After this, "entry" starts at "s", ends before "end"
8222 		 */
8223 
8224 		if (entry->vme_start < s) {
8225 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8226 			    entry->map_aligned &&
8227 			    !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8228 				/*
8229 				 * The entry will no longer be map-aligned
8230 				 * after clipping and the caller said it's OK.
8231 				 */
8232 				entry->map_aligned = FALSE;
8233 			}
8234 			vm_map_clip_start(map, entry, s);
8235 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8236 		}
8237 
8238 		if (end < entry->vme_end) {
8239 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8240 			    entry->map_aligned &&
8241 			    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8242 				/*
8243 				 * The entry will no longer be map-aligned
8244 				 * after clipping and the caller said it's OK.
8245 				 */
8246 				entry->map_aligned = FALSE;
8247 			}
8248 			vm_map_clip_end(map, entry, end);
8249 		}
8250 
8251 		if (entry->vme_permanent && entry->is_sub_map) {
8252 			/*
8253 			 * We already went through step 2.1 which did not deny
8254 			 * the removal of this "permanent" and "is_sub_map"
8255 			 * entry.
8256 			 * Now that we've clipped what we actually want to
8257 			 * delete, undo the "permanent" part to allow the
8258 			 * removal to proceed.
8259 			 */
8260 			DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8261 			    vm_map_entry_t, entry,
8262 			    vm_map_offset_t, entry->vme_start,
8263 			    vm_map_offset_t, entry->vme_end,
8264 			    vm_prot_t, entry->protection,
8265 			    vm_prot_t, entry->max_protection,
8266 			    int, VME_ALIAS(entry));
8267 			entry->vme_permanent = false;
8268 		}
8269 
8270 		assert(s == entry->vme_start);
8271 		assert(entry->vme_end <= end);
8272 
8273 
8274 		/*
8275 		 * Step 4: If the entry is in flux, wait for this to resolve.
8276 		 */
8277 
8278 		if (entry->in_transition) {
8279 			wait_result_t wait_result;
8280 
8281 in_transition:
8282 			/*
8283 			 * Another thread is wiring/unwiring this entry.
8284 			 * Let the other thread know we are waiting.
8285 			 */
8286 
8287 			entry->needs_wakeup = TRUE;
8288 
8289 			/*
8290 			 * wake up anybody waiting on entries that we have
8291 			 * already unwired/deleted.
8292 			 */
8293 			if (state & VMDS_NEEDS_WAKEUP) {
8294 				vm_map_entry_wakeup(map);
8295 				state &= ~VMDS_NEEDS_WAKEUP;
8296 			}
8297 
8298 			wait_result = vm_map_entry_wait(map, interruptible);
8299 
8300 			if (interruptible &&
8301 			    wait_result == THREAD_INTERRUPTED) {
8302 				/*
8303 				 * We do not clear the needs_wakeup flag,
8304 				 * since we cannot tell if we were the only one.
8305 				 */
8306 				ret.kmr_return = KERN_ABORTED;
8307 				return ret;
8308 			}
8309 
8310 			/*
8311 			 * The entry could have been clipped or it
8312 			 * may not exist anymore.  Look it up again.
8313 			 */
8314 			state |= VMDS_NEEDS_LOOKUP;
8315 			continue;
8316 		}
8317 
8318 
8319 		/*
8320 		 * Step 5: Handle wiring
8321 		 */
8322 
8323 		if (entry->wired_count) {
8324 			struct vm_map_entry tmp_entry;
8325 			boolean_t           user_wire;
8326 			unsigned int        last_timestamp;
8327 
8328 			user_wire = entry->user_wired_count > 0;
8329 
8330 			/*
8331 			 *      Remove a kernel wiring if requested
8332 			 */
8333 			if (flags & VM_MAP_REMOVE_KUNWIRE) {
8334 				entry->wired_count--;
8335 			}
8336 
8337 			/*
8338 			 *	Remove all user wirings for proper accounting
8339 			 */
8340 			while (entry->user_wired_count) {
8341 				subtract_wire_counts(map, entry, user_wire);
8342 			}
8343 
8344 			/*
8345 			 * All our DMA I/O operations in IOKit are currently
8346 			 * done by wiring through the map entries of the task
8347 			 * requesting the I/O.
8348 			 *
8349 			 * Because of this, we must always wait for kernel wirings
8350 			 * to go away on the entries before deleting them.
8351 			 *
8352 			 * Any caller who wants to actually remove a kernel wiring
8353 			 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8354 			 * properly remove one wiring instead of blasting through
8355 			 * them all.
8356 			 */
8357 			if (entry->wired_count != 0) {
8358 				assert(map != kernel_map);
8359 				/*
8360 				 * Cannot continue.  Typical case is when
8361 				 * a user thread has physical io pending on
8362 				 * on this page.  Either wait for the
8363 				 * kernel wiring to go away or return an
8364 				 * error.
8365 				 */
8366 				wait_result_t wait_result;
8367 
8368 				entry->needs_wakeup = TRUE;
8369 				wait_result = vm_map_entry_wait(map,
8370 				    interruptible);
8371 
8372 				if (interruptible &&
8373 				    wait_result == THREAD_INTERRUPTED) {
8374 					/*
8375 					 * We do not clear the
8376 					 * needs_wakeup flag, since we
8377 					 * cannot tell if we were the
8378 					 * only one.
8379 					 */
8380 					ret.kmr_return = KERN_ABORTED;
8381 					return ret;
8382 				}
8383 
8384 
8385 				/*
8386 				 * The entry could have been clipped or
8387 				 * it may not exist anymore.  Look it
8388 				 * up again.
8389 				 */
8390 				state |= VMDS_NEEDS_LOOKUP;
8391 				continue;
8392 			}
8393 
8394 			/*
8395 			 * We can unlock the map now.
8396 			 *
8397 			 * The entry might be split once we unlock the map,
8398 			 * but we need the range as defined by this entry
8399 			 * to be stable. So we must make a local copy.
8400 			 *
8401 			 * The underlying objects do not change during clips,
8402 			 * and the in_transition state guarentees existence
8403 			 * of the entry.
8404 			 */
8405 			last_timestamp = map->timestamp;
8406 			entry->in_transition = TRUE;
8407 			tmp_entry = *entry;
8408 			vm_map_unlock(map);
8409 
8410 			if (tmp_entry.is_sub_map) {
8411 				vm_map_t sub_map;
8412 				vm_map_offset_t sub_start, sub_end;
8413 				pmap_t pmap;
8414 				vm_map_offset_t pmap_addr;
8415 
8416 
8417 				sub_map = VME_SUBMAP(&tmp_entry);
8418 				sub_start = VME_OFFSET(&tmp_entry);
8419 				sub_end = sub_start + (tmp_entry.vme_end -
8420 				    tmp_entry.vme_start);
8421 				if (tmp_entry.use_pmap) {
8422 					pmap = sub_map->pmap;
8423 					pmap_addr = tmp_entry.vme_start;
8424 				} else {
8425 					pmap = map->pmap;
8426 					pmap_addr = tmp_entry.vme_start;
8427 				}
8428 				(void) vm_map_unwire_nested(sub_map,
8429 				    sub_start, sub_end,
8430 				    user_wire,
8431 				    pmap, pmap_addr);
8432 			} else {
8433 				if (tmp_entry.vme_kernel_object) {
8434 					pmap_protect_options(
8435 						map->pmap,
8436 						tmp_entry.vme_start,
8437 						tmp_entry.vme_end,
8438 						VM_PROT_NONE,
8439 						PMAP_OPTIONS_REMOVE,
8440 						NULL);
8441 				}
8442 				vm_fault_unwire(map, &tmp_entry,
8443 				    tmp_entry.vme_kernel_object,
8444 				    map->pmap, tmp_entry.vme_start);
8445 			}
8446 
8447 			vm_map_lock(map);
8448 
8449 			/*
8450 			 * Unwiring happened, we can now go back to deleting
8451 			 * them (after we clear the in_transition bit for the range).
8452 			 */
8453 			if (last_timestamp + 1 != map->timestamp) {
8454 				state |= VMDS_NEEDS_LOOKUP;
8455 			}
8456 			clear_in_transition_end = tmp_entry.vme_end;
8457 			continue;
8458 		}
8459 
8460 		assert(entry->wired_count == 0);
8461 		assert(entry->user_wired_count == 0);
8462 
8463 
8464 		/*
8465 		 * Step 6: Entry is unwired and ready for us to delete !
8466 		 */
8467 
8468 		if (!entry->vme_permanent) {
8469 			/*
8470 			 * Typical case: the entry really shouldn't be permanent
8471 			 */
8472 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8473 		    (entry->protection & VM_PROT_EXECUTE) &&
8474 		    developer_mode_state()) {
8475 			/*
8476 			 * Allow debuggers to undo executable mappings
8477 			 * when developer mode is on.
8478 			 */
8479 #if 0
8480 			printf("FBDP %d[%s] removing permanent executable entry "
8481 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8482 			    proc_selfpid(),
8483 			    (current_task()->bsd_info
8484 			    ? proc_name_address(current_task()->bsd_info)
8485 			    : "?"), entry,
8486 			    (uint64_t)entry->vme_start,
8487 			    (uint64_t)entry->vme_end,
8488 			    entry->protection,
8489 			    entry->max_protection);
8490 #endif
8491 			entry->vme_permanent = FALSE;
8492 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8493 #if 0
8494 			printf("FBDP %d[%s] removing permanent entry "
8495 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8496 			    proc_selfpid(),
8497 			    (current_task()->bsd_info
8498 			    ? proc_name_address(current_task()->bsd_info)
8499 			    : "?"), entry,
8500 			    (uint64_t)entry->vme_start,
8501 			    (uint64_t)entry->vme_end,
8502 			    entry->protection,
8503 			    entry->max_protection);
8504 #endif
8505 			entry->vme_permanent = FALSE;
8506 		} else {
8507 			DTRACE_VM6(vm_map_delete_permanent,
8508 			    vm_map_entry_t, entry,
8509 			    vm_map_offset_t, entry->vme_start,
8510 			    vm_map_offset_t, entry->vme_end,
8511 			    vm_prot_t, entry->protection,
8512 			    vm_prot_t, entry->max_protection,
8513 			    int, VME_ALIAS(entry));
8514 		}
8515 
8516 		if (entry->is_sub_map) {
8517 			assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8518 			    "map %p (%d) entry %p submap %p (%d)\n",
8519 			    map, VM_MAP_PAGE_SHIFT(map), entry,
8520 			    VME_SUBMAP(entry),
8521 			    VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8522 			if (entry->use_pmap) {
8523 #ifndef NO_NESTED_PMAP
8524 				int pmap_flags;
8525 
8526 				if (map->terminated) {
8527 					/*
8528 					 * This is the final cleanup of the
8529 					 * address space being terminated.
8530 					 * No new mappings are expected and
8531 					 * we don't really need to unnest the
8532 					 * shared region (and lose the "global"
8533 					 * pmap mappings, if applicable).
8534 					 *
8535 					 * Tell the pmap layer that we're
8536 					 * "clean" wrt nesting.
8537 					 */
8538 					pmap_flags = PMAP_UNNEST_CLEAN;
8539 				} else {
8540 					/*
8541 					 * We're unmapping part of the nested
8542 					 * shared region, so we can't keep the
8543 					 * nested pmap.
8544 					 */
8545 					pmap_flags = 0;
8546 				}
8547 				pmap_unnest_options(
8548 					map->pmap,
8549 					(addr64_t)entry->vme_start,
8550 					entry->vme_end - entry->vme_start,
8551 					pmap_flags);
8552 #endif  /* NO_NESTED_PMAP */
8553 				if (map->mapped_in_other_pmaps &&
8554 				    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8555 					/* clean up parent map/maps */
8556 					vm_map_submap_pmap_clean(
8557 						map, entry->vme_start,
8558 						entry->vme_end,
8559 						VME_SUBMAP(entry),
8560 						VME_OFFSET(entry));
8561 				}
8562 			} else {
8563 				vm_map_submap_pmap_clean(
8564 					map, entry->vme_start, entry->vme_end,
8565 					VME_SUBMAP(entry),
8566 					VME_OFFSET(entry));
8567 			}
8568 		} else if (entry->vme_kernel_object ||
8569 		    VME_OBJECT(entry) == compressor_object) {
8570 			/*
8571 			 * nothing to do
8572 			 */
8573 		} else if (map->mapped_in_other_pmaps &&
8574 		    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8575 			vm_object_pmap_protect_options(
8576 				VME_OBJECT(entry), VME_OFFSET(entry),
8577 				entry->vme_end - entry->vme_start,
8578 				PMAP_NULL,
8579 				PAGE_SIZE,
8580 				entry->vme_start,
8581 				VM_PROT_NONE,
8582 				PMAP_OPTIONS_REMOVE);
8583 		} else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8584 		    (state & VMDS_KERNEL_PMAP)) {
8585 			/* Remove translations associated
8586 			 * with this range unless the entry
8587 			 * does not have an object, or
8588 			 * it's the kernel map or a descendant
8589 			 * since the platform could potentially
8590 			 * create "backdoor" mappings invisible
8591 			 * to the VM. It is expected that
8592 			 * objectless, non-kernel ranges
8593 			 * do not have such VM invisible
8594 			 * translations.
8595 			 */
8596 			pmap_remove_options(map->pmap,
8597 			    (addr64_t)entry->vme_start,
8598 			    (addr64_t)entry->vme_end,
8599 			    PMAP_OPTIONS_REMOVE);
8600 		}
8601 
8602 #if DEBUG
8603 		/*
8604 		 * All pmap mappings for this map entry must have been
8605 		 * cleared by now.
8606 		 */
8607 		assert(pmap_is_empty(map->pmap,
8608 		    entry->vme_start,
8609 		    entry->vme_end));
8610 #endif /* DEBUG */
8611 
8612 		if (entry->iokit_acct) {
8613 			/* alternate accounting */
8614 			DTRACE_VM4(vm_map_iokit_unmapped_region,
8615 			    vm_map_t, map,
8616 			    vm_map_offset_t, entry->vme_start,
8617 			    vm_map_offset_t, entry->vme_end,
8618 			    int, VME_ALIAS(entry));
8619 			vm_map_iokit_unmapped_region(map,
8620 			    (entry->vme_end -
8621 			    entry->vme_start));
8622 			entry->iokit_acct = FALSE;
8623 			entry->use_pmap = FALSE;
8624 		}
8625 
8626 		s = entry->vme_end;
8627 		next = entry->vme_next;
8628 		ret.kmr_size += entry->vme_end - entry->vme_start;
8629 
8630 		if (entry->vme_permanent) {
8631 			/*
8632 			 * A permanent entry can not be removed, so leave it
8633 			 * in place but remove all access permissions.
8634 			 */
8635 			if (!entry->pmap_cs_associated) {
8636 				printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8637 				    __FUNCTION__, __LINE__,
8638 				    proc_selfpid(),
8639 				    (get_bsdtask_info(current_task())
8640 				    ? proc_name_address(get_bsdtask_info(current_task()))
8641 				    : "?"),
8642 				    map,
8643 				    entry,
8644 				    (uint64_t)entry->vme_start,
8645 				    (uint64_t)entry->vme_end,
8646 				    entry->is_sub_map,
8647 				    entry->protection,
8648 				    entry->max_protection);
8649 			}
8650 			DTRACE_VM6(vm_map_delete_permanent_prot_none,
8651 			    vm_map_entry_t, entry,
8652 			    vm_map_offset_t, entry->vme_start,
8653 			    vm_map_offset_t, entry->vme_end,
8654 			    vm_prot_t, entry->protection,
8655 			    vm_prot_t, entry->max_protection,
8656 			    int, VME_ALIAS(entry));
8657 			entry->protection = VM_PROT_NONE;
8658 			entry->max_protection = VM_PROT_NONE;
8659 		} else {
8660 			vm_map_entry_zap(map, entry, zap_list);
8661 		}
8662 
8663 		entry = next;
8664 
8665 		if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8666 			unsigned int last_timestamp = map->timestamp++;
8667 
8668 			if (lck_rw_lock_yield_exclusive(&map->lock,
8669 			    LCK_RW_YIELD_ANY_WAITER)) {
8670 				if (last_timestamp != map->timestamp + 1) {
8671 					state |= VMDS_NEEDS_LOOKUP;
8672 				}
8673 			} else {
8674 				/* we didn't yield, undo our change */
8675 				map->timestamp--;
8676 			}
8677 		}
8678 	}
8679 
8680 	if (map->wait_for_space) {
8681 		thread_wakeup((event_t) map);
8682 	}
8683 
8684 	if (state & VMDS_NEEDS_WAKEUP) {
8685 		vm_map_entry_wakeup(map);
8686 	}
8687 
8688 out:
8689 	if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8690 		__vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8691 	}
8692 
8693 	if (state & VMDS_FOUND_GAP) {
8694 		DTRACE_VM3(kern_vm_deallocate_gap,
8695 		    vm_map_offset_t, gap_start,
8696 		    vm_map_offset_t, save_start,
8697 		    vm_map_offset_t, save_end);
8698 		if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8699 			ret.kmr_return = KERN_INVALID_VALUE;
8700 		} else {
8701 			vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8702 		}
8703 	}
8704 
8705 	return ret;
8706 }
8707 
8708 kmem_return_t
vm_map_remove_and_unlock(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8709 vm_map_remove_and_unlock(
8710 	vm_map_t        map,
8711 	vm_map_offset_t start,
8712 	vm_map_offset_t end,
8713 	vmr_flags_t     flags,
8714 	kmem_guard_t    guard)
8715 {
8716 	kmem_return_t ret;
8717 	VM_MAP_ZAP_DECLARE(zap);
8718 
8719 	ret = vm_map_delete(map, start, end, flags, guard, &zap);
8720 	vm_map_unlock(map);
8721 
8722 	vm_map_zap_dispose(&zap);
8723 
8724 	return ret;
8725 }
8726 
8727 /*
8728  *	vm_map_remove_guard:
8729  *
8730  *	Remove the given address range from the target map.
8731  *	This is the exported form of vm_map_delete.
8732  */
8733 kmem_return_t
vm_map_remove_guard(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8734 vm_map_remove_guard(
8735 	vm_map_t        map,
8736 	vm_map_offset_t start,
8737 	vm_map_offset_t end,
8738 	vmr_flags_t     flags,
8739 	kmem_guard_t    guard)
8740 {
8741 	vm_map_lock(map);
8742 	return vm_map_remove_and_unlock(map, start, end, flags, guard);
8743 }
8744 
8745 /*
8746  *	vm_map_terminate:
8747  *
8748  *	Clean out a task's map.
8749  */
8750 kern_return_t
vm_map_terminate(vm_map_t map)8751 vm_map_terminate(
8752 	vm_map_t        map)
8753 {
8754 	vm_map_lock(map);
8755 	map->terminated = TRUE;
8756 	vm_map_disable_hole_optimization(map);
8757 	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
8758 	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
8759 	return KERN_SUCCESS;
8760 }
8761 
8762 /*
8763  *	Routine:	vm_map_copy_allocate
8764  *
8765  *	Description:
8766  *		Allocates and initializes a map copy object.
8767  */
8768 static vm_map_copy_t
vm_map_copy_allocate(void)8769 vm_map_copy_allocate(void)
8770 {
8771 	vm_map_copy_t new_copy;
8772 
8773 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
8774 	new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
8775 	vm_map_copy_first_entry(new_copy) = vm_map_copy_to_entry(new_copy);
8776 	vm_map_copy_last_entry(new_copy) = vm_map_copy_to_entry(new_copy);
8777 	return new_copy;
8778 }
8779 
8780 /*
8781  *	Routine:	vm_map_copy_discard
8782  *
8783  *	Description:
8784  *		Dispose of a map copy object (returned by
8785  *		vm_map_copyin).
8786  */
8787 void
vm_map_copy_discard(vm_map_copy_t copy)8788 vm_map_copy_discard(
8789 	vm_map_copy_t   copy)
8790 {
8791 	if (copy == VM_MAP_COPY_NULL) {
8792 		return;
8793 	}
8794 
8795 	/*
8796 	 * Assert that the vm_map_copy is coming from the right
8797 	 * zone and hasn't been forged
8798 	 */
8799 	vm_map_copy_require(copy);
8800 
8801 	switch (copy->type) {
8802 	case VM_MAP_COPY_ENTRY_LIST:
8803 		while (vm_map_copy_first_entry(copy) !=
8804 		    vm_map_copy_to_entry(copy)) {
8805 			vm_map_entry_t  entry = vm_map_copy_first_entry(copy);
8806 
8807 			vm_map_copy_entry_unlink(copy, entry);
8808 			if (entry->is_sub_map) {
8809 				vm_map_deallocate(VME_SUBMAP(entry));
8810 			} else {
8811 				vm_object_deallocate(VME_OBJECT(entry));
8812 			}
8813 			vm_map_copy_entry_dispose(entry);
8814 		}
8815 		break;
8816 	case VM_MAP_COPY_OBJECT:
8817 		vm_object_deallocate(copy->cpy_object);
8818 		break;
8819 	case VM_MAP_COPY_KERNEL_BUFFER:
8820 
8821 		/*
8822 		 * The vm_map_copy_t and possibly the data buffer were
8823 		 * allocated by a single call to kalloc_data(), i.e. the
8824 		 * vm_map_copy_t was not allocated out of the zone.
8825 		 */
8826 		if (copy->size > msg_ool_size_small || copy->offset) {
8827 			panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
8828 			    (long long)copy->size, (long long)copy->offset);
8829 		}
8830 		kfree_data(copy->cpy_kdata, copy->size);
8831 	}
8832 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
8833 }
8834 
8835 /*
8836  *	Routine:	vm_map_copy_copy
8837  *
8838  *	Description:
8839  *			Move the information in a map copy object to
8840  *			a new map copy object, leaving the old one
8841  *			empty.
8842  *
8843  *			This is used by kernel routines that need
8844  *			to look at out-of-line data (in copyin form)
8845  *			before deciding whether to return SUCCESS.
8846  *			If the routine returns FAILURE, the original
8847  *			copy object will be deallocated; therefore,
8848  *			these routines must make a copy of the copy
8849  *			object and leave the original empty so that
8850  *			deallocation will not fail.
8851  */
8852 vm_map_copy_t
vm_map_copy_copy(vm_map_copy_t copy)8853 vm_map_copy_copy(
8854 	vm_map_copy_t   copy)
8855 {
8856 	vm_map_copy_t   new_copy;
8857 
8858 	if (copy == VM_MAP_COPY_NULL) {
8859 		return VM_MAP_COPY_NULL;
8860 	}
8861 
8862 	/*
8863 	 * Assert that the vm_map_copy is coming from the right
8864 	 * zone and hasn't been forged
8865 	 */
8866 	vm_map_copy_require(copy);
8867 
8868 	/*
8869 	 * Allocate a new copy object, and copy the information
8870 	 * from the old one into it.
8871 	 */
8872 
8873 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8874 	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
8875 #if __has_feature(ptrauth_calls)
8876 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
8877 		new_copy->cpy_kdata = copy->cpy_kdata;
8878 	}
8879 #endif
8880 
8881 	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
8882 		/*
8883 		 * The links in the entry chain must be
8884 		 * changed to point to the new copy object.
8885 		 */
8886 		vm_map_copy_first_entry(copy)->vme_prev
8887 		        = vm_map_copy_to_entry(new_copy);
8888 		vm_map_copy_last_entry(copy)->vme_next
8889 		        = vm_map_copy_to_entry(new_copy);
8890 	}
8891 
8892 	/*
8893 	 * Change the old copy object into one that contains
8894 	 * nothing to be deallocated.
8895 	 */
8896 	copy->type = VM_MAP_COPY_OBJECT;
8897 	copy->cpy_object = VM_OBJECT_NULL;
8898 
8899 	/*
8900 	 * Return the new object.
8901 	 */
8902 	return new_copy;
8903 }
8904 
8905 static boolean_t
vm_map_entry_is_overwritable(vm_map_t dst_map __unused,vm_map_entry_t entry)8906 vm_map_entry_is_overwritable(
8907 	vm_map_t        dst_map __unused,
8908 	vm_map_entry_t  entry)
8909 {
8910 	if (!(entry->protection & VM_PROT_WRITE)) {
8911 		/* can't overwrite if not writable */
8912 		return FALSE;
8913 	}
8914 #if !__x86_64__
8915 	if (entry->used_for_jit &&
8916 	    vm_map_cs_enforcement(dst_map) &&
8917 	    !dst_map->cs_debugged) {
8918 		/*
8919 		 * Can't overwrite a JIT region while cs_enforced
8920 		 * and not cs_debugged.
8921 		 */
8922 		return FALSE;
8923 	}
8924 
8925 #if __arm64e__
8926 	/* Do not allow overwrite HW assisted TPRO entries */
8927 	if (entry->used_for_tpro) {
8928 		return FALSE;
8929 	}
8930 #endif /* __arm64e__ */
8931 
8932 	if (entry->vme_permanent) {
8933 		if (entry->is_sub_map) {
8934 			/*
8935 			 * We can't tell if the submap contains "permanent"
8936 			 * entries within the range targeted by the caller.
8937 			 * The caller will have to check for that with
8938 			 * vm_map_overwrite_submap_recurse() for example.
8939 			 */
8940 		} else {
8941 			/*
8942 			 * Do not allow overwriting of a "permanent"
8943 			 * entry.
8944 			 */
8945 			DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
8946 			    vm_map_entry_t, entry,
8947 			    vm_map_offset_t, entry->vme_start,
8948 			    vm_map_offset_t, entry->vme_end,
8949 			    vm_prot_t, entry->protection,
8950 			    vm_prot_t, entry->max_protection,
8951 			    int, VME_ALIAS(entry));
8952 			return FALSE;
8953 		}
8954 	}
8955 #endif /* !__x86_64__ */
8956 	return TRUE;
8957 }
8958 
/*
 *	vm_map_overwrite_submap_recurse:
 *
 *	Verify that the range [dst_addr, dst_addr + dst_size) of
 *	"dst_map" can be overwritten: every entry must be writable,
 *	overwritable, and the range contiguous.  Submap entries are
 *	checked by recursing into the submap.  Called with "dst_map"
 *	unlocked; takes and releases the map lock internally (the
 *	lock is dropped across recursion and in-transition waits,
 *	so the scan may restart at "start_pass_1").
 */
static kern_return_t
vm_map_overwrite_submap_recurse(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_size_t   dst_size)
{
	vm_map_offset_t dst_end;
	vm_map_entry_t  tmp_entry;
	vm_map_entry_t  entry;
	kern_return_t   result;
	boolean_t       encountered_sub_map = FALSE;



	/*
	 *	Verify that the destination is all writeable
	 *	initially.  We have to trunc the destination
	 *	address and round the copy size or we'll end up
	 *	splitting entries in strange ways.
	 */

	dst_end = vm_map_round_page(dst_addr + dst_size,
	    VM_MAP_PAGE_MASK(dst_map));
	vm_map_lock(dst_map);

start_pass_1:
	/* restart point: the map lock was (re)taken, state may have changed */
	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}

	vm_map_clip_start(dst_map,
	    tmp_entry,
	    vm_map_trunc_page(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)));
	if (tmp_entry->is_sub_map) {
		/* clipping did unnest if needed */
		assert(!tmp_entry->use_pmap);
	}

	for (entry = tmp_entry;;) {
		vm_map_entry_t  next;

		next = entry->vme_next;
		while (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;

			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				goto start_pass_1;
			}

			encountered_sub_map = TRUE;
			/* translate the map range into submap offsets */
			sub_start = VME_OFFSET(entry);

			if (entry->vme_end < dst_end) {
				sub_end = entry->vme_end;
			} else {
				sub_end = dst_end;
			}
			sub_end -= entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			/* drop the lock across the recursion */
			vm_map_unlock(dst_map);

			result = vm_map_overwrite_submap_recurse(
				VME_SUBMAP(entry),
				sub_start,
				sub_end - sub_start);

			/* map is unlocked here, so return directly on failure */
			if (result != KERN_SUCCESS) {
				return result;
			}
			/*
			 * NOTE(review): "entry" is examined here without the
			 * map lock held — presumably safe in this context;
			 * confirm against callers' synchronization.
			 */
			if (dst_end <= entry->vme_end) {
				return KERN_SUCCESS;
			}
			/* re-take the lock and continue past this submap */
			vm_map_lock(dst_map);
			if (!vm_map_lookup_entry(dst_map, local_end,
			    &tmp_entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			entry = tmp_entry;
			next = entry->vme_next;
		}

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 *	If the entry is in transition, we must wait
		 *	for it to exit that state.  Anything could happen
		 *	when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto start_pass_1;
		}

/*
 *		our range is contained completely within this map entry
 */
		if (dst_end <= entry->vme_end) {
			vm_map_unlock(dst_map);
			return KERN_SUCCESS;
		}
/*
 *		check that range specified is contiguous region
 */
		if ((next == vm_map_to_entry(dst_map)) ||
		    (next->vme_start != entry->vme_end)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/*
		 *	Check for permanent objects in the destination.
		 */
		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
		    ((!VME_OBJECT(entry)->internal) ||
		    (VME_OBJECT(entry)->true_share))) {
			if (encountered_sub_map) {
				vm_map_unlock(dst_map);
				return KERN_FAILURE;
			}
		}


		entry = next;
	}/* for */
	/* not reached: the loop above only exits via return */
	vm_map_unlock(dst_map);
	return KERN_SUCCESS;
}
9111 
9112 /*
9113  *	Routine:	vm_map_copy_overwrite
9114  *
9115  *	Description:
9116  *		Copy the memory described by the map copy
9117  *		object (copy; returned by vm_map_copyin) onto
9118  *		the specified destination region (dst_map, dst_addr).
9119  *		The destination must be writeable.
9120  *
9121  *		Unlike vm_map_copyout, this routine actually
9122  *		writes over previously-mapped memory.  If the
9123  *		previous mapping was to a permanent (user-supplied)
9124  *		memory object, it is preserved.
9125  *
9126  *		The attributes (protection and inheritance) of the
9127  *		destination region are preserved.
9128  *
9129  *		If successful, consumes the copy object.
9130  *		Otherwise, the caller is responsible for it.
9131  *
9132  *	Implementation notes:
9133  *		To overwrite aligned temporary virtual memory, it is
9134  *		sufficient to remove the previous mapping and insert
9135  *		the new copy.  This replacement is done either on
9136  *		the whole region (if no permanent virtual memory
9137  *		objects are embedded in the destination region) or
9138  *		in individual map entries.
9139  *
 *		To overwrite permanent virtual memory, it is necessary
9141  *		to copy each page, as the external memory management
9142  *		interface currently does not provide any optimizations.
9143  *
9144  *		Unaligned memory also has to be copied.  It is possible
9145  *		to use 'vm_trickery' to copy the aligned data.  This is
9146  *		not done but not hard to implement.
9147  *
9148  *		Once a page of permanent memory has been overwritten,
9149  *		it is impossible to interrupt this function; otherwise,
9150  *		the call would be neither atomic nor location-independent.
9151  *		The kernel-state portion of a user thread must be
9152  *		interruptible.
9153  *
9154  *		It may be expensive to forward all requests that might
9155  *		overwrite permanent memory (vm_write, vm_copy) to
9156  *		uninterruptible kernel threads.  This routine may be
9157  *		called by interruptible threads; however, success is
9158  *		not guaranteed -- if the request cannot be performed
9159  *		atomically and interruptibly, an error indication is
9160  *		returned.
9161  *
9162  *		Callers of this function must call vm_map_copy_require on
9163  *		previously created vm_map_copy_t or pass a newly created
9164  *		one to ensure that it hasn't been forged.
9165  */
9166 
9167 static kern_return_t
vm_map_copy_overwrite_nested(vm_map_t dst_map,vm_map_address_t dst_addr,vm_map_copy_t copy,boolean_t interruptible,pmap_t pmap,boolean_t discard_on_success)9168 vm_map_copy_overwrite_nested(
9169 	vm_map_t                dst_map,
9170 	vm_map_address_t        dst_addr,
9171 	vm_map_copy_t           copy,
9172 	boolean_t               interruptible,
9173 	pmap_t                  pmap,
9174 	boolean_t               discard_on_success)
9175 {
9176 	vm_map_offset_t         dst_end;
9177 	vm_map_entry_t          tmp_entry;
9178 	vm_map_entry_t          entry;
9179 	kern_return_t           kr;
9180 	boolean_t               aligned = TRUE;
9181 	boolean_t               contains_permanent_objects = FALSE;
9182 	boolean_t               encountered_sub_map = FALSE;
9183 	vm_map_offset_t         base_addr;
9184 	vm_map_size_t           copy_size;
9185 	vm_map_size_t           total_size;
9186 	uint16_t                copy_page_shift;
9187 
9188 	/*
9189 	 *	Check for special kernel buffer allocated
9190 	 *	by new_ipc_kmsg_copyin.
9191 	 */
9192 
9193 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9194 		return vm_map_copyout_kernel_buffer(
9195 			dst_map, &dst_addr,
9196 			copy, copy->size, TRUE, discard_on_success);
9197 	}
9198 
9199 	/*
9200 	 *      Only works for entry lists at the moment.  Will
9201 	 *	support page lists later.
9202 	 */
9203 
9204 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9205 
9206 	if (copy->size == 0) {
9207 		if (discard_on_success) {
9208 			vm_map_copy_discard(copy);
9209 		}
9210 		return KERN_SUCCESS;
9211 	}
9212 
9213 	copy_page_shift = copy->cpy_hdr.page_shift;
9214 
9215 	/*
9216 	 *	Verify that the destination is all writeable
9217 	 *	initially.  We have to trunc the destination
9218 	 *	address and round the copy size or we'll end up
9219 	 *	splitting entries in strange ways.
9220 	 */
9221 
9222 	if (!VM_MAP_PAGE_ALIGNED(copy->size,
9223 	    VM_MAP_PAGE_MASK(dst_map)) ||
9224 	    !VM_MAP_PAGE_ALIGNED(copy->offset,
9225 	    VM_MAP_PAGE_MASK(dst_map)) ||
9226 	    !VM_MAP_PAGE_ALIGNED(dst_addr,
9227 	    VM_MAP_PAGE_MASK(dst_map)) ||
9228 	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9229 		aligned = FALSE;
9230 		dst_end = vm_map_round_page(dst_addr + copy->size,
9231 		    VM_MAP_PAGE_MASK(dst_map));
9232 	} else {
9233 		dst_end = dst_addr + copy->size;
9234 	}
9235 
9236 	vm_map_lock(dst_map);
9237 
9238 	/* LP64todo - remove this check when vm_map_commpage64()
9239 	 * no longer has to stuff in a map_entry for the commpage
9240 	 * above the map's max_offset.
9241 	 */
9242 	if (dst_addr >= dst_map->max_offset) {
9243 		vm_map_unlock(dst_map);
9244 		return KERN_INVALID_ADDRESS;
9245 	}
9246 
9247 start_pass_1:
9248 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9249 		vm_map_unlock(dst_map);
9250 		return KERN_INVALID_ADDRESS;
9251 	}
9252 	vm_map_clip_start(dst_map,
9253 	    tmp_entry,
9254 	    vm_map_trunc_page(dst_addr,
9255 	    VM_MAP_PAGE_MASK(dst_map)));
9256 	for (entry = tmp_entry;;) {
9257 		vm_map_entry_t  next = entry->vme_next;
9258 
9259 		while (entry->is_sub_map) {
9260 			vm_map_offset_t sub_start;
9261 			vm_map_offset_t sub_end;
9262 			vm_map_offset_t local_end;
9263 
9264 			if (entry->in_transition) {
9265 				/*
9266 				 * Say that we are waiting, and wait for entry.
9267 				 */
9268 				entry->needs_wakeup = TRUE;
9269 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9270 
9271 				goto start_pass_1;
9272 			}
9273 
9274 			local_end = entry->vme_end;
9275 			if (!(entry->needs_copy)) {
9276 				/* if needs_copy we are a COW submap */
9277 				/* in such a case we just replace so */
9278 				/* there is no need for the follow-  */
9279 				/* ing check.                        */
9280 				encountered_sub_map = TRUE;
9281 				sub_start = VME_OFFSET(entry);
9282 
9283 				if (entry->vme_end < dst_end) {
9284 					sub_end = entry->vme_end;
9285 				} else {
9286 					sub_end = dst_end;
9287 				}
9288 				sub_end -= entry->vme_start;
9289 				sub_end += VME_OFFSET(entry);
9290 				vm_map_unlock(dst_map);
9291 
9292 				kr = vm_map_overwrite_submap_recurse(
9293 					VME_SUBMAP(entry),
9294 					sub_start,
9295 					sub_end - sub_start);
9296 				if (kr != KERN_SUCCESS) {
9297 					return kr;
9298 				}
9299 				vm_map_lock(dst_map);
9300 			}
9301 
9302 			if (dst_end <= entry->vme_end) {
9303 				goto start_overwrite;
9304 			}
9305 			if (!vm_map_lookup_entry(dst_map, local_end,
9306 			    &entry)) {
9307 				vm_map_unlock(dst_map);
9308 				return KERN_INVALID_ADDRESS;
9309 			}
9310 			next = entry->vme_next;
9311 		}
9312 
9313 		if (!(entry->protection & VM_PROT_WRITE)) {
9314 			vm_map_unlock(dst_map);
9315 			return KERN_PROTECTION_FAILURE;
9316 		}
9317 
9318 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9319 			vm_map_unlock(dst_map);
9320 			return KERN_PROTECTION_FAILURE;
9321 		}
9322 
9323 		/*
9324 		 *	If the entry is in transition, we must wait
9325 		 *	for it to exit that state.  Anything could happen
9326 		 *	when we unlock the map, so start over.
9327 		 */
9328 		if (entry->in_transition) {
9329 			/*
9330 			 * Say that we are waiting, and wait for entry.
9331 			 */
9332 			entry->needs_wakeup = TRUE;
9333 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9334 
9335 			goto start_pass_1;
9336 		}
9337 
9338 /*
9339  *		our range is contained completely within this map entry
9340  */
9341 		if (dst_end <= entry->vme_end) {
9342 			break;
9343 		}
9344 /*
9345  *		check that range specified is contiguous region
9346  */
9347 		if ((next == vm_map_to_entry(dst_map)) ||
9348 		    (next->vme_start != entry->vme_end)) {
9349 			vm_map_unlock(dst_map);
9350 			return KERN_INVALID_ADDRESS;
9351 		}
9352 
9353 
9354 		/*
9355 		 *	Check for permanent objects in the destination.
9356 		 */
9357 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9358 		    ((!VME_OBJECT(entry)->internal) ||
9359 		    (VME_OBJECT(entry)->true_share))) {
9360 			contains_permanent_objects = TRUE;
9361 		}
9362 
9363 		entry = next;
9364 	}/* for */
9365 
9366 start_overwrite:
9367 	/*
9368 	 *	If there are permanent objects in the destination, then
9369 	 *	the copy cannot be interrupted.
9370 	 */
9371 
9372 	if (interruptible && contains_permanent_objects) {
9373 		vm_map_unlock(dst_map);
9374 		return KERN_FAILURE;   /* XXX */
9375 	}
9376 
9377 	/*
9378 	 *
9379 	 *	Make a second pass, overwriting the data
9380 	 *	At the beginning of each loop iteration,
9381 	 *	the next entry to be overwritten is "tmp_entry"
9382 	 *	(initially, the value returned from the lookup above),
9383 	 *	and the starting address expected in that entry
9384 	 *	is "start".
9385 	 */
9386 
9387 	total_size = copy->size;
9388 	if (encountered_sub_map) {
9389 		copy_size = 0;
9390 		/* re-calculate tmp_entry since we've had the map */
9391 		/* unlocked */
9392 		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9393 			vm_map_unlock(dst_map);
9394 			return KERN_INVALID_ADDRESS;
9395 		}
9396 	} else {
9397 		copy_size = copy->size;
9398 	}
9399 
9400 	base_addr = dst_addr;
9401 	while (TRUE) {
9402 		/* deconstruct the copy object and do in parts */
		/* only in sub_map, interruptible case */
9404 		vm_map_entry_t  copy_entry;
9405 		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
9406 		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
9407 		int             nentries;
9408 		int             remaining_entries = 0;
9409 		vm_map_offset_t new_offset = 0;
9410 
9411 		for (entry = tmp_entry; copy_size == 0;) {
9412 			vm_map_entry_t  next;
9413 
9414 			next = entry->vme_next;
9415 
9416 			/* tmp_entry and base address are moved along */
9417 			/* each time we encounter a sub-map.  Otherwise */
			/* entry can outpace tmp_entry, and the copy_size */
9419 			/* may reflect the distance between them */
9420 			/* if the current entry is found to be in transition */
9421 			/* we will start over at the beginning or the last */
9422 			/* encounter of a submap as dictated by base_addr */
9423 			/* we will zero copy_size accordingly. */
9424 			if (entry->in_transition) {
9425 				/*
9426 				 * Say that we are waiting, and wait for entry.
9427 				 */
9428 				entry->needs_wakeup = TRUE;
9429 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9430 
9431 				if (!vm_map_lookup_entry(dst_map, base_addr,
9432 				    &tmp_entry)) {
9433 					vm_map_unlock(dst_map);
9434 					return KERN_INVALID_ADDRESS;
9435 				}
9436 				copy_size = 0;
9437 				entry = tmp_entry;
9438 				continue;
9439 			}
9440 			if (entry->is_sub_map) {
9441 				vm_map_offset_t sub_start;
9442 				vm_map_offset_t sub_end;
9443 				vm_map_offset_t local_end;
9444 
9445 				if (entry->needs_copy) {
9446 					/* if this is a COW submap */
9447 					/* just back the range with a */
9448 					/* anonymous entry */
9449 					assert(!entry->vme_permanent);
9450 					if (entry->vme_end < dst_end) {
9451 						sub_end = entry->vme_end;
9452 					} else {
9453 						sub_end = dst_end;
9454 					}
9455 					if (entry->vme_start < base_addr) {
9456 						sub_start = base_addr;
9457 					} else {
9458 						sub_start = entry->vme_start;
9459 					}
9460 					vm_map_clip_end(
9461 						dst_map, entry, sub_end);
9462 					vm_map_clip_start(
9463 						dst_map, entry, sub_start);
9464 					assert(!entry->use_pmap);
9465 					assert(!entry->iokit_acct);
9466 					entry->use_pmap = TRUE;
9467 					vm_map_deallocate(VME_SUBMAP(entry));
9468 					assert(!entry->vme_permanent);
9469 					VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9470 					VME_OFFSET_SET(entry, 0);
9471 					entry->is_shared = FALSE;
9472 					entry->needs_copy = FALSE;
9473 					entry->protection = VM_PROT_DEFAULT;
9474 					entry->max_protection = VM_PROT_ALL;
9475 					entry->wired_count = 0;
9476 					entry->user_wired_count = 0;
9477 					if (entry->inheritance
9478 					    == VM_INHERIT_SHARE) {
9479 						entry->inheritance = VM_INHERIT_COPY;
9480 					}
9481 					continue;
9482 				}
9483 				/* first take care of any non-sub_map */
9484 				/* entries to send */
9485 				if (base_addr < entry->vme_start) {
9486 					/* stuff to send */
9487 					copy_size =
9488 					    entry->vme_start - base_addr;
9489 					break;
9490 				}
9491 				sub_start = VME_OFFSET(entry);
9492 
9493 				if (entry->vme_end < dst_end) {
9494 					sub_end = entry->vme_end;
9495 				} else {
9496 					sub_end = dst_end;
9497 				}
9498 				sub_end -= entry->vme_start;
9499 				sub_end += VME_OFFSET(entry);
9500 				local_end = entry->vme_end;
9501 				vm_map_unlock(dst_map);
9502 				copy_size = sub_end - sub_start;
9503 
9504 				/* adjust the copy object */
9505 				if (total_size > copy_size) {
9506 					vm_map_size_t   local_size = 0;
9507 					vm_map_size_t   entry_size;
9508 
9509 					nentries = 1;
9510 					new_offset = copy->offset;
9511 					copy_entry = vm_map_copy_first_entry(copy);
9512 					while (copy_entry !=
9513 					    vm_map_copy_to_entry(copy)) {
9514 						entry_size = copy_entry->vme_end -
9515 						    copy_entry->vme_start;
9516 						if ((local_size < copy_size) &&
9517 						    ((local_size + entry_size)
9518 						    >= copy_size)) {
9519 							vm_map_copy_clip_end(copy,
9520 							    copy_entry,
9521 							    copy_entry->vme_start +
9522 							    (copy_size - local_size));
9523 							entry_size = copy_entry->vme_end -
9524 							    copy_entry->vme_start;
9525 							local_size += entry_size;
9526 							new_offset += entry_size;
9527 						}
9528 						if (local_size >= copy_size) {
9529 							next_copy = copy_entry->vme_next;
9530 							copy_entry->vme_next =
9531 							    vm_map_copy_to_entry(copy);
9532 							previous_prev =
9533 							    copy->cpy_hdr.links.prev;
9534 							copy->cpy_hdr.links.prev = copy_entry;
9535 							copy->size = copy_size;
9536 							remaining_entries =
9537 							    copy->cpy_hdr.nentries;
9538 							remaining_entries -= nentries;
9539 							copy->cpy_hdr.nentries = nentries;
9540 							break;
9541 						} else {
9542 							local_size += entry_size;
9543 							new_offset += entry_size;
9544 							nentries++;
9545 						}
9546 						copy_entry = copy_entry->vme_next;
9547 					}
9548 				}
9549 
9550 				if ((entry->use_pmap) && (pmap == NULL)) {
9551 					kr = vm_map_copy_overwrite_nested(
9552 						VME_SUBMAP(entry),
9553 						sub_start,
9554 						copy,
9555 						interruptible,
9556 						VME_SUBMAP(entry)->pmap,
9557 						TRUE);
9558 				} else if (pmap != NULL) {
9559 					kr = vm_map_copy_overwrite_nested(
9560 						VME_SUBMAP(entry),
9561 						sub_start,
9562 						copy,
9563 						interruptible, pmap,
9564 						TRUE);
9565 				} else {
9566 					kr = vm_map_copy_overwrite_nested(
9567 						VME_SUBMAP(entry),
9568 						sub_start,
9569 						copy,
9570 						interruptible,
9571 						dst_map->pmap,
9572 						TRUE);
9573 				}
9574 				if (kr != KERN_SUCCESS) {
9575 					if (next_copy != NULL) {
9576 						copy->cpy_hdr.nentries +=
9577 						    remaining_entries;
9578 						copy->cpy_hdr.links.prev->vme_next =
9579 						    next_copy;
9580 						copy->cpy_hdr.links.prev
9581 						        = previous_prev;
9582 						copy->size = total_size;
9583 					}
9584 					return kr;
9585 				}
9586 				if (dst_end <= local_end) {
9587 					return KERN_SUCCESS;
9588 				}
9589 				/* otherwise copy no longer exists, it was */
9590 				/* destroyed after successful copy_overwrite */
9591 				copy = vm_map_copy_allocate();
9592 				copy->type = VM_MAP_COPY_ENTRY_LIST;
9593 				copy->offset = new_offset;
9594 				copy->cpy_hdr.page_shift = copy_page_shift;
9595 
9596 				/*
9597 				 * XXX FBDP
9598 				 * this does not seem to deal with
9599 				 * the VM map store (R&B tree)
9600 				 */
9601 
9602 				total_size -= copy_size;
9603 				copy_size = 0;
9604 				/* put back remainder of copy in container */
9605 				if (next_copy != NULL) {
9606 					copy->cpy_hdr.nentries = remaining_entries;
9607 					copy->cpy_hdr.links.next = next_copy;
9608 					copy->cpy_hdr.links.prev = previous_prev;
9609 					copy->size = total_size;
9610 					next_copy->vme_prev =
9611 					    vm_map_copy_to_entry(copy);
9612 					next_copy = NULL;
9613 				}
9614 				base_addr = local_end;
9615 				vm_map_lock(dst_map);
9616 				if (!vm_map_lookup_entry(dst_map,
9617 				    local_end, &tmp_entry)) {
9618 					vm_map_unlock(dst_map);
9619 					return KERN_INVALID_ADDRESS;
9620 				}
9621 				entry = tmp_entry;
9622 				continue;
9623 			}
9624 			if (dst_end <= entry->vme_end) {
9625 				copy_size = dst_end - base_addr;
9626 				break;
9627 			}
9628 
9629 			if ((next == vm_map_to_entry(dst_map)) ||
9630 			    (next->vme_start != entry->vme_end)) {
9631 				vm_map_unlock(dst_map);
9632 				return KERN_INVALID_ADDRESS;
9633 			}
9634 
9635 			entry = next;
9636 		}/* for */
9637 
9638 		next_copy = NULL;
9639 		nentries = 1;
9640 
9641 		/* adjust the copy object */
9642 		if (total_size > copy_size) {
9643 			vm_map_size_t   local_size = 0;
9644 			vm_map_size_t   entry_size;
9645 
9646 			new_offset = copy->offset;
9647 			copy_entry = vm_map_copy_first_entry(copy);
9648 			while (copy_entry != vm_map_copy_to_entry(copy)) {
9649 				entry_size = copy_entry->vme_end -
9650 				    copy_entry->vme_start;
9651 				if ((local_size < copy_size) &&
9652 				    ((local_size + entry_size)
9653 				    >= copy_size)) {
9654 					vm_map_copy_clip_end(copy, copy_entry,
9655 					    copy_entry->vme_start +
9656 					    (copy_size - local_size));
9657 					entry_size = copy_entry->vme_end -
9658 					    copy_entry->vme_start;
9659 					local_size += entry_size;
9660 					new_offset += entry_size;
9661 				}
9662 				if (local_size >= copy_size) {
9663 					next_copy = copy_entry->vme_next;
9664 					copy_entry->vme_next =
9665 					    vm_map_copy_to_entry(copy);
9666 					previous_prev =
9667 					    copy->cpy_hdr.links.prev;
9668 					copy->cpy_hdr.links.prev = copy_entry;
9669 					copy->size = copy_size;
9670 					remaining_entries =
9671 					    copy->cpy_hdr.nentries;
9672 					remaining_entries -= nentries;
9673 					copy->cpy_hdr.nentries = nentries;
9674 					break;
9675 				} else {
9676 					local_size += entry_size;
9677 					new_offset += entry_size;
9678 					nentries++;
9679 				}
9680 				copy_entry = copy_entry->vme_next;
9681 			}
9682 		}
9683 
9684 		if (aligned) {
9685 			pmap_t  local_pmap;
9686 
9687 			if (pmap) {
9688 				local_pmap = pmap;
9689 			} else {
9690 				local_pmap = dst_map->pmap;
9691 			}
9692 
9693 			if ((kr =  vm_map_copy_overwrite_aligned(
9694 				    dst_map, tmp_entry, copy,
9695 				    base_addr, local_pmap)) != KERN_SUCCESS) {
9696 				if (next_copy != NULL) {
9697 					copy->cpy_hdr.nentries +=
9698 					    remaining_entries;
9699 					copy->cpy_hdr.links.prev->vme_next =
9700 					    next_copy;
9701 					copy->cpy_hdr.links.prev =
9702 					    previous_prev;
9703 					copy->size += copy_size;
9704 				}
9705 				return kr;
9706 			}
9707 			vm_map_unlock(dst_map);
9708 		} else {
9709 			/*
9710 			 * Performance gain:
9711 			 *
9712 			 * if the copy and dst address are misaligned but the same
9713 			 * offset within the page we can copy_not_aligned the
9714 			 * misaligned parts and copy aligned the rest.  If they are
9715 			 * aligned but len is unaligned we simply need to copy
9716 			 * the end bit unaligned.  We'll need to split the misaligned
9717 			 * bits of the region in this case !
9718 			 */
9719 			/* ALWAYS UNLOCKS THE dst_map MAP */
9720 			kr = vm_map_copy_overwrite_unaligned(
9721 				dst_map,
9722 				tmp_entry,
9723 				copy,
9724 				base_addr,
9725 				discard_on_success);
9726 			if (kr != KERN_SUCCESS) {
9727 				if (next_copy != NULL) {
9728 					copy->cpy_hdr.nentries +=
9729 					    remaining_entries;
9730 					copy->cpy_hdr.links.prev->vme_next =
9731 					    next_copy;
9732 					copy->cpy_hdr.links.prev =
9733 					    previous_prev;
9734 					copy->size += copy_size;
9735 				}
9736 				return kr;
9737 			}
9738 		}
9739 		total_size -= copy_size;
9740 		if (total_size == 0) {
9741 			break;
9742 		}
9743 		base_addr += copy_size;
9744 		copy_size = 0;
9745 		copy->offset = new_offset;
9746 		if (next_copy != NULL) {
9747 			copy->cpy_hdr.nentries = remaining_entries;
9748 			copy->cpy_hdr.links.next = next_copy;
9749 			copy->cpy_hdr.links.prev = previous_prev;
9750 			next_copy->vme_prev = vm_map_copy_to_entry(copy);
9751 			copy->size = total_size;
9752 		}
9753 		vm_map_lock(dst_map);
9754 		while (TRUE) {
9755 			if (!vm_map_lookup_entry(dst_map,
9756 			    base_addr, &tmp_entry)) {
9757 				vm_map_unlock(dst_map);
9758 				return KERN_INVALID_ADDRESS;
9759 			}
9760 			if (tmp_entry->in_transition) {
9761 				entry->needs_wakeup = TRUE;
9762 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9763 			} else {
9764 				break;
9765 			}
9766 		}
9767 		vm_map_clip_start(dst_map,
9768 		    tmp_entry,
9769 		    vm_map_trunc_page(base_addr,
9770 		    VM_MAP_PAGE_MASK(dst_map)));
9771 
9772 		entry = tmp_entry;
9773 	} /* while */
9774 
9775 	/*
9776 	 *	Throw away the vm_map_copy object
9777 	 */
9778 	if (discard_on_success) {
9779 		vm_map_copy_discard(copy);
9780 	}
9781 
9782 	return KERN_SUCCESS;
9783 }/* vm_map_copy_overwrite */
9784 
/*
 *	Routine:	vm_map_copy_overwrite
 *
 *	Description:
 *		Overwrite the destination range [dst_addr, dst_addr+copy_size)
 *		of "dst_map" with the data carried by "copy", without
 *		deallocating the destination mappings.
 *
 *		When the copy and the destination share the same
 *		page-relative (mis)alignment, the copy is split into up to
 *		three pieces -- a small unaligned "head" up to the first page
 *		boundary, a page-aligned middle, and a small unaligned
 *		"tail" -- so the middle can use the cheaper aligned path in
 *		vm_map_copy_overwrite_nested().  Otherwise (interruptible
 *		caller, non-entry-list copy, sub-page maps, incompatible
 *		alignment, too-small copy, or submaps in the destination)
 *		everything is handed off in one "blunt" nested call.
 *
 *		"copy_size" is the caller's captured view of the copy's size;
 *		it is asserted against copy->size and re-imposed to defend
 *		against TOCTOU manipulation (see comment before the main
 *		copy below).
 *
 *	Returns:
 *		KERN_SUCCESS and consumes/discards "copy" (and the head/tail
 *		copies carved out of it) on success; on failure, the original
 *		"copy" is re-assembled from the head/tail pieces and the
 *		error from vm_map_copy_overwrite_nested() is returned.
 */
kern_return_t
vm_map_copy_overwrite(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_copy_t   copy,
	vm_map_size_t   copy_size,
	boolean_t       interruptible)
{
	vm_map_size_t   head_size, tail_size;
	vm_map_copy_t   head_copy, tail_copy;
	vm_map_offset_t head_addr, tail_addr;
	vm_map_entry_t  entry;
	kern_return_t   kr;
	vm_map_offset_t effective_page_mask, effective_page_size;
	uint16_t        copy_page_shift;

	head_size = 0;
	tail_size = 0;
	head_copy = NULL;
	tail_copy = NULL;
	head_addr = 0;
	tail_addr = 0;

	/*
	 *	Check for null copy object.
	 */
	if (copy == VM_MAP_COPY_NULL) {
		return KERN_SUCCESS;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	if (interruptible ||
	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * We can't split the "copy" map if we're interruptible
		 * or if we don't have a "copy" map...
		 */
blunt_copy:
		return vm_map_copy_overwrite_nested(dst_map,
		           dst_addr,
		           copy,
		           interruptible,
		           (pmap_t) NULL,
		           TRUE);
	}

	/* sub-page-size maps or copies can't use the head/middle/tail split */
	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
	if (copy_page_shift < PAGE_SHIFT ||
	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		goto blunt_copy;
	}

	/*
	 * NOTE(review): this "true" branch appears unreachable -- a dst_map
	 * whose page shift is below PAGE_SHIFT was already sent to
	 * blunt_copy just above; confirm before relying on it.
	 */
	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
	} else {
		/* use the coarsest page mask of dst map, copy and kernel */
		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
		    effective_page_mask);
	}
	effective_page_size = effective_page_mask + 1;

	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
		/*
		 * Too small to bother with optimizing...
		 */
		goto blunt_copy;
	}

	if ((dst_addr & effective_page_mask) !=
	    (copy->offset & effective_page_mask)) {
		/*
		 * Incompatible mis-alignment of source and destination...
		 */
		goto blunt_copy;
	}

	/*
	 * Proper alignment or identical mis-alignment at the beginning.
	 * Let's try and do a small unaligned copy first (if needed)
	 * and then an aligned copy for the rest.
	 */
	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
		/* head: from dst_addr up to the next page boundary */
		head_addr = dst_addr;
		head_size = (effective_page_size -
		    (copy->offset & effective_page_mask));
		head_size = MIN(head_size, copy_size);
	}
	if (!vm_map_page_aligned(copy->offset + copy_size,
	    effective_page_mask)) {
		/*
		 * Mis-alignment at the end.
		 * Do an aligned copy up to the last page and
		 * then an unaligned copy for the remaining bytes.
		 */
		tail_size = ((copy->offset + copy_size) &
		    effective_page_mask);
		tail_size = MIN(tail_size, copy_size);
		tail_addr = dst_addr + copy_size - tail_size;
		assert(tail_addr >= head_addr + head_size);
	}
	assert(head_size + tail_size <= copy_size);

	if (head_size + tail_size == copy_size) {
		/*
		 * It's all unaligned, no optimization possible...
		 */
		goto blunt_copy;
	}

	/*
	 * Can't optimize if there are any submaps in the
	 * destination due to the way we free the "copy" map
	 * progressively in vm_map_copy_overwrite_nested()
	 * in that case.
	 */
	vm_map_lock_read(dst_map);
	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
		vm_map_unlock_read(dst_map);
		goto blunt_copy;
	}
	for (;
	    (entry != vm_map_to_entry(dst_map) &&
	    entry->vme_start < dst_addr + copy_size);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			vm_map_unlock_read(dst_map);
			goto blunt_copy;
		}
	}
	vm_map_unlock_read(dst_map);

	if (head_size) {
		/*
		 * Unaligned copy of the first "head_size" bytes, to reach
		 * a page boundary.
		 */

		/*
		 * Extract "head_copy" out of "copy".
		 */
		head_copy = vm_map_copy_allocate();
		head_copy->type = VM_MAP_COPY_ENTRY_LIST;
		head_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		vm_map_store_init(&head_copy->cpy_hdr);
		head_copy->cpy_hdr.page_shift = copy_page_shift;

		/* the head can't extend past the first copy entry */
		entry = vm_map_copy_first_entry(copy);
		if (entry->vme_end < copy->offset + head_size) {
			head_size = entry->vme_end - copy->offset;
		}

		head_copy->offset = copy->offset;
		head_copy->size = head_size;
		copy->offset += head_size;
		copy->size -= head_size;
		copy_size -= head_size;
		assert(copy_size > 0);

		/* move the clipped first entry from "copy" to "head_copy" */
		vm_map_copy_clip_end(copy, entry, copy->offset);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(head_copy,
		    vm_map_copy_to_entry(head_copy),
		    entry);

		/*
		 * Do the unaligned copy.
		 */
		kr = vm_map_copy_overwrite_nested(dst_map,
		    head_addr,
		    head_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr != KERN_SUCCESS) {
			goto done;
		}
	}

	if (tail_size) {
		/*
		 * Extract "tail_copy" out of "copy".
		 */
		tail_copy = vm_map_copy_allocate();
		tail_copy->type = VM_MAP_COPY_ENTRY_LIST;
		tail_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		vm_map_store_init(&tail_copy->cpy_hdr);
		tail_copy->cpy_hdr.page_shift = copy_page_shift;

		tail_copy->offset = copy->offset + copy_size - tail_size;
		tail_copy->size = tail_size;

		copy->size -= tail_size;
		copy_size -= tail_size;
		assert(copy_size > 0);

		/* move the clipped last entry from "copy" to "tail_copy" */
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(tail_copy,
		    vm_map_copy_last_entry(tail_copy),
		    entry);
	}

	/*
	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
	 * we want to avoid TOCTOU issues w.r.t copy->size but
	 * we don't need to change vm_map_copy_overwrite_nested()
	 * and all other vm_map_copy_overwrite variants.
	 *
	 * So we assign the original copy_size that was passed into
	 * this routine back to copy.
	 *
	 * This use of local 'copy_size' passed into this routine is
	 * to try and protect against TOCTOU attacks where the kernel
	 * has been exploited. We don't expect this to be an issue
	 * during normal system operation.
	 */
	assertf(copy->size == copy_size,
	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
	copy->size = copy_size;

	/*
	 * Copy most (or possibly all) of the data.
	 */
	kr = vm_map_copy_overwrite_nested(dst_map,
	    dst_addr + head_size,
	    copy,
	    interruptible,
	    (pmap_t) NULL,
	    FALSE);
	if (kr != KERN_SUCCESS) {
		goto done;
	}

	if (tail_size) {
		kr = vm_map_copy_overwrite_nested(dst_map,
		    tail_addr,
		    tail_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
	}

done:
	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
	if (kr == KERN_SUCCESS) {
		/*
		 * Discard all the copy maps.
		 */
		if (head_copy) {
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		vm_map_copy_discard(copy);
		if (tail_copy) {
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	} else {
		/*
		 * Re-assemble the original copy map.
		 */
		if (head_copy) {
			/* re-link the head entry at the front of "copy" */
			entry = vm_map_copy_first_entry(head_copy);
			vm_map_copy_entry_unlink(head_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_to_entry(copy),
			    entry);
			copy->offset -= head_size;
			copy->size += head_size;
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		if (tail_copy) {
			/* re-link the tail entry at the back of "copy" */
			entry = vm_map_copy_last_entry(tail_copy);
			vm_map_copy_entry_unlink(tail_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    entry);
			copy->size += tail_size;
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	}
	return kr;
}
10079 
10080 
10081 /*
10082  *	Routine: vm_map_copy_overwrite_unaligned	[internal use only]
10083  *
 *	Description:
10085  *	Physically copy unaligned data
10086  *
10087  *	Implementation:
10088  *	Unaligned parts of pages have to be physically copied.  We use
 *	a modified form of vm_fault_copy (which understands non-aligned
 *	page offsets and sizes) to do the copy.  We attempt to copy as
 *	much memory in one go as possible, however vm_fault_copy copies
10092  *	within 1 memory object so we have to find the smaller of "amount left"
10093  *	"source object data size" and "target object data size".  With
10094  *	unaligned data we don't need to split regions, therefore the source
10095  *	(copy) object should be one map entry, the target range may be split
10096  *	over multiple map entries however.  In any event we are pessimistic
10097  *	about these assumptions.
10098  *
10099  *	Callers of this function must call vm_map_copy_require on
10100  *	previously created vm_map_copy_t or pass a newly created
10101  *	one to ensure that it hasn't been forged.
10102  *
10103  *	Assumptions:
10104  *	dst_map is locked on entry and is return locked on success,
10105  *	unlocked on error.
10106  */
10107 
static kern_return_t
vm_map_copy_overwrite_unaligned(
	vm_map_t        dst_map,
	vm_map_entry_t  entry,
	vm_map_copy_t   copy,
	vm_map_offset_t start,
	boolean_t       discard_on_success)
{
	vm_map_entry_t          copy_entry;
	vm_map_entry_t          copy_entry_next;
	vm_map_version_t        version;
	vm_object_t             dst_object;
	vm_object_offset_t      dst_offset;
	vm_object_offset_t      src_offset;
	vm_object_offset_t      entry_offset;
	vm_map_offset_t         entry_end;
	vm_map_size_t           src_size,
	    dst_size,
	    copy_size,
	    amount_left;
	kern_return_t           kr = KERN_SUCCESS;


	copy_entry = vm_map_copy_first_entry(copy);

	/* entered with dst_map write-locked; downgrade to a read lock */
	vm_map_lock_write_to_read(dst_map);

	/* byte offset of copy->offset within its (copy-sized) page */
	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
	amount_left = copy->size;
/*
 *	unaligned so we never clipped this entry, we need the offset into
 *	the vm_object not just the data.
 */
	while (amount_left > 0) {
		if (entry == vm_map_to_entry(dst_map)) {
			vm_map_unlock_read(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/* "start" must be within the current map entry */
		assert((start >= entry->vme_start) && (start < entry->vme_end));

		/*
		 *	Check protection again
		 */
		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock_read(dst_map);
			return KERN_PROTECTION_FAILURE;
		}
		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock_read(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		dst_offset = start - entry->vme_start;

		dst_size = entry->vme_end - start;

		src_size = copy_entry->vme_end -
		    (copy_entry->vme_start + src_offset);

		/* copy the smaller of what's left in the source entry
		 * and what's left in the destination entry */
		if (dst_size < src_size) {
/*
 *			we can only copy dst_size bytes before
 *			we have to get the next destination entry
 */
			copy_size = dst_size;
		} else {
/*
 *			we can only copy src_size bytes before
 *			we have to get the next source copy entry
 */
			copy_size = src_size;
		}

		if (copy_size > amount_left) {
			copy_size = amount_left;
		}
/*
 *		Entry needs copy: create a shadow object for the
 *		copy-on-write region.
 */
		if (entry->needs_copy) {
			/* shadowing modifies the entry: upgrade to a write
			 * lock; if the upgrade drops the lock, re-lookup */
			if (vm_map_lock_read_to_write(dst_map)) {
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			VME_OBJECT_SHADOW(entry,
			    (vm_map_size_t)(entry->vme_end
			    - entry->vme_start),
			    vm_map_always_shadow(dst_map));
			entry->needs_copy = FALSE;
			vm_map_lock_write_to_read(dst_map);
		}
		dst_object = VME_OBJECT(entry);
/*
 *		unlike with the virtual (aligned) copy we're going
 *		to fault on it therefore we need a target object.
 */
		if (dst_object == VM_OBJECT_NULL) {
			if (vm_map_lock_read_to_write(dst_map)) {
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			dst_object = vm_object_allocate((vm_map_size_t)
			    entry->vme_end - entry->vme_start);
			VME_OBJECT_SET(entry, dst_object, false, 0);
			VME_OFFSET_SET(entry, 0);
			assert(entry->use_pmap);
			vm_map_lock_write_to_read(dst_map);
		}
/*
 *		Take an object reference and unlock map. The "entry" may
 *		disappear or change when the map is unlocked.
 */
		vm_object_reference(dst_object);
		/* record the map timestamp and the entry fields we still
		 * need, so we can detect changes after re-locking */
		version.main_timestamp = dst_map->timestamp;
		entry_offset = VME_OFFSET(entry);
		entry_end = entry->vme_end;
		vm_map_unlock_read(dst_map);
/*
 *		Copy as much as possible in one pass
 */
		kr = vm_fault_copy(
			VME_OBJECT(copy_entry),
			VME_OFFSET(copy_entry) + src_offset,
			&copy_size,
			dst_object,
			entry_offset + dst_offset,
			dst_map,
			&version,
			THREAD_UNINT );

		/* vm_fault_copy() updated copy_size to what was copied */
		start += copy_size;
		src_offset += copy_size;
		amount_left -= copy_size;
/*
 *		Release the object reference
 */
		vm_object_deallocate(dst_object);
/*
 *		If a hard error occurred, return it now
 *		(map is unlocked on this error path, per the contract)
 */
		if (kr != KERN_SUCCESS) {
			return kr;
		}

		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
		    || amount_left == 0) {
/*
 *			all done with this copy entry, dispose.
 */
			copy_entry_next = copy_entry->vme_next;

			if (discard_on_success) {
				vm_map_copy_entry_unlink(copy, copy_entry);
				assert(!copy_entry->is_sub_map);
				vm_object_deallocate(VME_OBJECT(copy_entry));
				vm_map_copy_entry_dispose(copy_entry);
			}

			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
			    amount_left) {
/*
 *				not finished copying but run out of source
 */
				return KERN_INVALID_ADDRESS;
			}

			copy_entry = copy_entry_next;

			src_offset = 0;
		}

		if (amount_left == 0) {
			/* success: dst_map stays read-locked, per contract */
			return KERN_SUCCESS;
		}

		vm_map_lock_read(dst_map);
		if (version.main_timestamp == dst_map->timestamp) {
			if (start == entry_end) {
/*
 *				destination region is split.  Use the version
 *				information to avoid a lookup in the normal
 *				case.
 */
				entry = entry->vme_next;
/*
 *				should be contiguous. Fail if we encounter
 *				a hole in the destination.
 */
				if (start != entry->vme_start) {
					vm_map_unlock_read(dst_map);
					return KERN_INVALID_ADDRESS;
				}
			}
		} else {
/*
 *			Map version check failed.
 *			we must lookup the entry because somebody
 *			might have changed the map behind our backs.
 */
RetryLookup:
			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
				vm_map_unlock_read(dst_map);
				return KERN_INVALID_ADDRESS;
			}
		}
	}/* while */

	return KERN_SUCCESS;
}/* vm_map_copy_overwrite_unaligned */
10320 
10321 /*
10322  *	Routine: vm_map_copy_overwrite_aligned	[internal use only]
10323  *
10324  *	Description:
10325  *	Does all the vm_trickery possible for whole pages.
10326  *
10327  *	Implementation:
10328  *
10329  *	If there are no permanent objects in the destination,
10330  *	and the source and destination map entry zones match,
10331  *	and the destination map entry is not shared,
10332  *	then the map entries can be deleted and replaced
10333  *	with those from the copy.  The following code is the
10334  *	basic idea of what to do, but there are lots of annoying
10335  *	little details about getting protection and inheritance
10336  *	right.  Should add protection, inheritance, and sharing checks
10337  *	to the above pass and make sure that no wiring is involved.
10338  *
10339  *	Callers of this function must call vm_map_copy_require on
10340  *	previously created vm_map_copy_t or pass a newly created
10341  *	one to ensure that it hasn't been forged.
10342  */
10343 
/*
 * Statistics for vm_map_copy_overwrite_aligned(): count how often the
 * optimized entry-replacement path was declined in favor of the slow
 * (physical) copy, broken down by reason.
 */
int vm_map_copy_overwrite_aligned_src_not_internal = 0;   /* source object not internal (anonymous) */
int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;  /* source true_share or non-SYMMETRIC copy strategy */
int vm_map_copy_overwrite_aligned_src_large = 0;          /* small copy out of a very large source object */
10347 
10348 static kern_return_t
vm_map_copy_overwrite_aligned(vm_map_t dst_map,vm_map_entry_t tmp_entry,vm_map_copy_t copy,vm_map_offset_t start,__unused pmap_t pmap)10349 vm_map_copy_overwrite_aligned(
10350 	vm_map_t        dst_map,
10351 	vm_map_entry_t  tmp_entry,
10352 	vm_map_copy_t   copy,
10353 	vm_map_offset_t start,
10354 	__unused pmap_t pmap)
10355 {
10356 	vm_object_t     object;
10357 	vm_map_entry_t  copy_entry;
10358 	vm_map_size_t   copy_size;
10359 	vm_map_size_t   size;
10360 	vm_map_entry_t  entry;
10361 
10362 	while ((copy_entry = vm_map_copy_first_entry(copy))
10363 	    != vm_map_copy_to_entry(copy)) {
10364 		copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10365 
10366 		entry = tmp_entry;
10367 		if (entry->is_sub_map) {
10368 			/* unnested when clipped earlier */
10369 			assert(!entry->use_pmap);
10370 		}
10371 		if (entry == vm_map_to_entry(dst_map)) {
10372 			vm_map_unlock(dst_map);
10373 			return KERN_INVALID_ADDRESS;
10374 		}
10375 		size = (entry->vme_end - entry->vme_start);
10376 		/*
10377 		 *	Make sure that no holes popped up in the
10378 		 *	address map, and that the protection is
10379 		 *	still valid, in case the map was unlocked
10380 		 *	earlier.
10381 		 */
10382 
10383 		if ((entry->vme_start != start) || ((entry->is_sub_map)
10384 		    && !entry->needs_copy)) {
10385 			vm_map_unlock(dst_map);
10386 			return KERN_INVALID_ADDRESS;
10387 		}
10388 		assert(entry != vm_map_to_entry(dst_map));
10389 
10390 		/*
10391 		 *	Check protection again
10392 		 */
10393 
10394 		if (!(entry->protection & VM_PROT_WRITE)) {
10395 			vm_map_unlock(dst_map);
10396 			return KERN_PROTECTION_FAILURE;
10397 		}
10398 
10399 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10400 			vm_map_unlock(dst_map);
10401 			return KERN_PROTECTION_FAILURE;
10402 		}
10403 
10404 		/*
10405 		 *	Adjust to source size first
10406 		 */
10407 
10408 		if (copy_size < size) {
10409 			if (entry->map_aligned &&
10410 			    !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10411 			    VM_MAP_PAGE_MASK(dst_map))) {
10412 				/* no longer map-aligned */
10413 				entry->map_aligned = FALSE;
10414 			}
10415 			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10416 			size = copy_size;
10417 		}
10418 
10419 		/*
10420 		 *	Adjust to destination size
10421 		 */
10422 
10423 		if (size < copy_size) {
10424 			vm_map_copy_clip_end(copy, copy_entry,
10425 			    copy_entry->vme_start + size);
10426 			copy_size = size;
10427 		}
10428 
10429 		assert((entry->vme_end - entry->vme_start) == size);
10430 		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10431 		assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10432 
10433 		/*
10434 		 *	If the destination contains temporary unshared memory,
10435 		 *	we can perform the copy by throwing it away and
10436 		 *	installing the source data.
10437 		 */
10438 
10439 		object = VME_OBJECT(entry);
10440 		if ((!entry->is_shared &&
10441 		    ((object == VM_OBJECT_NULL) ||
10442 		    (object->internal && !object->true_share))) ||
10443 		    entry->needs_copy) {
10444 			vm_object_t     old_object = VME_OBJECT(entry);
10445 			vm_object_offset_t      old_offset = VME_OFFSET(entry);
10446 			vm_object_offset_t      offset;
10447 
10448 			/*
10449 			 * Ensure that the source and destination aren't
10450 			 * identical
10451 			 */
10452 			if (old_object == VME_OBJECT(copy_entry) &&
10453 			    old_offset == VME_OFFSET(copy_entry)) {
10454 				vm_map_copy_entry_unlink(copy, copy_entry);
10455 				vm_map_copy_entry_dispose(copy_entry);
10456 
10457 				if (old_object != VM_OBJECT_NULL) {
10458 					vm_object_deallocate(old_object);
10459 				}
10460 
10461 				start = tmp_entry->vme_end;
10462 				tmp_entry = tmp_entry->vme_next;
10463 				continue;
10464 			}
10465 
10466 #if XNU_TARGET_OS_OSX
10467 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10468 #define __TRADEOFF1_COPY_SIZE (128 * 1024)      /* 128 KB */
10469 			if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10470 			    VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10471 			    copy_size <= __TRADEOFF1_COPY_SIZE) {
10472 				/*
10473 				 * Virtual vs. Physical copy tradeoff #1.
10474 				 *
10475 				 * Copying only a few pages out of a large
10476 				 * object:  do a physical copy instead of
10477 				 * a virtual copy, to avoid possibly keeping
10478 				 * the entire large object alive because of
10479 				 * those few copy-on-write pages.
10480 				 */
10481 				vm_map_copy_overwrite_aligned_src_large++;
10482 				goto slow_copy;
10483 			}
10484 #endif /* XNU_TARGET_OS_OSX */
10485 
10486 			if ((dst_map->pmap != kernel_pmap) &&
10487 			    (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10488 			    (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10489 				vm_object_t new_object, new_shadow;
10490 
10491 				/*
10492 				 * We're about to map something over a mapping
10493 				 * established by malloc()...
10494 				 */
10495 				new_object = VME_OBJECT(copy_entry);
10496 				if (new_object != VM_OBJECT_NULL) {
10497 					vm_object_lock_shared(new_object);
10498 				}
10499 				while (new_object != VM_OBJECT_NULL &&
10500 #if XNU_TARGET_OS_OSX
10501 				    !new_object->true_share &&
10502 				    new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10503 #endif /* XNU_TARGET_OS_OSX */
10504 				    new_object->internal) {
10505 					new_shadow = new_object->shadow;
10506 					if (new_shadow == VM_OBJECT_NULL) {
10507 						break;
10508 					}
10509 					vm_object_lock_shared(new_shadow);
10510 					vm_object_unlock(new_object);
10511 					new_object = new_shadow;
10512 				}
10513 				if (new_object != VM_OBJECT_NULL) {
10514 					if (!new_object->internal) {
10515 						/*
10516 						 * The new mapping is backed
10517 						 * by an external object.  We
10518 						 * don't want malloc'ed memory
10519 						 * to be replaced with such a
10520 						 * non-anonymous mapping, so
10521 						 * let's go off the optimized
10522 						 * path...
10523 						 */
10524 						vm_map_copy_overwrite_aligned_src_not_internal++;
10525 						vm_object_unlock(new_object);
10526 						goto slow_copy;
10527 					}
10528 #if XNU_TARGET_OS_OSX
10529 					if (new_object->true_share ||
10530 					    new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10531 						/*
10532 						 * Same if there's a "true_share"
10533 						 * object in the shadow chain, or
10534 						 * an object with a non-default
10535 						 * (SYMMETRIC) copy strategy.
10536 						 */
10537 						vm_map_copy_overwrite_aligned_src_not_symmetric++;
10538 						vm_object_unlock(new_object);
10539 						goto slow_copy;
10540 					}
10541 #endif /* XNU_TARGET_OS_OSX */
10542 					vm_object_unlock(new_object);
10543 				}
10544 				/*
10545 				 * The new mapping is still backed by
10546 				 * anonymous (internal) memory, so it's
10547 				 * OK to substitute it for the original
10548 				 * malloc() mapping.
10549 				 */
10550 			}
10551 
10552 			if (old_object != VM_OBJECT_NULL) {
10553 				assert(!entry->vme_permanent);
10554 				if (entry->is_sub_map) {
10555 					if (entry->use_pmap) {
10556 #ifndef NO_NESTED_PMAP
10557 						pmap_unnest(dst_map->pmap,
10558 						    (addr64_t)entry->vme_start,
10559 						    entry->vme_end - entry->vme_start);
10560 #endif  /* NO_NESTED_PMAP */
10561 						if (dst_map->mapped_in_other_pmaps) {
10562 							/* clean up parent */
10563 							/* map/maps */
10564 							vm_map_submap_pmap_clean(
10565 								dst_map, entry->vme_start,
10566 								entry->vme_end,
10567 								VME_SUBMAP(entry),
10568 								VME_OFFSET(entry));
10569 						}
10570 					} else {
10571 						vm_map_submap_pmap_clean(
10572 							dst_map, entry->vme_start,
10573 							entry->vme_end,
10574 							VME_SUBMAP(entry),
10575 							VME_OFFSET(entry));
10576 					}
10577 					vm_map_deallocate(VME_SUBMAP(entry));
10578 				} else {
10579 					if (dst_map->mapped_in_other_pmaps) {
10580 						vm_object_pmap_protect_options(
10581 							VME_OBJECT(entry),
10582 							VME_OFFSET(entry),
10583 							entry->vme_end
10584 							- entry->vme_start,
10585 							PMAP_NULL,
10586 							PAGE_SIZE,
10587 							entry->vme_start,
10588 							VM_PROT_NONE,
10589 							PMAP_OPTIONS_REMOVE);
10590 					} else {
10591 						pmap_remove_options(
10592 							dst_map->pmap,
10593 							(addr64_t)(entry->vme_start),
10594 							(addr64_t)(entry->vme_end),
10595 							PMAP_OPTIONS_REMOVE);
10596 					}
10597 					vm_object_deallocate(old_object);
10598 				}
10599 			}
10600 
10601 			if (entry->iokit_acct) {
10602 				/* keep using iokit accounting */
10603 				entry->use_pmap = FALSE;
10604 			} else {
10605 				/* use pmap accounting */
10606 				entry->use_pmap = TRUE;
10607 			}
10608 			assert(!entry->vme_permanent);
10609 			VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
10610 			object = VME_OBJECT(entry);
10611 			entry->needs_copy = copy_entry->needs_copy;
10612 			entry->wired_count = 0;
10613 			entry->user_wired_count = 0;
10614 			offset = VME_OFFSET(copy_entry);
10615 			VME_OFFSET_SET(entry, offset);
10616 
10617 			vm_map_copy_entry_unlink(copy, copy_entry);
10618 			vm_map_copy_entry_dispose(copy_entry);
10619 
10620 			/*
10621 			 * we could try to push pages into the pmap at this point, BUT
10622 			 * this optimization only saved on average 2 us per page if ALL
10623 			 * the pages in the source were currently mapped
10624 			 * and ALL the pages in the dest were touched, if there were fewer
10625 			 * than 2/3 of the pages touched, this optimization actually cost more cycles
10626 			 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
10627 			 */
10628 
10629 			/*
10630 			 *	Set up for the next iteration.  The map
10631 			 *	has not been unlocked, so the next
10632 			 *	address should be at the end of this
10633 			 *	entry, and the next map entry should be
10634 			 *	the one following it.
10635 			 */
10636 
10637 			start = tmp_entry->vme_end;
10638 			tmp_entry = tmp_entry->vme_next;
10639 		} else {
10640 			vm_map_version_t        version;
10641 			vm_object_t             dst_object;
10642 			vm_object_offset_t      dst_offset;
10643 			kern_return_t           r;
10644 
10645 slow_copy:
10646 			if (entry->needs_copy) {
10647 				VME_OBJECT_SHADOW(entry,
10648 				    (entry->vme_end -
10649 				    entry->vme_start),
10650 				    vm_map_always_shadow(dst_map));
10651 				entry->needs_copy = FALSE;
10652 			}
10653 
10654 			dst_object = VME_OBJECT(entry);
10655 			dst_offset = VME_OFFSET(entry);
10656 
10657 			/*
10658 			 *	Take an object reference, and record
10659 			 *	the map version information so that the
10660 			 *	map can be safely unlocked.
10661 			 */
10662 
10663 			if (dst_object == VM_OBJECT_NULL) {
10664 				/*
10665 				 * We would usually have just taken the
10666 				 * optimized path above if the destination
10667 				 * object has not been allocated yet.  But we
10668 				 * now disable that optimization if the copy
10669 				 * entry's object is not backed by anonymous
10670 				 * memory to avoid replacing malloc'ed
10671 				 * (i.e. re-usable) anonymous memory with a
10672 				 * not-so-anonymous mapping.
10673 				 * So we have to handle this case here and
10674 				 * allocate a new VM object for this map entry.
10675 				 */
10676 				dst_object = vm_object_allocate(
10677 					entry->vme_end - entry->vme_start);
10678 				dst_offset = 0;
10679 				VME_OBJECT_SET(entry, dst_object, false, 0);
10680 				VME_OFFSET_SET(entry, dst_offset);
10681 				assert(entry->use_pmap);
10682 			}
10683 
10684 			vm_object_reference(dst_object);
10685 
10686 			/* account for unlock bumping up timestamp */
10687 			version.main_timestamp = dst_map->timestamp + 1;
10688 
10689 			vm_map_unlock(dst_map);
10690 
10691 			/*
10692 			 *	Copy as much as possible in one pass
10693 			 */
10694 
10695 			copy_size = size;
10696 			r = vm_fault_copy(
10697 				VME_OBJECT(copy_entry),
10698 				VME_OFFSET(copy_entry),
10699 				&copy_size,
10700 				dst_object,
10701 				dst_offset,
10702 				dst_map,
10703 				&version,
10704 				THREAD_UNINT );
10705 
10706 			/*
10707 			 *	Release the object reference
10708 			 */
10709 
10710 			vm_object_deallocate(dst_object);
10711 
10712 			/*
10713 			 *	If a hard error occurred, return it now
10714 			 */
10715 
10716 			if (r != KERN_SUCCESS) {
10717 				return r;
10718 			}
10719 
10720 			if (copy_size != 0) {
10721 				/*
10722 				 *	Dispose of the copied region
10723 				 */
10724 
10725 				vm_map_copy_clip_end(copy, copy_entry,
10726 				    copy_entry->vme_start + copy_size);
10727 				vm_map_copy_entry_unlink(copy, copy_entry);
10728 				vm_object_deallocate(VME_OBJECT(copy_entry));
10729 				vm_map_copy_entry_dispose(copy_entry);
10730 			}
10731 
10732 			/*
10733 			 *	Pick up in the destination map where we left off.
10734 			 *
10735 			 *	Use the version information to avoid a lookup
10736 			 *	in the normal case.
10737 			 */
10738 
10739 			start += copy_size;
10740 			vm_map_lock(dst_map);
10741 			if (version.main_timestamp == dst_map->timestamp &&
10742 			    copy_size != 0) {
10743 				/* We can safely use saved tmp_entry value */
10744 
10745 				if (tmp_entry->map_aligned &&
10746 				    !VM_MAP_PAGE_ALIGNED(
10747 					    start,
10748 					    VM_MAP_PAGE_MASK(dst_map))) {
10749 					/* no longer map-aligned */
10750 					tmp_entry->map_aligned = FALSE;
10751 				}
10752 				vm_map_clip_end(dst_map, tmp_entry, start);
10753 				tmp_entry = tmp_entry->vme_next;
10754 			} else {
10755 				/* Must do lookup of tmp_entry */
10756 
10757 				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
10758 					vm_map_unlock(dst_map);
10759 					return KERN_INVALID_ADDRESS;
10760 				}
10761 				if (tmp_entry->map_aligned &&
10762 				    !VM_MAP_PAGE_ALIGNED(
10763 					    start,
10764 					    VM_MAP_PAGE_MASK(dst_map))) {
10765 					/* no longer map-aligned */
10766 					tmp_entry->map_aligned = FALSE;
10767 				}
10768 				vm_map_clip_start(dst_map, tmp_entry, start);
10769 			}
10770 		}
10771 	}/* while */
10772 
10773 	return KERN_SUCCESS;
10774 }/* vm_map_copy_overwrite_aligned */
10775 
10776 /*
10777  *	Routine: vm_map_copyin_kernel_buffer [internal use only]
10778  *
10779  *	Description:
10780  *		Copy in data to a kernel buffer from space in the
10781  *		source map. The original space may be optionally
10782  *		deallocated.
10783  *
10784  *		If successful, returns a new copy object.
10785  */
10786 static kern_return_t
vm_map_copyin_kernel_buffer(vm_map_t src_map,vm_map_offset_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)10787 vm_map_copyin_kernel_buffer(
10788 	vm_map_t        src_map,
10789 	vm_map_offset_t src_addr,
10790 	vm_map_size_t   len,
10791 	boolean_t       src_destroy,
10792 	vm_map_copy_t   *copy_result)
10793 {
10794 	kern_return_t kr;
10795 	vm_map_copy_t copy;
10796 
10797 	if (len > msg_ool_size_small) {
10798 		return KERN_INVALID_ARGUMENT;
10799 	}
10800 
10801 	copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
10802 	copy->cpy_kdata = kalloc_data(len, Z_WAITOK);
10803 	if (copy->cpy_kdata == NULL) {
10804 		zfree_id(ZONE_ID_VM_MAP_COPY, copy);
10805 		return KERN_RESOURCE_SHORTAGE;
10806 	}
10807 
10808 	copy->type = VM_MAP_COPY_KERNEL_BUFFER;
10809 	copy->size = len;
10810 	copy->offset = 0;
10811 
10812 	kr = copyinmap(src_map, src_addr, copy->cpy_kdata, (vm_size_t)len);
10813 	if (kr != KERN_SUCCESS) {
10814 		kfree_data(copy->cpy_kdata, len);
10815 		zfree_id(ZONE_ID_VM_MAP_COPY, copy);
10816 		return kr;
10817 	}
10818 
10819 	if (src_destroy) {
10820 		vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
10821 
10822 		if (src_map == kernel_map) {
10823 			flags |= VM_MAP_REMOVE_KUNWIRE;
10824 		}
10825 
10826 		(void)vm_map_remove_guard(src_map,
10827 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
10828 		    vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
10829 		    flags, KMEM_GUARD_NONE);
10830 	}
10831 
10832 	*copy_result = copy;
10833 	return KERN_SUCCESS;
10834 }
10835 
10836 /*
10837  *	Routine: vm_map_copyout_kernel_buffer	[internal use only]
10838  *
10839  *	Description:
10840  *		Copy out data from a kernel buffer into space in the
10841  *		destination map. The space may be otpionally dynamically
10842  *		allocated.
10843  *
10844  *		If successful, consumes the copy object.
10845  *		Otherwise, the caller is responsible for it.
10846  *
10847  *		Callers of this function must call vm_map_copy_require on
10848  *		previously created vm_map_copy_t or pass a newly created
10849  *		one to ensure that it hasn't been forged.
10850  */
/* statistics: number of copyout() failures into a foreign (non-current) map */
static int vm_map_copyout_kernel_buffer_failures = 0;
static kern_return_t
vm_map_copyout_kernel_buffer(
	vm_map_t                map,
	vm_map_address_t        *addr,  /* IN/OUT */
	vm_map_copy_t           copy,
	vm_map_size_t           copy_size,
	boolean_t               overwrite,
	boolean_t               consume_on_success)
{
	kern_return_t kr = KERN_SUCCESS;
	thread_t thread = current_thread();

	assert(copy->size == copy_size);

	/*
	 * check for corrupted vm_map_copy structure
	 * (a kernel-buffer copy is always small and has offset 0;
	 * anything else means the object was forged or trashed)
	 */
	if (copy_size > msg_ool_size_small || copy->offset) {
		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
		    (long long)copy->size, (long long)copy->offset);
	}

	if (!overwrite) {
		/*
		 * Allocate space in the target map for the data
		 */
		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;

		if (map == kernel_map) {
			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
		}
		*addr = 0;
		kr = vm_map_enter(map,
		    addr,
		    vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(map)),
		    (vm_map_offset_t) 0,
		    VM_FLAGS_ANYWHERE,
		    vmk_flags,
		    VM_KERN_MEMORY_NONE,
		    VM_OBJECT_NULL,
		    (vm_object_offset_t) 0,
		    FALSE,
		    VM_PROT_DEFAULT,
		    VM_PROT_ALL,
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
#if KASAN
		if (map->pmap == kernel_pmap) {
			kasan_notify_address(*addr, copy->size);
		}
#endif
	}

	/*
	 * Copyout the data from the kernel buffer to the target map.
	 */
	if (thread->map == map) {
		/*
		 * If the target map is the current map, just do
		 * the copy.
		 */
		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_t oldmap;

		/*
		 * If the target map is another map, assume the
		 * target's address space identity for the duration
		 * of the copy.
		 */
		vm_map_reference(map);
		oldmap = vm_map_switch(map);

		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			vm_map_copyout_kernel_buffer_failures++;
			kr = KERN_INVALID_ADDRESS;
		}

		/* restore our original address space identity */
		(void) vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}

	if (kr != KERN_SUCCESS) {
		/* the copy failed, clean up */
		if (!overwrite) {
			/*
			 * Deallocate the space we allocated in the target map.
			 */
			(void) vm_map_remove(map,
			    vm_map_trunc_page(*addr,
			    VM_MAP_PAGE_MASK(map)),
			    vm_map_round_page((*addr +
			    vm_map_round_page(copy_size,
			    VM_MAP_PAGE_MASK(map))),
			    VM_MAP_PAGE_MASK(map)));
			*addr = 0;
		}
	} else {
		/* copy was successful, discard the copy structure */
		if (consume_on_success) {
			kfree_data(copy->cpy_kdata, copy_size);
			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
		}
	}

	return kr;
}
10966 
10967 /*
10968  *	Routine:	vm_map_copy_insert      [internal use only]
10969  *
10970  *	Description:
10971  *		Link a copy chain ("copy") into a map at the
10972  *		specified location (after "where").
10973  *
10974  *		Callers of this function must call vm_map_copy_require on
10975  *		previously created vm_map_copy_t or pass a newly created
10976  *		one to ensure that it hasn't been forged.
10977  *	Side effects:
10978  *		The copy chain is destroyed.
10979  */
10980 static void
vm_map_copy_insert(vm_map_t map,vm_map_entry_t after_where,vm_map_copy_t copy)10981 vm_map_copy_insert(
10982 	vm_map_t        map,
10983 	vm_map_entry_t  after_where,
10984 	vm_map_copy_t   copy)
10985 {
10986 	vm_map_entry_t  entry;
10987 
10988 	while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
10989 		entry = vm_map_copy_first_entry(copy);
10990 		vm_map_copy_entry_unlink(copy, entry);
10991 		vm_map_store_entry_link(map, after_where, entry,
10992 		    VM_MAP_KERNEL_FLAGS_NONE);
10993 		after_where = entry;
10994 	}
10995 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
10996 }
10997 
10998 /*
10999  * Callers of this function must call vm_map_copy_require on
11000  * previously created vm_map_copy_t or pass a newly created
11001  * one to ensure that it hasn't been forged.
11002  */
11003 void
vm_map_copy_remap(vm_map_t map,vm_map_entry_t where,vm_map_copy_t copy,vm_map_offset_t adjustment,vm_prot_t cur_prot,vm_prot_t max_prot,vm_inherit_t inheritance)11004 vm_map_copy_remap(
11005 	vm_map_t        map,
11006 	vm_map_entry_t  where,
11007 	vm_map_copy_t   copy,
11008 	vm_map_offset_t adjustment,
11009 	vm_prot_t       cur_prot,
11010 	vm_prot_t       max_prot,
11011 	vm_inherit_t    inheritance)
11012 {
11013 	vm_map_entry_t  copy_entry, new_entry;
11014 
11015 	for (copy_entry = vm_map_copy_first_entry(copy);
11016 	    copy_entry != vm_map_copy_to_entry(copy);
11017 	    copy_entry = copy_entry->vme_next) {
11018 		/* get a new VM map entry for the map */
11019 		new_entry = vm_map_entry_create(map);
11020 		/* copy the "copy entry" to the new entry */
11021 		vm_map_entry_copy(map, new_entry, copy_entry);
11022 		/* adjust "start" and "end" */
11023 		new_entry->vme_start += adjustment;
11024 		new_entry->vme_end += adjustment;
11025 		/* clear some attributes */
11026 		new_entry->inheritance = inheritance;
11027 		new_entry->protection = cur_prot;
11028 		new_entry->max_protection = max_prot;
11029 		new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11030 		/* take an extra reference on the entry's "object" */
11031 		if (new_entry->is_sub_map) {
11032 			assert(!new_entry->use_pmap); /* not nested */
11033 			vm_map_reference(VME_SUBMAP(new_entry));
11034 		} else {
11035 			vm_object_reference(VME_OBJECT(new_entry));
11036 		}
11037 		/* insert the new entry in the map */
11038 		vm_map_store_entry_link(map, where, new_entry,
11039 		    VM_MAP_KERNEL_FLAGS_NONE);
11040 		/* continue inserting the "copy entries" after the new entry */
11041 		where = new_entry;
11042 	}
11043 }
11044 
11045 
11046 /*
11047  * Returns true if *size matches (or is in the range of) copy->size.
11048  * Upon returning true, the *size field is updated with the actual size of the
11049  * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11050  */
11051 boolean_t
vm_map_copy_validate_size(vm_map_t dst_map,vm_map_copy_t copy,vm_map_size_t * size)11052 vm_map_copy_validate_size(
11053 	vm_map_t                dst_map,
11054 	vm_map_copy_t           copy,
11055 	vm_map_size_t           *size)
11056 {
11057 	if (copy == VM_MAP_COPY_NULL) {
11058 		return FALSE;
11059 	}
11060 
11061 	/*
11062 	 * Assert that the vm_map_copy is coming from the right
11063 	 * zone and hasn't been forged
11064 	 */
11065 	vm_map_copy_require(copy);
11066 
11067 	vm_map_size_t copy_sz = copy->size;
11068 	vm_map_size_t sz = *size;
11069 	switch (copy->type) {
11070 	case VM_MAP_COPY_OBJECT:
11071 	case VM_MAP_COPY_KERNEL_BUFFER:
11072 		if (sz == copy_sz) {
11073 			return TRUE;
11074 		}
11075 		break;
11076 	case VM_MAP_COPY_ENTRY_LIST:
11077 		/*
11078 		 * potential page-size rounding prevents us from exactly
11079 		 * validating this flavor of vm_map_copy, but we can at least
11080 		 * assert that it's within a range.
11081 		 */
11082 		if (copy_sz >= sz &&
11083 		    copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11084 			*size = copy_sz;
11085 			return TRUE;
11086 		}
11087 		break;
11088 	default:
11089 		break;
11090 	}
11091 	return FALSE;
11092 }
11093 
11094 /*
11095  *	Routine:	vm_map_copyout_size
11096  *
11097  *	Description:
11098  *		Copy out a copy chain ("copy") into newly-allocated
11099  *		space in the destination map. Uses a prevalidated
11100  *		size for the copy object (vm_map_copy_validate_size).
11101  *
11102  *		If successful, consumes the copy object.
11103  *		Otherwise, the caller is responsible for it.
11104  */
11105 kern_return_t
vm_map_copyout_size(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_t copy_size)11106 vm_map_copyout_size(
11107 	vm_map_t                dst_map,
11108 	vm_map_address_t        *dst_addr,      /* OUT */
11109 	vm_map_copy_t           copy,
11110 	vm_map_size_t           copy_size)
11111 {
11112 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11113 	           TRUE,                     /* consume_on_success */
11114 	           VM_PROT_DEFAULT,
11115 	           VM_PROT_ALL,
11116 	           VM_INHERIT_DEFAULT);
11117 }
11118 
11119 /*
11120  *	Routine:	vm_map_copyout
11121  *
11122  *	Description:
11123  *		Copy out a copy chain ("copy") into newly-allocated
11124  *		space in the destination map.
11125  *
11126  *		If successful, consumes the copy object.
11127  *		Otherwise, the caller is responsible for it.
11128  */
11129 kern_return_t
vm_map_copyout(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy)11130 vm_map_copyout(
11131 	vm_map_t                dst_map,
11132 	vm_map_address_t        *dst_addr,      /* OUT */
11133 	vm_map_copy_t           copy)
11134 {
11135 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11136 	           TRUE,                     /* consume_on_success */
11137 	           VM_PROT_DEFAULT,
11138 	           VM_PROT_ALL,
11139 	           VM_INHERIT_DEFAULT);
11140 }
11141 
/*
 *	Routine:	vm_map_copyout_internal
 *
 *	Description:
 *		Copy out a copy chain ("copy") into newly-allocated
 *		space in the destination map, applying the requested
 *		protections and inheritance.
 *
 *		If successful and "consume_on_success" is TRUE, consumes
 *		the copy object; otherwise the caller retains ownership.
 */
kern_return_t
vm_map_copyout_internal(
	vm_map_t                dst_map,
	vm_map_address_t        *dst_addr,      /* OUT */
	vm_map_copy_t           copy,
	vm_map_size_t           copy_size,
	boolean_t               consume_on_success,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance)
{
	vm_map_size_t           size;
	vm_map_size_t           adjustment;
	vm_map_offset_t         start;
	vm_object_offset_t      vm_copy_start;
	vm_map_entry_t          last;
	vm_map_entry_t          entry;
	vm_map_copy_t           original_copy;
	kern_return_t           kr;
	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;

	/*
	 *	Check for null copy object.
	 */

	if (copy == VM_MAP_COPY_NULL) {
		*dst_addr = 0;
		return KERN_SUCCESS;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/* caller's size must agree with the copy object's own size */
	if (copy->size != copy_size) {
		*dst_addr = 0;
		return KERN_FAILURE;
	}

	/*
	 *	Check for special copy object, created
	 *	by vm_map_copyin_object.
	 */

	if (copy->type == VM_MAP_COPY_OBJECT) {
		vm_object_t             object = copy->cpy_object;
		vm_object_offset_t      offset;

		/* map the whole pages covering the (possibly unaligned) range */
		offset = vm_object_trunc_page(copy->offset);
		size = vm_map_round_page((copy_size +
		    (vm_map_size_t)(copy->offset -
		    offset)),
		    VM_MAP_PAGE_MASK(dst_map));
		*dst_addr = 0;
		kr = vm_map_enter(dst_map, dst_addr, size,
		    (vm_map_offset_t) 0, VM_FLAGS_ANYWHERE,
		    VM_MAP_KERNEL_FLAGS_NONE,
		    VM_KERN_MEMORY_NONE,
		    object, offset, FALSE,
		    VM_PROT_DEFAULT, VM_PROT_ALL,
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		/* Account for non-pagealigned copy object */
		*dst_addr += (vm_map_offset_t)(copy->offset - offset);
		if (consume_on_success) {
			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
		}
		return KERN_SUCCESS;
	}

	/*
	 *	Check for special kernel buffer allocated
	 *	by new_ipc_kmsg_copyin.
	 */

	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		return vm_map_copyout_kernel_buffer(dst_map, dst_addr,
		           copy, copy_size, FALSE,
		           consume_on_success);
	}

	/* remember the caller's copy in case we substitute an adjusted one */
	original_copy = copy;
	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
		/* page sizes differ: re-express the copy in dst_map's page size */
		vm_map_copy_t target_copy;
		vm_map_offset_t overmap_start, overmap_end, trimmed_start;

		target_copy = VM_MAP_COPY_NULL;
		DEBUG4K_ADJUST("adjusting...\n");
		kr = vm_map_copy_adjust_to_target(
			copy,
			0, /* offset */
			copy->size, /* size */
			dst_map,
			TRUE, /* copy */
			&target_copy,
			&overmap_start,
			&overmap_end,
			&trimmed_start);
		if (kr != KERN_SUCCESS) {
			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
			return kr;
		}
		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
		if (target_copy != copy) {
			copy = target_copy;
		}
		copy_size = copy->size;
	}

	/*
	 *	Find space for the data
	 */

	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
	    VM_MAP_COPY_PAGE_MASK(copy));
	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
	    VM_MAP_COPY_PAGE_MASK(copy))
	    - vm_copy_start;


	if (dst_map == kernel_map) {
		vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
	}

	vm_map_lock(dst_map);
	kr = vm_map_locate_space(dst_map, size, 0, vmk_flags,
	    &start, &last);
	if (kr != KERN_SUCCESS) {
		vm_map_unlock(dst_map);
		return kr;
	}

	/* delta to rebase every copy entry at the chosen destination address */
	adjustment = start - vm_copy_start;
	if (!consume_on_success) {
		/*
		 * We're not allowed to consume "copy", so we'll have to
		 * copy its map entries into the destination map below.
		 * No need to re-allocate map entries from the correct
		 * (pageable or not) zone, since we'll get new map entries
		 * during the transfer.
		 * We'll also adjust the map entries's "start" and "end"
		 * during the transfer, to keep "copy"'s entries consistent
		 * with its "offset".
		 */
		goto after_adjustments;
	}

	/*
	 *	Since we're going to just drop the map
	 *	entries from the copy into the destination
	 *	map, they must come from the same pool.
	 */

	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
		/*
		 * Mismatches occur when dealing with the default
		 * pager.
		 */
		vm_map_entry_t  next, new;

		/*
		 * Find the zone that the copies were allocated from
		 */

		entry = vm_map_copy_first_entry(copy);

		/*
		 * Reinitialize the copy so that vm_map_copy_entry_link
		 * will work.
		 */
		vm_map_store_copy_reset(copy, entry);
		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;

		/*
		 * Copy each entry.
		 */
		while (entry != vm_map_copy_to_entry(copy)) {
			new = vm_map_copy_entry_create(copy);
			vm_map_entry_copy_full(new, entry);
			new->vme_no_copy_on_read = FALSE;
			assert(!new->iokit_acct);
			if (new->is_sub_map) {
				/* clr address space specifics */
				new->use_pmap = FALSE;
			}
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    new);
			next = entry->vme_next;
			/* dispose of the old entry from the other zone */
			vm_map_entry_dispose(entry);
			entry = next;
		}
	}

	/*
	 *	Adjust the addresses in the copy chain, and
	 *	reset the region attributes.
	 */

	for (entry = vm_map_copy_first_entry(copy);
	    entry != vm_map_copy_to_entry(copy);
	    entry = entry->vme_next) {
		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
			/*
			 * We're injecting this copy entry into a map that
			 * has the standard page alignment, so clear
			 * "map_aligned" (which might have been inherited
			 * from the original map entry).
			 */
			entry->map_aligned = FALSE;
		}

		entry->vme_start += adjustment;
		entry->vme_end += adjustment;

		if (entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
			    VM_MAP_PAGE_MASK(dst_map)));
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
			    VM_MAP_PAGE_MASK(dst_map)));
		}

		entry->inheritance = VM_INHERIT_DEFAULT;
		entry->protection = VM_PROT_DEFAULT;
		entry->max_protection = VM_PROT_ALL;
		entry->behavior = VM_BEHAVIOR_DEFAULT;

		/*
		 * If the entry is now wired,
		 * map the pages into the destination map.
		 */
		if (entry->wired_count != 0) {
			vm_map_offset_t va;
			vm_object_offset_t       offset;
			vm_object_t object;
			vm_prot_t prot;
			int     type_of_fault;

			/* TODO4K would need to use actual page size */
			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);

			object = VME_OBJECT(entry);
			offset = VME_OFFSET(entry);
			va = entry->vme_start;

			pmap_pageable(dst_map->pmap,
			    entry->vme_start,
			    entry->vme_end,
			    TRUE);

			while (va < entry->vme_end) {
				vm_page_t       m;
				struct vm_object_fault_info fault_info = {};

				/*
				 * Look up the page in the object.
				 * Assert that the page will be found in the
				 * top object:
				 * either
				 *	the object was newly created by
				 *	vm_object_copy_slowly, and has
				 *	copies of all of the pages from
				 *	the source object
				 * or
				 *	the object was moved from the old
				 *	map entry; because the old map
				 *	entry was wired, all of the pages
				 *	were in the top-level object.
				 *	(XXX not true if we wire pages for
				 *	 reading)
				 */
				vm_object_lock(object);

				m = vm_page_lookup(object, offset);
				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
				    m->vmp_absent) {
					panic("vm_map_copyout: wiring %p", m);
				}

				prot = entry->protection;

				if (override_nx(dst_map, VME_ALIAS(entry)) &&
				    prot) {
					prot |= VM_PROT_EXECUTE;
				}

				type_of_fault = DBG_CACHE_HIT_FAULT;

				fault_info.user_tag = VME_ALIAS(entry);
				fault_info.pmap_options = 0;
				if (entry->iokit_acct ||
				    (!entry->is_sub_map && !entry->use_pmap)) {
					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
				}

				/* enter the wired page into the destination pmap */
				vm_fault_enter(m,
				    dst_map->pmap,
				    va,
				    PAGE_SIZE, 0,
				    prot,
				    prot,
				    VM_PAGE_WIRED(m),
				    FALSE,            /* change_wiring */
				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
				    &fault_info,
				    NULL,             /* need_retry */
				    &type_of_fault);

				vm_object_unlock(object);

				offset += PAGE_SIZE_64;
				va += PAGE_SIZE;
			}
		}
	}

after_adjustments:

	/*
	 *	Correct the page alignment for the result
	 */

	*dst_addr = start + (copy->offset - vm_copy_start);

#if KASAN
	kasan_notify_address(*dst_addr, size);
#endif

	/*
	 *	Update the hints and the map size
	 */

	if (consume_on_success) {
		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
	} else {
		SAVE_HINT_MAP_WRITE(dst_map, last);
	}

	dst_map->size += size;

	/*
	 *	Link in the copy
	 */

	if (consume_on_success) {
		vm_map_copy_insert(dst_map, last, copy);
		if (copy != original_copy) {
			/* the adjusted copy was consumed; drop the original */
			vm_map_copy_discard(original_copy);
			original_copy = VM_MAP_COPY_NULL;
		}
	} else {
		vm_map_copy_remap(dst_map, last, copy, adjustment,
		    cur_protection, max_protection,
		    inheritance);
		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
			/* keep the caller's original; drop our adjusted clone */
			vm_map_copy_discard(copy);
			copy = original_copy;
		}
	}


	vm_map_unlock(dst_map);

	/*
	 * XXX	If wiring_required, call vm_map_pageable
	 */

	return KERN_SUCCESS;
}
11515 
11516 /*
11517  *	Routine:	vm_map_copyin
11518  *
11519  *	Description:
11520  *		see vm_map_copyin_common.  Exported via Unsupported.exports.
11521  *
11522  */
11523 
11524 #undef vm_map_copyin
11525 
11526 kern_return_t
vm_map_copyin(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)11527 vm_map_copyin(
11528 	vm_map_t                        src_map,
11529 	vm_map_address_t        src_addr,
11530 	vm_map_size_t           len,
11531 	boolean_t                       src_destroy,
11532 	vm_map_copy_t           *copy_result)   /* OUT */
11533 {
11534 	return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11535 	           FALSE, copy_result, FALSE);
11536 }
11537 
11538 /*
11539  *	Routine:	vm_map_copyin_common
11540  *
11541  *	Description:
11542  *		Copy the specified region (src_addr, len) from the
11543  *		source address space (src_map), possibly removing
11544  *		the region from the source address space (src_destroy).
11545  *
11546  *	Returns:
11547  *		A vm_map_copy_t object (copy_result), suitable for
11548  *		insertion into another address space (using vm_map_copyout),
11549  *		copying over another address space region (using
11550  *		vm_map_copy_overwrite).  If the copy is unused, it
11551  *		should be destroyed (using vm_map_copy_discard).
11552  *
11553  *	In/out conditions:
11554  *		The source map should not be locked on entry.
11555  */
11556 
/*
 * Per-level record kept by vm_map_copyin_internal() when it descends
 * into a submap ("parent_maps" list), so it can resume copying in the
 * parent map afterwards.
 */
typedef struct submap_map {
	vm_map_t        parent_map;     /* map we descended from */
	vm_map_offset_t base_start;     /* start of the range in parent_map */
	vm_map_offset_t base_end;       /* end of the range in parent_map */
	vm_map_size_t   base_len;       /* length of that range */
	struct submap_map *next;        /* next (outer) level, if any */
} submap_map_t;
11564 
11565 kern_return_t
vm_map_copyin_common(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,__unused boolean_t src_volatile,vm_map_copy_t * copy_result,boolean_t use_maxprot)11566 vm_map_copyin_common(
11567 	vm_map_t        src_map,
11568 	vm_map_address_t src_addr,
11569 	vm_map_size_t   len,
11570 	boolean_t       src_destroy,
11571 	__unused boolean_t      src_volatile,
11572 	vm_map_copy_t   *copy_result,   /* OUT */
11573 	boolean_t       use_maxprot)
11574 {
11575 	int flags;
11576 
11577 	flags = 0;
11578 	if (src_destroy) {
11579 		flags |= VM_MAP_COPYIN_SRC_DESTROY;
11580 	}
11581 	if (use_maxprot) {
11582 		flags |= VM_MAP_COPYIN_USE_MAXPROT;
11583 	}
11584 	return vm_map_copyin_internal(src_map,
11585 	           src_addr,
11586 	           len,
11587 	           flags,
11588 	           copy_result);
11589 }
/*
 *	vm_map_copyin_internal:
 *
 *	Copy the region [src_addr, src_addr + len) out of "src_map" into a
 *	newly allocated VM_MAP_COPY_ENTRY_LIST-style vm_map_copy_t, returned
 *	via "*copy_result".  "flags" is a mask of VM_MAP_COPYIN_* bits
 *	(destroy source, use max protection, preserve purgeability, ...).
 *	The source map must NOT be locked on entry.
 *	Returns KERN_SUCCESS, or KERN_INVALID_ARGUMENT / KERN_INVALID_ADDRESS /
 *	KERN_PROTECTION_FAILURE (or an error from the vm_object copy layer).
 */
kern_return_t
vm_map_copyin_internal(
	vm_map_t        src_map,
	vm_map_address_t src_addr,
	vm_map_size_t   len,
	int             flags,
	vm_map_copy_t   *copy_result)   /* OUT */
{
	vm_map_entry_t  tmp_entry;      /* Result of last map lookup --
	                                 * in multi-level lookup, this
	                                 * entry contains the actual
	                                 * vm_object/offset.
	                                 */
	vm_map_entry_t  new_entry = VM_MAP_ENTRY_NULL;  /* Map entry for copy */

	vm_map_offset_t src_start;      /* Start of current entry --
	                                 * where copy is taking place now
	                                 */
	vm_map_offset_t src_end;        /* End of entire region to be
	                                 * copied */
	vm_map_offset_t src_base;
	vm_map_t        base_map = src_map;
	boolean_t       map_share = FALSE;
	submap_map_t    *parent_maps = NULL;    /* LIFO chain of maps above the
	                                         * submap currently being copied */

	vm_map_copy_t   copy;           /* Resulting copy */
	vm_map_address_t copy_addr;
	vm_map_size_t   copy_size;
	boolean_t       src_destroy;
	boolean_t       use_maxprot;
	boolean_t       preserve_purgeable;
	boolean_t       entry_was_shared;
	vm_map_entry_t  saved_src_entry;

	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
		return KERN_INVALID_ARGUMENT;
	}

#if CONFIG_KERNEL_TBI
	if (src_map->pmap == kernel_pmap) {
		src_addr = VM_KERNEL_TBI_FILL(src_addr);
	}
#endif /* CONFIG_KERNEL_TBI */

	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
	preserve_purgeable =
	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;

	/*
	 *	Check for copies of zero bytes.
	 */

	if (len == 0) {
		*copy_result = VM_MAP_COPY_NULL;
		return KERN_SUCCESS;
	}

	/*
	 *	Check that the end address doesn't overflow
	 */
	src_end = src_addr + len;
	if (src_end < src_addr) {
		return KERN_INVALID_ADDRESS;
	}

	/*
	 *	Compute (page aligned) start and end of region
	 */
	src_start = vm_map_trunc_page(src_addr,
	    VM_MAP_PAGE_MASK(src_map));
	src_end = vm_map_round_page(src_end,
	    VM_MAP_PAGE_MASK(src_map));

	/*
	 * If the copy is sufficiently small, use a kernel buffer instead
	 * of making a virtual copy.  The theory being that the cost of
	 * setting up VM (and taking C-O-W faults) dominates the copy costs
	 * for small regions.
	 */
	if ((len <= msg_ool_size_small) &&
	    !use_maxprot &&
	    !preserve_purgeable &&
	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
	    /*
	     * Since the "msg_ool_size_small" threshold was increased and
	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
	     * address space limits, we revert to doing a virtual copy if the
	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
	     * of the commpage would now fail when it used to work.
	     */
	    (src_start >= vm_map_min(src_map) &&
	    src_start < vm_map_max(src_map) &&
	    src_end >= vm_map_min(src_map) &&
	    src_end < vm_map_max(src_map))) {
		return vm_map_copyin_kernel_buffer(src_map, src_addr, len,
		           src_destroy, copy_result);
	}

	/*
	 *	Allocate a header element for the list.
	 *
	 *	Use the start and end in the header to
	 *	remember the endpoints prior to rounding.
	 */

	copy = vm_map_copy_allocate();
	copy->type = VM_MAP_COPY_ENTRY_LIST;
	copy->cpy_hdr.entries_pageable = TRUE;
	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);

	vm_map_store_init( &(copy->cpy_hdr));

	copy->offset = src_addr;
	copy->size = len;

	new_entry = vm_map_copy_entry_create(copy);

	/*
	 * Error-exit convenience macro: unlock and (if it isn't the base map)
	 * release the current submap, dispose of any not-yet-linked copy
	 * entry, discard the partial copy, and drop the references held on
	 * every map in the parent chain before returning "x".
	 */
#define RETURN(x)                                               \
	MACRO_BEGIN                                             \
	vm_map_unlock(src_map);                                 \
	if(src_map != base_map)                                 \
	        vm_map_deallocate(src_map);                     \
	if (new_entry != VM_MAP_ENTRY_NULL)                     \
	        vm_map_copy_entry_dispose(new_entry);           \
	vm_map_copy_discard(copy);                              \
	{                                                       \
	        submap_map_t	*_ptr;                          \
                                                                \
	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
	                parent_maps=parent_maps->next;          \
	                if (_ptr->parent_map != base_map)       \
	                        vm_map_deallocate(_ptr->parent_map);    \
	                kfree_type(submap_map_t, _ptr);         \
	        }                                               \
	}                                                       \
	MACRO_RETURN(x);                                        \
	MACRO_END

	/*
	 *	Find the beginning of the region.
	 */

	vm_map_lock(src_map);

	/*
	 * Lookup the original "src_addr" rather than the truncated
	 * "src_start", in case "src_start" falls in a non-map-aligned
	 * map entry *before* the map entry that contains "src_addr"...
	 */
	if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) {
		RETURN(KERN_INVALID_ADDRESS);
	}
	if (!tmp_entry->is_sub_map) {
		/*
		 * ... but clip to the map-rounded "src_start" rather than
		 * "src_addr" to preserve map-alignment.  We'll adjust the
		 * first copy entry at the end, if needed.
		 */
		vm_map_clip_start(src_map, tmp_entry, src_start);
	}
	if (src_start < tmp_entry->vme_start) {
		/*
		 * Move "src_start" up to the start of the
		 * first map entry to copy.
		 */
		src_start = tmp_entry->vme_start;
	}
	/* set for later submap fix-up */
	copy_addr = src_start;

	/*
	 *	Go through entries until we get to the end.
	 */

	while (TRUE) {
		vm_map_entry_t  src_entry = tmp_entry;  /* Top-level entry */
		vm_map_size_t   src_size;               /* Size of source
		                                         * map entry (in both
		                                         * maps)
		                                         */

		vm_object_t             src_object;     /* Object to copy */
		vm_object_offset_t      src_offset;

		vm_object_t             new_copy_object;/* vm_object_copy_* result */

		boolean_t       src_needs_copy;         /* Should source map
		                                         * be made read-only
		                                         * for copy-on-write?
		                                         */

		boolean_t       new_entry_needs_copy;   /* Will new entry be COW? */

		boolean_t       was_wired;              /* Was source wired? */
		boolean_t       saved_used_for_jit;     /* Saved used_for_jit. */
		vm_map_version_t version;               /* Version before locks
		                                         * dropped to make copy
		                                         */
		kern_return_t   result;                 /* Return value from
		                                         * copy_strategically.
		                                         */
		/*
		 * Descend through any nested submaps, pushing a submap_map_t
		 * per level so we can pop back out, until tmp_entry is a
		 * regular (non-submap) entry.
		 */
		while (tmp_entry->is_sub_map) {
			vm_map_size_t submap_len;
			submap_map_t *ptr;

			ptr = kalloc_type(submap_map_t, Z_WAITOK);
			ptr->next = parent_maps;
			parent_maps = ptr;
			ptr->parent_map = src_map;
			ptr->base_start = src_start;
			ptr->base_end = src_end;
			submap_len = tmp_entry->vme_end - src_start;
			if (submap_len > (src_end - src_start)) {
				submap_len = src_end - src_start;
			}
			ptr->base_len = submap_len;

			src_start -= tmp_entry->vme_start;
			src_start += VME_OFFSET(tmp_entry);
			src_end = src_start + submap_len;
			src_map = VME_SUBMAP(tmp_entry);
			vm_map_lock(src_map);
			/* keep an outstanding reference for all maps in */
			/* the parents tree except the base map */
			vm_map_reference(src_map);
			vm_map_unlock(ptr->parent_map);
			if (!vm_map_lookup_entry(
				    src_map, src_start, &tmp_entry)) {
				RETURN(KERN_INVALID_ADDRESS);
			}
			map_share = TRUE;
			if (!tmp_entry->is_sub_map) {
				vm_map_clip_start(src_map, tmp_entry, src_start);
			}
			src_entry = tmp_entry;
		}
		/* we are now in the lowest level submap... */

		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
			/* This is not supported for now.  In the future we */
			/* will need to detect the phys_contig condition    */
			/* and then upgrade copy_slowly to do a physical    */
			/* copy from the device-memory-based object.  We    */
			/* can piggy-back off of the was_wired boolean to   */
			/* set up the proper handling.                      */
			RETURN(KERN_PROTECTION_FAILURE);
		}
		/*
		 *	Create a new address map entry to hold the result.
		 *	Fill in the fields from the appropriate source entries.
		 *	We must unlock the source map to do this if we need
		 *	to allocate a map entry.
		 */
		if (new_entry == VM_MAP_ENTRY_NULL) {
			version.main_timestamp = src_map->timestamp;
			vm_map_unlock(src_map);

			new_entry = vm_map_copy_entry_create(copy);

			vm_map_lock(src_map);
			/* map changed while unlocked: re-do the lookup */
			if ((version.main_timestamp + 1) != src_map->timestamp) {
				if (!vm_map_lookup_entry(src_map, src_start,
				    &tmp_entry)) {
					RETURN(KERN_INVALID_ADDRESS);
				}
				if (!tmp_entry->is_sub_map) {
					vm_map_clip_start(src_map, tmp_entry, src_start);
				}
				continue; /* restart w/ new tmp_entry */
			}
		}

		/*
		 *	Verify that the region can be read.
		 */
		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
		    !use_maxprot) ||
		    (src_entry->max_protection & VM_PROT_READ) == 0) {
			RETURN(KERN_PROTECTION_FAILURE);
		}

		/*
		 *	Clip against the endpoints of the entire region.
		 */

		vm_map_clip_end(src_map, src_entry, src_end);

		src_size = src_entry->vme_end - src_start;
		src_object = VME_OBJECT(src_entry);
		src_offset = VME_OFFSET(src_entry);
		was_wired = (src_entry->wired_count != 0);

		vm_map_entry_copy(src_map, new_entry, src_entry);
		if (new_entry->is_sub_map) {
			/* clr address space specifics */
			new_entry->use_pmap = FALSE;
		} else {
			/*
			 * We're dealing with a copy-on-write operation,
			 * so the resulting mapping should not inherit the
			 * original mapping's accounting settings.
			 * "iokit_acct" should have been cleared in
			 * vm_map_entry_copy().
			 * "use_pmap" should be reset to its default (TRUE)
			 * so that the new mapping gets accounted for in
			 * the task's memory footprint.
			 */
			assert(!new_entry->iokit_acct);
			new_entry->use_pmap = TRUE;
		}

		/*
		 *	Attempt non-blocking copy-on-write optimizations.
		 */

		/*
		 * If we are destroying the source, and the object
		 * is internal, we could move the object reference
		 * from the source to the copy.  The copy is
		 * copy-on-write only if the source is.
		 * We make another reference to the object, because
		 * destroying the source entry will deallocate it.
		 *
		 * This memory transfer has to be atomic, (to prevent
		 * the VM object from being shared or copied while
		 * it's being moved here), so we could only do this
		 * if we won't have to unlock the VM map until the
		 * original mapping has been fully removed.
		 */

RestartCopy:
		if ((src_object == VM_OBJECT_NULL ||
		    (!was_wired && !map_share && !tmp_entry->is_shared
		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
		    vm_object_copy_quickly(
			    VME_OBJECT(new_entry),
			    src_offset,
			    src_size,
			    &src_needs_copy,
			    &new_entry_needs_copy)) {
			new_entry->needs_copy = new_entry_needs_copy;

			/*
			 *	Handle copy-on-write obligations
			 */

			if (src_needs_copy && !tmp_entry->needs_copy) {
				vm_prot_t prot;

				prot = src_entry->protection & ~VM_PROT_WRITE;

				if (override_nx(src_map, VME_ALIAS(src_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				vm_object_pmap_protect(
					src_object,
					src_offset,
					src_size,
					(src_entry->is_shared ?
					PMAP_NULL
					: src_map->pmap),
					VM_MAP_PAGE_SIZE(src_map),
					src_entry->vme_start,
					prot);

				assert(tmp_entry->wired_count == 0);
				tmp_entry->needs_copy = TRUE;
			}

			/*
			 *	The map has never been unlocked, so it's safe
			 *	to move to the next entry rather than doing
			 *	another lookup.
			 */

			goto CopySuccessful;
		}

		entry_was_shared = tmp_entry->is_shared;

		/*
		 *	Take an object reference, so that we may
		 *	release the map lock(s).
		 */

		assert(src_object != VM_OBJECT_NULL);
		vm_object_reference(src_object);

		/*
		 *	Record the timestamp for later verification.
		 *	Unlock the map.
		 */

		version.main_timestamp = src_map->timestamp;
		vm_map_unlock(src_map); /* Increments timestamp once! */
		saved_src_entry = src_entry;
		/* map entries may move/vanish while unlocked: drop pointers */
		tmp_entry = VM_MAP_ENTRY_NULL;
		src_entry = VM_MAP_ENTRY_NULL;

		/*
		 *	Perform the copy
		 */

		if (was_wired ||
		    (debug4k_no_cow_copyin &&
		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
CopySlowly:
			vm_object_lock(src_object);
			result = vm_object_copy_slowly(
				src_object,
				src_offset,
				src_size,
				THREAD_UNINT,
				&new_copy_object);
			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
			saved_used_for_jit = new_entry->used_for_jit;
			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
			new_entry->used_for_jit = saved_used_for_jit;
			VME_OFFSET_SET(new_entry,
			    src_offset - vm_object_trunc_page(src_offset));
			new_entry->needs_copy = FALSE;
		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
		    (entry_was_shared || map_share)) {
			vm_object_t new_object;

			vm_object_lock_shared(src_object);
			new_object = vm_object_copy_delayed(
				src_object,
				src_offset,
				src_size,
				TRUE);
			if (new_object == VM_OBJECT_NULL) {
				goto CopySlowly;
			}

			VME_OBJECT_SET(new_entry, new_object, false, 0);
			assert(new_entry->wired_count == 0);
			new_entry->needs_copy = TRUE;
			assert(!new_entry->iokit_acct);
			assert(new_object->purgable == VM_PURGABLE_DENY);
			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
			result = KERN_SUCCESS;
		} else {
			vm_object_offset_t new_offset;
			new_offset = VME_OFFSET(new_entry);
			result = vm_object_copy_strategically(src_object,
			    src_offset,
			    src_size,
			    &new_copy_object,
			    &new_offset,
			    &new_entry_needs_copy);
			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
			saved_used_for_jit = new_entry->used_for_jit;
			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
			new_entry->used_for_jit = saved_used_for_jit;
			if (new_offset != VME_OFFSET(new_entry)) {
				VME_OFFSET_SET(new_entry, new_offset);
			}

			new_entry->needs_copy = new_entry_needs_copy;
		}

		if (result == KERN_SUCCESS &&
		    ((preserve_purgeable &&
		    src_object->purgable != VM_PURGABLE_DENY) ||
		    new_entry->used_for_jit)) {
			/*
			 * Purgeable objects should be COPY_NONE, true share;
			 * this should be propagated to the copy.
			 *
			 * Also force mappings the pmap specially protects to
			 * be COPY_NONE; trying to COW these mappings would
			 * change the effective protections, which could have
			 * side effects if the pmap layer relies on the
			 * specified protections.
			 */

			vm_object_t     new_object;

			new_object = VME_OBJECT(new_entry);
			assert(new_object != src_object);
			vm_object_lock(new_object);
			assert(new_object->ref_count == 1);
			assert(new_object->shadow == VM_OBJECT_NULL);
			assert(new_object->copy == VM_OBJECT_NULL);
			assert(new_object->vo_owner == NULL);

			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

			if (preserve_purgeable &&
			    src_object->purgable != VM_PURGABLE_DENY) {
				new_object->true_share = TRUE;

				/* start as non-volatile with no owner... */
				new_object->purgable = VM_PURGABLE_NONVOLATILE;
				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
				/* ... and move to src_object's purgeable state */
				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
					int state;
					state = src_object->purgable;
					vm_object_purgable_control(
						new_object,
						VM_PURGABLE_SET_STATE_FROM_KERNEL,
						&state);
				}
				/* no pmap accounting for purgeable objects */
				new_entry->use_pmap = FALSE;
			}

			vm_object_unlock(new_object);
			new_object = VM_OBJECT_NULL;
		}

		if (result != KERN_SUCCESS &&
		    result != KERN_MEMORY_RESTART_COPY) {
			vm_map_lock(src_map);
			RETURN(result);
		}

		/*
		 *	Throw away the extra reference
		 */

		vm_object_deallocate(src_object);

		/*
		 *	Verify that the map has not substantially
		 *	changed while the copy was being made.
		 */

		vm_map_lock(src_map);

		if ((version.main_timestamp + 1) == src_map->timestamp) {
			/* src_map hasn't changed: src_entry is still valid */
			src_entry = saved_src_entry;
			goto VerificationSuccessful;
		}

		/*
		 *	Simple version comparison failed.
		 *
		 *	Retry the lookup and verify that the
		 *	same object/offset are still present.
		 *
		 *	[Note: a memory manager that colludes with
		 *	the calling task can detect that we have
		 *	cheated.  While the map was unlocked, the
		 *	mapping could have been changed and restored.]
		 */

		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
			if (result != KERN_MEMORY_RESTART_COPY) {
				vm_object_deallocate(VME_OBJECT(new_entry));
				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
				/* reset accounting state */
				new_entry->iokit_acct = FALSE;
				new_entry->use_pmap = TRUE;
			}
			RETURN(KERN_INVALID_ADDRESS);
		}

		src_entry = tmp_entry;
		vm_map_clip_start(src_map, src_entry, src_start);

		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
		    !use_maxprot) ||
		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
			goto VerificationFailed;
		}

		if (src_entry->vme_end < new_entry->vme_end) {
			/*
			 * This entry might have been shortened
			 * (vm_map_clip_end) or been replaced with
			 * an entry that ends closer to "src_start"
			 * than before.
			 * Adjust "new_entry" accordingly; copying
			 * less memory would be correct but we also
			 * redo the copy (see below) if the new entry
			 * no longer points at the same object/offset.
			 */
			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
			    VM_MAP_COPY_PAGE_MASK(copy)));
			new_entry->vme_end = src_entry->vme_end;
			src_size = new_entry->vme_end - src_start;
		} else if (src_entry->vme_end > new_entry->vme_end) {
			/*
			 * This entry might have been extended
			 * (vm_map_entry_simplify() or coalesce)
			 * or been replaced with an entry that ends farther
			 * from "src_start" than before.
			 *
			 * We've called vm_object_copy_*() only on
			 * the previous <start:end> range, so we can't
			 * just extend new_entry.  We have to re-do
			 * the copy based on the new entry as if it was
			 * pointing at a different object/offset (see
			 * "Verification failed" below).
			 */
		}

		if ((VME_OBJECT(src_entry) != src_object) ||
		    (VME_OFFSET(src_entry) != src_offset) ||
		    (src_entry->vme_end > new_entry->vme_end)) {
			/*
			 *	Verification failed.
			 *
			 *	Start over with this top-level entry.
			 */

VerificationFailed:     ;

			vm_object_deallocate(VME_OBJECT(new_entry));
			tmp_entry = src_entry;
			continue;
		}

		/*
		 *	Verification succeeded.
		 */

VerificationSuccessful:;

		if (result == KERN_MEMORY_RESTART_COPY) {
			goto RestartCopy;
		}

		/*
		 *	Copy succeeded.
		 */

CopySuccessful: ;

		/*
		 *	Link in the new copy entry.
		 */

		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
		    new_entry);

		/*
		 *	Determine whether the entire region
		 *	has been copied.
		 */
		src_base = src_start;
		src_start = new_entry->vme_end;
		new_entry = VM_MAP_ENTRY_NULL;
		/* pop back out of any submaps whose range is fully copied */
		while ((src_start >= src_end) && (src_end != 0)) {
			submap_map_t    *ptr;

			if (src_map == base_map) {
				/* back to the top */
				break;
			}

			ptr = parent_maps;
			assert(ptr != NULL);
			parent_maps = parent_maps->next;

			/* fix up the damage we did in that submap */
			vm_map_simplify_range(src_map,
			    src_base,
			    src_end);

			vm_map_unlock(src_map);
			vm_map_deallocate(src_map);
			vm_map_lock(ptr->parent_map);
			src_map = ptr->parent_map;
			src_base = ptr->base_start;
			src_start = ptr->base_start + ptr->base_len;
			src_end = ptr->base_end;
			if (!vm_map_lookup_entry(src_map,
			    src_start,
			    &tmp_entry) &&
			    (src_end > src_start)) {
				RETURN(KERN_INVALID_ADDRESS);
			}
			kfree_type(submap_map_t, ptr);
			if (parent_maps == NULL) {
				map_share = FALSE;
			}
			src_entry = tmp_entry->vme_prev;
		}

		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
		    (src_start >= src_addr + len) &&
		    (src_addr + len != 0)) {
			/*
			 * Stop copying now, even though we haven't reached
			 * "src_end".  We'll adjust the end of the last copy
			 * entry at the end, if needed.
			 *
			 * If src_map's alignment is different from the
			 * system's page-alignment, there could be
			 * extra non-map-aligned map entries between
			 * the original (non-rounded) "src_addr + len"
			 * and the rounded "src_end".
			 * We do not want to copy those map entries since
			 * they're not part of the copied range.
			 */
			break;
		}

		if ((src_start >= src_end) && (src_end != 0)) {
			break;
		}

		/*
		 *	Verify that there are no gaps in the region
		 */

		tmp_entry = src_entry->vme_next;
		if ((tmp_entry->vme_start != src_start) ||
		    (tmp_entry == vm_map_to_entry(src_map))) {
			RETURN(KERN_INVALID_ADDRESS);
		}
	}

	/*
	 * If the source should be destroyed, do it now, since the
	 * copy was successful.
	 */
	if (src_destroy) {
		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;

		if (src_map == kernel_map) {
			remove_flags |= VM_MAP_REMOVE_KUNWIRE;
		}
		(void)vm_map_remove_and_unlock(src_map,
		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
		    src_end,
		    remove_flags,
		    KMEM_GUARD_NONE);
	} else {
		/* fix up the damage we did in the base map */
		vm_map_simplify_range(
			src_map,
			vm_map_trunc_page(src_addr,
			VM_MAP_PAGE_MASK(src_map)),
			vm_map_round_page(src_end,
			VM_MAP_PAGE_MASK(src_map)));
		vm_map_unlock(src_map);
	}

	tmp_entry = VM_MAP_ENTRY_NULL;

	/*
	 * If the source map uses larger pages than the copy
	 * (e.g. 16K map, 4K copy), re-align the first and last copy
	 * entries to the source map's page boundaries.
	 */
	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
		vm_map_offset_t original_start, original_offset, original_end;

		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);

		/* adjust alignment of first copy_entry's "vme_start" */
		tmp_entry = vm_map_copy_first_entry(copy);
		if (tmp_entry != vm_map_copy_to_entry(copy)) {
			vm_map_offset_t adjustment;

			original_start = tmp_entry->vme_start;
			original_offset = VME_OFFSET(tmp_entry);

			/* map-align the start of the first copy entry... */
			adjustment = (tmp_entry->vme_start -
			    vm_map_trunc_page(
				    tmp_entry->vme_start,
				    VM_MAP_PAGE_MASK(src_map)));
			tmp_entry->vme_start -= adjustment;
			VME_OFFSET_SET(tmp_entry,
			    VME_OFFSET(tmp_entry) - adjustment);
			copy_addr -= adjustment;
			assert(tmp_entry->vme_start < tmp_entry->vme_end);
			/* ... adjust for mis-aligned start of copy range */
			adjustment =
			    (vm_map_trunc_page(copy->offset,
			    PAGE_MASK) -
			    vm_map_trunc_page(copy->offset,
			    VM_MAP_PAGE_MASK(src_map)));
			if (adjustment) {
				assert(page_aligned(adjustment));
				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
				tmp_entry->vme_start += adjustment;
				VME_OFFSET_SET(tmp_entry,
				    (VME_OFFSET(tmp_entry) +
				    adjustment));
				copy_addr += adjustment;
				assert(tmp_entry->vme_start < tmp_entry->vme_end);
			}

			/*
			 * Assert that the adjustments haven't exposed
			 * more than was originally copied...
			 */
			assert(tmp_entry->vme_start >= original_start);
			assert(VME_OFFSET(tmp_entry) >= original_offset);
			/*
			 * ... and that it did not adjust outside of a
			 * single 16K page.
			 */
			assert(vm_map_trunc_page(tmp_entry->vme_start,
			    VM_MAP_PAGE_MASK(src_map)) ==
			    vm_map_trunc_page(original_start,
			    VM_MAP_PAGE_MASK(src_map)));
		}

		/* adjust alignment of last copy_entry's "vme_end" */
		tmp_entry = vm_map_copy_last_entry(copy);
		if (tmp_entry != vm_map_copy_to_entry(copy)) {
			vm_map_offset_t adjustment;

			original_end = tmp_entry->vme_end;

			/* map-align the end of the last copy entry... */
			tmp_entry->vme_end =
			    vm_map_round_page(tmp_entry->vme_end,
			    VM_MAP_PAGE_MASK(src_map));
			/* ... adjust for mis-aligned end of copy range */
			adjustment =
			    (vm_map_round_page((copy->offset +
			    copy->size),
			    VM_MAP_PAGE_MASK(src_map)) -
			    vm_map_round_page((copy->offset +
			    copy->size),
			    PAGE_MASK));
			if (adjustment) {
				assert(page_aligned(adjustment));
				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
				tmp_entry->vme_end -= adjustment;
				assert(tmp_entry->vme_start < tmp_entry->vme_end);
			}

			/*
			 * Assert that the adjustments haven't exposed
			 * more than was originally copied...
			 */
			assert(tmp_entry->vme_end <= original_end);
			/*
			 * ... and that it did not adjust outside of a
			 * single 16K page.
			 */
			assert(vm_map_round_page(tmp_entry->vme_end,
			    VM_MAP_PAGE_MASK(src_map)) ==
			    vm_map_round_page(original_end,
			    VM_MAP_PAGE_MASK(src_map)));
		}
	}

	/* Fix-up start and end points in copy.  This is necessary */
	/* when the various entries in the copy object were picked */
	/* up from different sub-maps */

	tmp_entry = vm_map_copy_first_entry(copy);
	copy_size = 0; /* compute actual size */
	while (tmp_entry != vm_map_copy_to_entry(copy)) {
		assert(VM_MAP_PAGE_ALIGNED(
			    copy_addr + (tmp_entry->vme_end -
			    tmp_entry->vme_start),
			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
		assert(VM_MAP_PAGE_ALIGNED(
			    copy_addr,
			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));

		/*
		 * The copy_entries will be injected directly into the
		 * destination map and might not be "map aligned" there...
		 */
		tmp_entry->map_aligned = FALSE;

		tmp_entry->vme_end = copy_addr +
		    (tmp_entry->vme_end - tmp_entry->vme_start);
		tmp_entry->vme_start = copy_addr;
		assert(tmp_entry->vme_start < tmp_entry->vme_end);
		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
	}

	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
	    copy_size < copy->size) {
		/*
		 * The actual size of the VM map copy is smaller than what
		 * was requested by the caller.  This must be because some
		 * PAGE_SIZE-sized pages are missing at the end of the last
		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
		 * The caller might not have been aware of those missing
		 * pages and might not want to be aware of it, which is
		 * fine as long as they don't try to access (and crash on)
		 * those missing pages.
		 * Let's adjust the size of the "copy", to avoid failing
		 * in vm_map_copyout() or vm_map_copy_overwrite().
		 */
		assert(vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(src_map)) ==
		    vm_map_round_page(copy->size,
		    VM_MAP_PAGE_MASK(src_map)));
		copy->size = copy_size;
	}

	*copy_result = copy;
	return KERN_SUCCESS;

#undef  RETURN
}
12496 
/*
 *	vm_map_copy_extract:
 *
 *	Build a VM_MAP_COPY_ENTRY_LIST copy object describing the range
 *	[src_addr, src_addr + len) of "src_map", by extracting the source
 *	entries via vm_map_remap_extract().
 *
 *	"do_copy" is passed through to vm_map_remap_extract() to select
 *	copy (TRUE) vs. share (FALSE) semantics for the extracted entries.
 *	"cur_prot"/"max_prot" are IN/OUT: on entry they hold the
 *	protections the caller requires (VM_PROT_NONE meaning "no
 *	requirement"); on return they hold the protections actually
 *	granted.
 *	Returns KERN_SUCCESS with *copy_result set (VM_MAP_COPY_NULL for
 *	a zero-length request), or an error with no copy object left
 *	allocated.
 */
kern_return_t
vm_map_copy_extract(
	vm_map_t                src_map,
	vm_map_address_t        src_addr,
	vm_map_size_t           len,
	boolean_t               do_copy,
	vm_map_copy_t           *copy_result,   /* OUT */
	vm_prot_t               *cur_prot,      /* IN/OUT */
	vm_prot_t               *max_prot,      /* IN/OUT */
	vm_inherit_t            inheritance,
	vm_map_kernel_flags_t   vmk_flags)
{
	vm_map_copy_t   copy;
	kern_return_t   kr;
	vm_prot_t required_cur_prot, required_max_prot;

	/*
	 *	Check for copies of zero bytes.
	 */

	if (len == 0) {
		*copy_result = VM_MAP_COPY_NULL;
		return KERN_SUCCESS;
	}

	/*
	 *	Check that the end address doesn't overflow
	 */
	if (src_addr + len < src_addr) {
		return KERN_INVALID_ADDRESS;
	}

	if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
		DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
	}

	/*
	 * Remember the protections the caller asked for, so we can
	 * sanity-check below what vm_map_remap_extract() granted
	 * (it overwrites *cur_prot / *max_prot with the actual values).
	 */
	required_cur_prot = *cur_prot;
	required_max_prot = *max_prot;

	/*
	 *	Allocate a header element for the list.
	 *
	 *	Use the start and end in the header to
	 *	remember the endpoints prior to rounding.
	 */

	copy = vm_map_copy_allocate();
	copy->type = VM_MAP_COPY_ENTRY_LIST;
	copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;

	vm_map_store_init(&copy->cpy_hdr);

	copy->offset = 0;
	copy->size = len;

	kr = vm_map_remap_extract(src_map,
	    src_addr,
	    len,
	    do_copy,             /* copy */
	    &copy->cpy_hdr,
	    cur_prot,            /* IN/OUT */
	    max_prot,            /* IN/OUT */
	    inheritance,
	    vmk_flags);
	if (kr != KERN_SUCCESS) {
		/* extraction failed: release the (partial) copy object */
		vm_map_copy_discard(copy);
		return kr;
	}
	/*
	 * If the caller required specific protections, the granted
	 * protections must include them.
	 */
	if (required_cur_prot != VM_PROT_NONE) {
		assert((*cur_prot & required_cur_prot) == required_cur_prot);
		assert((*max_prot & required_max_prot) == required_max_prot);
	}

	*copy_result = copy;
	return KERN_SUCCESS;
}
12573 
12574 /*
12575  *	vm_map_copyin_object:
12576  *
12577  *	Create a copy object from an object.
12578  *	Our caller donates an object reference.
12579  */
12580 
12581 kern_return_t
vm_map_copyin_object(vm_object_t object,vm_object_offset_t offset,vm_object_size_t size,vm_map_copy_t * copy_result)12582 vm_map_copyin_object(
12583 	vm_object_t             object,
12584 	vm_object_offset_t      offset, /* offset of region in object */
12585 	vm_object_size_t        size,   /* size of region in object */
12586 	vm_map_copy_t   *copy_result)   /* OUT */
12587 {
12588 	vm_map_copy_t   copy;           /* Resulting copy */
12589 
12590 	/*
12591 	 *	We drop the object into a special copy object
12592 	 *	that contains the object directly.
12593 	 */
12594 
12595 	copy = vm_map_copy_allocate();
12596 	copy->type = VM_MAP_COPY_OBJECT;
12597 	copy->cpy_object = object;
12598 	copy->offset = offset;
12599 	copy->size = size;
12600 
12601 	*copy_result = copy;
12602 	return KERN_SUCCESS;
12603 }
12604 
/*
 *	vm_map_fork_share:
 *
 *	Handle a VM_INHERIT_SHARE entry during fork: append to "new_map"
 *	a clone of "old_entry" that shares the same backing object (or
 *	submap) as the original mapping in "old_map".  If sharing the
 *	object as-is would break existing or future symmetric
 *	(copy-on-write) copies, a shadow object is created first and
 *	that shadow is shared instead.  Both entries are marked
 *	is_shared, and the child's pmap is primed with pmap_copy().
 *
 *	NOTE(review): callers appear to hold both map locks across this
 *	call (vm_map_fork() locks old_map and new_map) — confirm before
 *	relying on it.
 */
static void
vm_map_fork_share(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_object_t     object;
	vm_map_entry_t  new_entry;

	/*
	 *	New sharing code.  New map entry
	 *	references original object.  Internal
	 *	objects use asynchronous copy algorithm for
	 *	future copies.  First make sure we have
	 *	the right object.  If we need a shadow,
	 *	or someone else already has one, then
	 *	make a new shadow and share it.
	 */

	/* "object" is only consulted below on the non-submap paths */
	if (!old_entry->is_sub_map) {
		object = VME_OBJECT(old_entry);
	}

	if (old_entry->is_sub_map) {
		assert(old_entry->wired_count == 0);
#ifndef NO_NESTED_PMAP
#if !PMAP_FORK_NEST
		/*
		 * Without pre-nesting at fork time, nest the submap's
		 * pmap into the child's pmap here (shared-region style
		 * mappings).  With PMAP_FORK_NEST, vm_map_fork() has
		 * already pre-nested, so this is compiled out.
		 */
		if (old_entry->use_pmap) {
			kern_return_t   result;

			result = pmap_nest(new_map->pmap,
			    (VME_SUBMAP(old_entry))->pmap,
			    (addr64_t)old_entry->vme_start,
			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
			if (result) {
				panic("vm_map_fork_share: pmap_nest failed!");
			}
		}
#endif /* !PMAP_FORK_NEST */
#endif  /* NO_NESTED_PMAP */
	} else if (object == VM_OBJECT_NULL) {
		/*
		 * No backing object yet: materialize one sized to the
		 * entry so parent and child have something to share.
		 */
		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
		    old_entry->vme_start));
		VME_OFFSET_SET(old_entry, 0);
		VME_OBJECT_SET(old_entry, object, false, 0);
		old_entry->use_pmap = TRUE;
//		assert(!old_entry->needs_copy);
	} else if (object->copy_strategy !=
	    MEMORY_OBJECT_COPY_SYMMETRIC) {
		/*
		 *	We are already using an asymmetric
		 *	copy, and therefore we already have
		 *	the right object.
		 */

		assert(!old_entry->needs_copy);
	} else if (old_entry->needs_copy ||       /* case 1 */
	    object->shadowed ||                 /* case 2 */
	    (!object->true_share &&             /* case 3 */
	    !old_entry->is_shared &&
	    (object->vo_size >
	    (vm_map_size_t)(old_entry->vme_end -
	    old_entry->vme_start)))) {
		/*
		 *	We need to create a shadow.
		 *	There are three cases here.
		 *	In the first case, we need to
		 *	complete a deferred symmetrical
		 *	copy that we participated in.
		 *	In the second and third cases,
		 *	we need to create the shadow so
		 *	that changes that we make to the
		 *	object do not interfere with
		 *	any symmetrical copies which
		 *	have occured (case 2) or which
		 *	might occur (case 3).
		 *
		 *	The first case is when we had
		 *	deferred shadow object creation
		 *	via the entry->needs_copy mechanism.
		 *	This mechanism only works when
		 *	only one entry points to the source
		 *	object, and we are about to create
		 *	a second entry pointing to the
		 *	same object. The problem is that
		 *	there is no way of mapping from
		 *	an object to the entries pointing
		 *	to it. (Deferred shadow creation
		 *	works with one entry because occurs
		 *	at fault time, and we walk from the
		 *	entry to the object when handling
		 *	the fault.)
		 *
		 *	The second case is when the object
		 *	to be shared has already been copied
		 *	with a symmetric copy, but we point
		 *	directly to the object without
		 *	needs_copy set in our entry. (This
		 *	can happen because different ranges
		 *	of an object can be pointed to by
		 *	different entries. In particular,
		 *	a single entry pointing to an object
		 *	can be split by a call to vm_inherit,
		 *	which, combined with task_create, can
		 *	result in the different entries
		 *	having different needs_copy values.)
		 *	The shadowed flag in the object allows
		 *	us to detect this case. The problem
		 *	with this case is that if this object
		 *	has or will have shadows, then we
		 *	must not perform an asymmetric copy
		 *	of this object, since such a copy
		 *	allows the object to be changed, which
		 *	will break the previous symmetrical
		 *	copies (which rely upon the object
		 *	not changing). In a sense, the shadowed
		 *	flag says "don't change this object".
		 *	We fix this by creating a shadow
		 *	object for this object, and sharing
		 *	that. This works because we are free
		 *	to change the shadow object (and thus
		 *	to use an asymmetric copy strategy);
		 *	this is also semantically correct,
		 *	since this object is temporary, and
		 *	therefore a copy of the object is
		 *	as good as the object itself. (This
		 *	is not true for permanent objects,
		 *	since the pager needs to see changes,
		 *	which won't happen if the changes
		 *	are made to a copy.)
		 *
		 *	The third case is when the object
		 *	to be shared has parts sticking
		 *	outside of the entry we're working
		 *	with, and thus may in the future
		 *	be subject to a symmetrical copy.
		 *	(This is a preemptive version of
		 *	case 2.)
		 */
		VME_OBJECT_SHADOW(old_entry,
		    (vm_map_size_t) (old_entry->vme_end -
		    old_entry->vme_start),
		    vm_map_always_shadow(old_map));

		/*
		 *	If we're making a shadow for other than
		 *	copy on write reasons, then we have
		 *	to remove write permission.
		 */

		if (!old_entry->needs_copy &&
		    (old_entry->protection & VM_PROT_WRITE)) {
			vm_prot_t prot;

			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));

			prot = old_entry->protection & ~VM_PROT_WRITE;

			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));

			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}


			/*
			 * If the map is visible through other pmaps, we
			 * must protect at the object level; otherwise a
			 * direct pmap_protect() on this map suffices.
			 */
			if (old_map->mapped_in_other_pmaps) {
				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					PMAP_NULL,
					PAGE_SIZE,
					old_entry->vme_start,
					prot);
			} else {
				pmap_protect(old_map->pmap,
				    old_entry->vme_start,
				    old_entry->vme_end,
				    prot);
			}
		}

		old_entry->needs_copy = FALSE;
		object = VME_OBJECT(old_entry);
	}


	/*
	 *	If object was using a symmetric copy strategy,
	 *	change its copy strategy to the default
	 *	asymmetric copy strategy, which is copy_delay
	 *	in the non-norma case and copy_call in the
	 *	norma case. Bump the reference count for the
	 *	new entry.
	 */

	if (old_entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(old_entry));
	} else {
		vm_object_lock(object);
		vm_object_reference_locked(object);
		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
		vm_object_unlock(object);
	}

	/*
	 *	Clone the entry, using object ref from above.
	 *	Mark both entries as shared.
	 */

	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
	vm_map_entry_copy(old_map, new_entry, old_entry);
	old_entry->is_shared = TRUE;
	new_entry->is_shared = TRUE;

	/*
	 * We're dealing with a shared mapping, so the resulting mapping
	 * should inherit some of the original mapping's accounting settings.
	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
	 * "use_pmap" should stay the same as before (if it hasn't been reset
	 * to TRUE when we cleared "iokit_acct").
	 */
	assert(!new_entry->iokit_acct);

	/*
	 *	If old entry's inheritence is VM_INHERIT_NONE,
	 *	the new entry is for corpse fork, remove the
	 *	write permission from the new entry.
	 */
	if (old_entry->inheritance == VM_INHERIT_NONE) {
		new_entry->protection &= ~VM_PROT_WRITE;
		new_entry->max_protection &= ~VM_PROT_WRITE;
	}

	/*
	 *	Insert the entry into the new map -- we
	 *	know we're inserting at the end of the new
	 *	map.
	 */

	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);

	/*
	 *	Update the physical map
	 */

	if (old_entry->is_sub_map) {
		/* Bill Angell pmap support goes here */
	} else {
		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
		    old_entry->vme_end - old_entry->vme_start,
		    old_entry->vme_start);
	}
}
12863 
/*
 *	vm_map_fork_copy:
 *
 *	Slow-path copy of "*old_entry_p" from "old_map" into "new_map"
 *	during fork, using vm_map_copyin_internal() with
 *	VM_MAP_COPYIN_USE_MAXPROT.  "old_map" is temporarily unlocked
 *	around the copyin, so on return "*old_entry_p" is re-derived by
 *	lookup and points at the next entry the caller should process.
 *
 *	Returns TRUE if the range was copied and inserted into "new_map"
 *	(caller should account its size), FALSE if the range disappeared
 *	or is permanently unreadable (skipped).
 *	Called with "old_map" locked; returns with it locked again.
 */
static boolean_t
vm_map_fork_copy(
	vm_map_t        old_map,
	vm_map_entry_t  *old_entry_p,
	vm_map_t        new_map,
	int             vm_map_copyin_flags)
{
	vm_map_entry_t old_entry = *old_entry_p;
	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
	vm_map_offset_t start = old_entry->vme_start;
	vm_map_copy_t copy;
	/* remember the insertion point before dropping the lock */
	vm_map_entry_t last = vm_map_last_entry(new_map);

	vm_map_unlock(old_map);
	/*
	 *	Use maxprot version of copyin because we
	 *	care about whether this memory can ever
	 *	be accessed, not just whether it's accessible
	 *	right now.
	 */
	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
	if (vm_map_copyin_internal(old_map, start, entry_size,
	    vm_map_copyin_flags, &copy)
	    != KERN_SUCCESS) {
		/*
		 *	The map might have changed while it
		 *	was unlocked, check it again.  Skip
		 *	any blank space or permanently
		 *	unreadable region.
		 */
		vm_map_lock(old_map);
		if (!vm_map_lookup_entry(old_map, start, &last) ||
		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
			last = last->vme_next;
		}
		*old_entry_p = last;

		/*
		 * XXX	For some error returns, want to
		 * XXX	skip to the next element.  Note
		 *	that INVALID_ADDRESS and
		 *	PROTECTION_FAILURE are handled above.
		 */

		return FALSE;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 *	Insert the copy into the new map
	 */
	vm_map_copy_insert(new_map, last, copy);

	/*
	 *	Pick up the traversal at the end of
	 *	the copied region.
	 */

	vm_map_lock(old_map);
	start += entry_size;
	if (!vm_map_lookup_entry(old_map, start, &last)) {
		/* "start" falls in a hole: resume at the next entry */
		last = last->vme_next;
	} else {
		if (last->vme_start == start) {
			/*
			 * No need to clip here and we don't
			 * want to cause any unnecessary
			 * unnesting...
			 */
		} else {
			vm_map_clip_start(old_map, last, start);
		}
	}
	*old_entry_p = last;

	return TRUE;
}
12946 
#if PMAP_FORK_NEST
#define PMAP_FORK_NEST_DEBUG 0
/*
 *	vm_map_fork_unnest:
 *
 *	Undo the speculative pre-nesting done by pmap_fork_nest() for
 *	the portion of [start, end) that overlaps the pre-nested range
 *	[pre_nested_start, pre_nested_end).  The overlap is widened to
 *	the pmap's minimum shared-region nesting granularity before
 *	calling pmap_unnest().  No-op if nothing was pre-nested or the
 *	ranges don't overlap.
 */
static inline void
vm_map_fork_unnest(
	pmap_t new_pmap,
	vm_map_offset_t pre_nested_start,
	vm_map_offset_t pre_nested_end,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	kern_return_t kr;
	vm_map_offset_t nesting_mask, start_unnest, end_unnest;

	assertf(pre_nested_start <= pre_nested_end,
	    "pre_nested start 0x%llx end 0x%llx",
	    (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
	assertf(start <= end,
	    "start 0x%llx end 0x%llx",
	    (uint64_t) start, (uint64_t)end);

	if (pre_nested_start == pre_nested_end) {
		/* nothing was pre-nested: done */
		return;
	}
	if (end <= pre_nested_start) {
		/* fully before pre-nested range: done */
		return;
	}
	if (start >= pre_nested_end) {
		/* fully after pre-nested range: done */
		return;
	}
	/* ignore parts of range outside of pre_nested range */
	if (start < pre_nested_start) {
		start = pre_nested_start;
	}
	if (end > pre_nested_end) {
		end = pre_nested_end;
	}
	/* round outward to the pmap's nesting granularity */
	nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
	start_unnest = start & ~nesting_mask;
	end_unnest = (end + nesting_mask) & ~nesting_mask;
	kr = pmap_unnest(new_pmap,
	    (addr64_t)start_unnest,
	    (uint64_t)(end_unnest - start_unnest));
#if PMAP_FORK_NEST_DEBUG
	printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
#endif /* PMAP_FORK_NEST_DEBUG */
	assertf(kr == KERN_SUCCESS,
	    "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
	    (uint64_t)start, (uint64_t)end, new_pmap,
	    (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
	    kr);
}
#endif /* PMAP_FORK_NEST */
13002 
13003 /*
13004  *	vm_map_fork:
13005  *
13006  *	Create and return a new map based on the old
13007  *	map, according to the inheritance values on the
13008  *	regions in that map and the options.
13009  *
13010  *	The source map must not be locked.
13011  */
vm_map_t
vm_map_fork(
	ledger_t        ledger,         /* ledger to charge the child's pmap to */
	vm_map_t        old_map,        /* parent map; must be unlocked */
	int             options)        /* VM_MAP_FORK_* option bits */
{
	pmap_t          new_pmap;
	vm_map_t        new_map;
	vm_map_entry_t  old_entry;
	vm_map_size_t   new_size = 0, entry_size;
	vm_map_entry_t  new_entry;
	boolean_t       src_needs_copy;
	boolean_t       new_entry_needs_copy;
	boolean_t       pmap_is64bit;
	int             vm_map_copyin_flags;
	vm_inherit_t    old_entry_inheritance;
	int             map_create_options;
	kern_return_t   footprint_collect_kr;

	/* reject any option bits outside the supported set */
	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
	    VM_MAP_FORK_PRESERVE_PURGEABLE |
	    VM_MAP_FORK_CORPSE_FOOTPRINT)) {
		/* unsupported option */
		return VM_MAP_NULL;
	}

	/* the child's pmap inherits the parent's address-space width */
	pmap_is64bit =
#if defined(__i386__) || defined(__x86_64__)
	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
#elif defined(__arm64__)
	    old_map->pmap->is_64bit;
#else
#error Unknown architecture.
#endif

	/* propagate pmap creation attributes from the parent */
	unsigned int pmap_flags = 0;
	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
#if defined(HAS_APPLE_PAC)
	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
#endif
#if CONFIG_ROSETTA
	pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
#endif
#if PMAP_CREATE_FORCE_4K_PAGES
	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
	    PAGE_SIZE != FOURK_PAGE_SIZE) {
		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
	}
#endif /* PMAP_CREATE_FORCE_4K_PAGES */
	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
	if (new_pmap == NULL) {
		return VM_MAP_NULL;
	}

	vm_map_reference(old_map);
	vm_map_lock(old_map);

	map_create_options = 0;
	if (old_map->hdr.entries_pageable) {
		map_create_options |= VM_MAP_CREATE_PAGEABLE;
	}
	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
		/*
		 * NOTE: footprint_collect_kr is only initialized when
		 * corpse-footprint collection is requested; every later
		 * read of it is guarded by the same option bit.
		 */
		footprint_collect_kr = KERN_SUCCESS;
	}
	new_map = vm_map_create_options(new_pmap,
	    old_map->min_offset,
	    old_map->max_offset,
	    map_create_options);
	/* inherit cs_enforcement */
	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
	vm_map_lock(new_map);
	vm_commit_pagezero_status(new_map);
	/* inherit the parent map's page size */
	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));

#if CONFIG_MAP_RANGES
	/* inherit the parent map's VM ranges */
	vm_map_range_fork(new_map, old_map);
#endif
	/* ensure PMAP_CS structures are prepared for the fork */
	pmap_cs_fork_prepare(old_map->pmap, new_pmap);

#if PMAP_FORK_NEST
	/*
	 * Pre-nest the shared region's pmap.
	 * Any part of the pre-nested range not actually kept nested
	 * is undone via vm_map_fork_unnest() as entries are visited.
	 */
	vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
	pmap_fork_nest(old_map->pmap, new_pmap,
	    &pre_nested_start, &pre_nested_end);
#if PMAP_FORK_NEST_DEBUG
	printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
#endif /* PMAP_FORK_NEST_DEBUG */
#endif /* PMAP_FORK_NEST */

	/*
	 * Walk the parent's entries in order.  Note: no increment here;
	 * the entry pointer advances at the bottom of the loop, or via
	 * vm_map_fork_copy() on the slow-copy path.
	 */
	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
		/*
		 * Abort any corpse collection if the system is shutting down.
		 */
		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    get_system_inshutdown()) {
#if PMAP_FORK_NEST
			new_entry = vm_map_last_entry(new_map);
			if (new_entry == vm_map_to_entry(new_map)) {
				/* unnest all that was pre-nested */
				vm_map_fork_unnest(new_pmap,
				    pre_nested_start, pre_nested_end,
				    vm_map_min(new_map), vm_map_max(new_map));
			} else if (new_entry->vme_end < vm_map_max(new_map)) {
				/* unnest hole at the end, if pre-nested */
				vm_map_fork_unnest(new_pmap,
				    pre_nested_start, pre_nested_end,
				    new_entry->vme_end, vm_map_max(new_map));
			}
#endif /* PMAP_FORK_NEST */
			vm_map_corpse_footprint_collect_done(new_map);
			vm_map_unlock(new_map);
			vm_map_unlock(old_map);
			vm_map_deallocate(new_map);
			vm_map_deallocate(old_map);
			printf("Aborting corpse map due to system shutdown\n");
			return VM_MAP_NULL;
		}

		entry_size = old_entry->vme_end - old_entry->vme_start;

#if PMAP_FORK_NEST
		/*
		 * Undo any unnecessary pre-nesting.
		 */
		vm_map_offset_t prev_end;
		if (old_entry == vm_map_first_entry(old_map)) {
			prev_end = vm_map_min(old_map);
		} else {
			prev_end = old_entry->vme_prev->vme_end;
		}
		if (prev_end < old_entry->vme_start) {
			/* unnest hole before this entry, if pre-nested */
			vm_map_fork_unnest(new_pmap,
			    pre_nested_start, pre_nested_end,
			    prev_end, old_entry->vme_start);
		}
		if (old_entry->is_sub_map && old_entry->use_pmap) {
			/* keep this entry nested in the child */
#if PMAP_FORK_NEST_DEBUG
			printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
#endif /* PMAP_FORK_NEST_DEBUG */
		} else {
			/* undo nesting for this entry, if pre-nested */
			vm_map_fork_unnest(new_pmap,
			    pre_nested_start, pre_nested_end,
			    old_entry->vme_start, old_entry->vme_end);
		}
#endif /* PMAP_FORK_NEST */

		old_entry_inheritance = old_entry->inheritance;
		/*
		 * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option
		 * share VM_INHERIT_NONE entries that are not backed by a
		 * device pager.
		 */
		if (old_entry_inheritance == VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
		    (old_entry->protection & VM_PROT_READ) &&
		    !(!old_entry->is_sub_map &&
		    VME_OBJECT(old_entry) != NULL &&
		    VME_OBJECT(old_entry)->pager != NULL &&
		    is_device_pager_ops(
			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
			old_entry_inheritance = VM_INHERIT_SHARE;
		}

		if (old_entry_inheritance != VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    footprint_collect_kr == KERN_SUCCESS) {
			/*
			 * The corpse won't have old_map->pmap to query
			 * footprint information, so collect that data now
			 * and store it in new_map->vmmap_corpse_footprint
			 * for later autopsy.
			 */
			footprint_collect_kr =
			    vm_map_corpse_footprint_collect(old_map,
			    old_entry,
			    new_map);
		}

		switch (old_entry_inheritance) {
		case VM_INHERIT_NONE:
			/* entry is simply not carried over to the child */
			break;

		case VM_INHERIT_SHARE:
			vm_map_fork_share(old_map, old_entry, new_map);
			new_size += entry_size;
			break;

		case VM_INHERIT_COPY:

			/*
			 *	Inline the copy_quickly case;
			 *	upon failure, fall back on call
			 *	to vm_map_fork_copy.
			 */

			if (old_entry->is_sub_map) {
				break;
			}
			if ((old_entry->wired_count != 0) ||
			    ((VME_OBJECT(old_entry) != NULL) &&
			    (VME_OBJECT(old_entry)->true_share))) {
				goto slow_vm_map_fork_copy;
			}

			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
			vm_map_entry_copy(old_map, new_entry, old_entry);
			if (old_entry->vme_permanent) {
				/* inherit "permanent" on fork() */
				new_entry->vme_permanent = TRUE;
			}

			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
				new_map->jit_entry_exists = TRUE;
			}

			if (new_entry->is_sub_map) {
				/* clear address space specifics */
				new_entry->use_pmap = FALSE;
			} else {
				/*
				 * We're dealing with a copy-on-write operation,
				 * so the resulting mapping should not inherit
				 * the original mapping's accounting settings.
				 * "iokit_acct" should have been cleared in
				 * vm_map_entry_copy().
				 * "use_pmap" should be reset to its default
				 * (TRUE) so that the new mapping gets
				 * accounted for in the task's memory footprint.
				 */
				assert(!new_entry->iokit_acct);
				new_entry->use_pmap = TRUE;
			}

			if (!vm_object_copy_quickly(
				    VME_OBJECT(new_entry),
				    VME_OFFSET(old_entry),
				    (old_entry->vme_end -
				    old_entry->vme_start),
				    &src_needs_copy,
				    &new_entry_needs_copy)) {
				/* quick copy failed: discard and take slow path */
				vm_map_entry_dispose(new_entry);
				goto slow_vm_map_fork_copy;
			}

			/*
			 *	Handle copy-on-write obligations
			 */

			if (src_needs_copy && !old_entry->needs_copy) {
				vm_prot_t prot;

				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));

				/* write-protect the parent's mapping for COW */
				prot = old_entry->protection & ~VM_PROT_WRITE;

				if (override_nx(old_map, VME_ALIAS(old_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));

				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					((old_entry->is_shared
					|| old_map->mapped_in_other_pmaps)
					? PMAP_NULL :
					old_map->pmap),
					VM_MAP_PAGE_SIZE(old_map),
					old_entry->vme_start,
					prot);

				assert(old_entry->wired_count == 0);
				old_entry->needs_copy = TRUE;
			}
			new_entry->needs_copy = new_entry_needs_copy;

			/*
			 *	Insert the entry at the end
			 *	of the map.
			 */

			vm_map_store_entry_link(new_map,
			    vm_map_last_entry(new_map),
			    new_entry,
			    VM_MAP_KERNEL_FLAGS_NONE);
			new_size += entry_size;
			break;

slow_vm_map_fork_copy:
			vm_map_copyin_flags = 0;
			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
				vm_map_copyin_flags |=
				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
			}
			if (vm_map_fork_copy(old_map,
			    &old_entry,
			    new_map,
			    vm_map_copyin_flags)) {
				new_size += entry_size;
			}
			/* vm_map_fork_copy() already advanced old_entry */
			continue;
		}
		old_entry = old_entry->vme_next;
	}

#if PMAP_FORK_NEST
	/* undo pre-nesting beyond the child's last entry */
	new_entry = vm_map_last_entry(new_map);
	if (new_entry == vm_map_to_entry(new_map)) {
		/* unnest all that was pre-nested */
		vm_map_fork_unnest(new_pmap,
		    pre_nested_start, pre_nested_end,
		    vm_map_min(new_map), vm_map_max(new_map));
	} else if (new_entry->vme_end < vm_map_max(new_map)) {
		/* unnest hole at the end, if pre-nested */
		vm_map_fork_unnest(new_pmap,
		    pre_nested_start, pre_nested_end,
		    new_entry->vme_end, vm_map_max(new_map));
	}
#endif /* PMAP_FORK_NEST */

#if defined(__arm64__)
	pmap_insert_sharedpage(new_map->pmap);
#endif /* __arm64__ */

	new_map->size = new_size;

	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		vm_map_corpse_footprint_collect_done(new_map);
	}

	/* Propagate JIT entitlement for the pmap layer. */
	if (pmap_get_jit_entitled(old_map->pmap)) {
		/* Tell the pmap that it supports JIT. */
		pmap_set_jit_entitled(new_map->pmap);
	}

	vm_map_unlock(new_map);
	vm_map_unlock(old_map);
	/* drop the reference taken at the top of this function */
	vm_map_deallocate(old_map);

	return new_map;
}
13367 
13368 /*
13369  * vm_map_exec:
13370  *
13371  *      Setup the "new_map" with the proper execution environment according
13372  *	to the type of executable (platform, 64bit, chroot environment).
13373  *	Map the comm page and shared region, etc...
13374  */
kern_return_t
vm_map_exec(
	vm_map_t        new_map,
	task_t          task,
	boolean_t       is64bit,
	void            *fsroot,        /* chroot environment, if any */
	cpu_type_t      cpu,
	cpu_subtype_t   cpu_subtype,
	boolean_t       reslide,
	boolean_t       is_driverkit,
	uint32_t        rsr_version)
{
	SHARED_REGION_TRACE_DEBUG(
		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
		(void *)VM_KERNEL_ADDRPERM(current_task()),
		(void *)VM_KERNEL_ADDRPERM(new_map),
		(void *)VM_KERNEL_ADDRPERM(task),
		(void *)VM_KERNEL_ADDRPERM(fsroot),
		cpu,
		cpu_subtype));
	/*
	 * Map the commpage and the shared region.  Failures here are
	 * deliberately ignored (best effort); the function only reports
	 * failure for the reserved-region setup below, via panic.
	 */
	(void) vm_commpage_enter(new_map, task, is64bit);

	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
		(void *)VM_KERNEL_ADDRPERM(current_task()),
		(void *)VM_KERNEL_ADDRPERM(new_map),
		(void *)VM_KERNEL_ADDRPERM(task),
		(void *)VM_KERNEL_ADDRPERM(fsroot),
		cpu,
		cpu_subtype));

	/*
	 * Some devices have region(s) of memory that shouldn't get allocated by
	 * user processes. The following code creates dummy vm_map_entry_t's for each
	 * of the regions that needs to be reserved to prevent any allocations in
	 * those regions.
	 */
	kern_return_t kr = KERN_FAILURE;
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	/* permanent + beyond_max: entries can't be removed or clipped away */
	vmk_flags.vmkf_permanent = TRUE;
	vmk_flags.vmkf_beyond_max = TRUE;

	struct vm_reserved_region *regions = NULL;
	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));

	for (size_t i = 0; i < num_regions; ++i) {
		/* VM_PROT_NONE placeholder mapping at a fixed address */
		kr = vm_map_enter(
			new_map,
			&regions[i].vmrr_addr,
			regions[i].vmrr_size,
			(vm_map_offset_t)0,
			VM_FLAGS_FIXED,
			vmk_flags,
			VM_KERN_MEMORY_NONE,
			VM_OBJECT_NULL,
			(vm_object_offset_t)0,
			FALSE,
			VM_PROT_NONE,
			VM_PROT_NONE,
			VM_INHERIT_COPY);

		if (kr != KERN_SUCCESS) {
			panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
		}
	}

	new_map->reserved_regions = (num_regions ? TRUE : FALSE);

	return KERN_SUCCESS;
}
13448 
/*
 * Global statistics for the copy-on-write resolution paths taken by
 * vm_map_lookup_and_lock_object() when faulting through a submap:
 * "slowly" counts uses of vm_object_copy_slowly() (wired sub-entries),
 * "strategically" counts uses of vm_object_copy_strategically()
 * (non-symmetric copy strategies), and "shadow" counts the plain
 * shadow-object setup path.  _count/_size/_max track invocations,
 * cumulative bytes, and the largest single copy; _restart/_error count
 * retried and failed attempts.
 */
uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13461 uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13462 /*
13463  *	vm_map_lookup_and_lock_object:
13464  *
13465  *	Finds the VM object, offset, and
13466  *	protection for a given virtual address in the
13467  *	specified map, assuming a page fault of the
13468  *	type specified.
13469  *
13470  *	Returns the (object, offset, protection) for
13471  *	this address, whether it is wired down, and whether
13472  *	this map has the only reference to the data in question.
13473  *	In order to later verify this lookup, a "version"
13474  *	is returned.
13475  *	If contended != NULL, *contended will be set to
13476  *	true iff the thread had to spin or block to acquire
13477  *	an exclusive lock.
13478  *
13479  *	The map MUST be locked by the caller and WILL be
13480  *	locked on exit.  In order to guarantee the
13481  *	existence of the returned object, it is returned
13482  *	locked.
13483  *
13484  *	If a lookup is requested with "write protection"
13485  *	specified, the map may be changed to perform virtual
13486  *	copying operations, although the data referenced will
13487  *	remain the same.
13488  */
13489 kern_return_t
vm_map_lookup_and_lock_object(vm_map_t * var_map,vm_map_offset_t vaddr,vm_prot_t fault_type,int object_lock_type,vm_map_version_t * out_version,vm_object_t * object,vm_object_offset_t * offset,vm_prot_t * out_prot,boolean_t * wired,vm_object_fault_info_t fault_info,vm_map_t * real_map,bool * contended)13490 vm_map_lookup_and_lock_object(
13491 	vm_map_t                *var_map,       /* IN/OUT */
13492 	vm_map_offset_t         vaddr,
13493 	vm_prot_t               fault_type,
13494 	int                     object_lock_type,
13495 	vm_map_version_t        *out_version,   /* OUT */
13496 	vm_object_t             *object,        /* OUT */
13497 	vm_object_offset_t      *offset,        /* OUT */
13498 	vm_prot_t               *out_prot,      /* OUT */
13499 	boolean_t               *wired,         /* OUT */
13500 	vm_object_fault_info_t  fault_info,     /* OUT */
13501 	vm_map_t                *real_map,      /* OUT */
13502 	bool                    *contended)     /* OUT */
13503 {
13504 	vm_map_entry_t                  entry;
13505 	vm_map_t                        map = *var_map;
13506 	vm_map_t                        old_map = *var_map;
13507 	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
13508 	vm_map_offset_t                 cow_parent_vaddr = 0;
13509 	vm_map_offset_t                 old_start = 0;
13510 	vm_map_offset_t                 old_end = 0;
13511 	vm_prot_t                       prot;
13512 	boolean_t                       mask_protections;
13513 	boolean_t                       force_copy;
13514 	boolean_t                       no_force_copy_if_executable;
13515 	boolean_t                       submap_needed_copy;
13516 	vm_prot_t                       original_fault_type;
13517 	vm_map_size_t                   fault_page_mask;
13518 
13519 	/*
13520 	 * VM_PROT_MASK means that the caller wants us to use "fault_type"
13521 	 * as a mask against the mapping's actual protections, not as an
13522 	 * absolute value.
13523 	 */
13524 	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13525 	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13526 	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13527 	fault_type &= VM_PROT_ALL;
13528 	original_fault_type = fault_type;
13529 	if (contended) {
13530 		*contended = false;
13531 	}
13532 
13533 	*real_map = map;
13534 
13535 	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13536 	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13537 
13538 RetryLookup:
13539 	fault_type = original_fault_type;
13540 
13541 	/*
13542 	 *	If the map has an interesting hint, try it before calling
13543 	 *	full blown lookup routine.
13544 	 */
13545 	entry = map->hint;
13546 
13547 	if ((entry == vm_map_to_entry(map)) ||
13548 	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13549 		vm_map_entry_t  tmp_entry;
13550 
13551 		/*
13552 		 *	Entry was either not a valid hint, or the vaddr
13553 		 *	was not contained in the entry, so do a full lookup.
13554 		 */
13555 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13556 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13557 				vm_map_unlock(cow_sub_map_parent);
13558 			}
13559 			if ((*real_map != map)
13560 			    && (*real_map != cow_sub_map_parent)) {
13561 				vm_map_unlock(*real_map);
13562 			}
13563 			return KERN_INVALID_ADDRESS;
13564 		}
13565 
13566 		entry = tmp_entry;
13567 	}
13568 	if (map == old_map) {
13569 		old_start = entry->vme_start;
13570 		old_end = entry->vme_end;
13571 	}
13572 
13573 	/*
13574 	 *	Handle submaps.  Drop lock on upper map, submap is
13575 	 *	returned locked.
13576 	 */
13577 
13578 	submap_needed_copy = FALSE;
13579 submap_recurse:
13580 	if (entry->is_sub_map) {
13581 		vm_map_offset_t         local_vaddr;
13582 		vm_map_offset_t         end_delta;
13583 		vm_map_offset_t         start_delta;
13584 		vm_map_entry_t          submap_entry, saved_submap_entry;
13585 		vm_object_offset_t      submap_entry_offset;
13586 		vm_object_size_t        submap_entry_size;
13587 		vm_prot_t               subentry_protection;
13588 		vm_prot_t               subentry_max_protection;
13589 		boolean_t               subentry_no_copy_on_read;
13590 		boolean_t               subentry_permanent;
13591 		boolean_t               subentry_pmap_cs_associated;
13592 		boolean_t               mapped_needs_copy = FALSE;
13593 		vm_map_version_t        version;
13594 
13595 		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13596 		    "map %p (%d) entry %p submap %p (%d)\n",
13597 		    map, VM_MAP_PAGE_SHIFT(map), entry,
13598 		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
13599 
13600 		local_vaddr = vaddr;
13601 
13602 		if ((entry->use_pmap &&
13603 		    !((fault_type & VM_PROT_WRITE) ||
13604 		    force_copy))) {
13605 			/* if real_map equals map we unlock below */
13606 			if ((*real_map != map) &&
13607 			    (*real_map != cow_sub_map_parent)) {
13608 				vm_map_unlock(*real_map);
13609 			}
13610 			*real_map = VME_SUBMAP(entry);
13611 		}
13612 
13613 		if (entry->needs_copy &&
13614 		    ((fault_type & VM_PROT_WRITE) ||
13615 		    force_copy)) {
13616 			if (!mapped_needs_copy) {
13617 				if (vm_map_lock_read_to_write(map)) {
13618 					vm_map_lock_read(map);
13619 					*real_map = map;
13620 					goto RetryLookup;
13621 				}
13622 				vm_map_lock_read(VME_SUBMAP(entry));
13623 				*var_map = VME_SUBMAP(entry);
13624 				cow_sub_map_parent = map;
13625 				/* reset base to map before cow object */
13626 				/* this is the map which will accept   */
13627 				/* the new cow object */
13628 				old_start = entry->vme_start;
13629 				old_end = entry->vme_end;
13630 				cow_parent_vaddr = vaddr;
13631 				mapped_needs_copy = TRUE;
13632 			} else {
13633 				vm_map_lock_read(VME_SUBMAP(entry));
13634 				*var_map = VME_SUBMAP(entry);
13635 				if ((cow_sub_map_parent != map) &&
13636 				    (*real_map != map)) {
13637 					vm_map_unlock(map);
13638 				}
13639 			}
13640 		} else {
13641 			if (entry->needs_copy) {
13642 				submap_needed_copy = TRUE;
13643 			}
13644 			vm_map_lock_read(VME_SUBMAP(entry));
13645 			*var_map = VME_SUBMAP(entry);
13646 			/* leave map locked if it is a target */
13647 			/* cow sub_map above otherwise, just  */
13648 			/* follow the maps down to the object */
13649 			/* here we unlock knowing we are not  */
13650 			/* revisiting the map.  */
13651 			if ((*real_map != map) && (map != cow_sub_map_parent)) {
13652 				vm_map_unlock_read(map);
13653 			}
13654 		}
13655 
13656 		map = *var_map;
13657 
13658 		/* calculate the offset in the submap for vaddr */
13659 		local_vaddr = (local_vaddr - entry->vme_start) + VME_OFFSET(entry);
13660 		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
13661 		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
13662 		    (uint64_t)local_vaddr, (uint64_t)entry->vme_start, (uint64_t)fault_page_mask);
13663 
13664 RetrySubMap:
13665 		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
13666 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13667 				vm_map_unlock(cow_sub_map_parent);
13668 			}
13669 			if ((*real_map != map)
13670 			    && (*real_map != cow_sub_map_parent)) {
13671 				vm_map_unlock(*real_map);
13672 			}
13673 			*real_map = map;
13674 			return KERN_INVALID_ADDRESS;
13675 		}
13676 
13677 		/* find the attenuated shadow of the underlying object */
13678 		/* on our target map */
13679 
13680 		/* in english the submap object may extend beyond the     */
13681 		/* region mapped by the entry or, may only fill a portion */
13682 		/* of it.  For our purposes, we only care if the object   */
13683 		/* doesn't fill.  In this case the area which will        */
13684 		/* ultimately be clipped in the top map will only need    */
13685 		/* to be as big as the portion of the underlying entry    */
13686 		/* which is mapped */
13687 		start_delta = submap_entry->vme_start > VME_OFFSET(entry) ?
13688 		    submap_entry->vme_start - VME_OFFSET(entry) : 0;
13689 
13690 		end_delta =
13691 		    (VME_OFFSET(entry) + start_delta + (old_end - old_start)) <=
13692 		    submap_entry->vme_end ?
13693 		    0 : (VME_OFFSET(entry) +
13694 		    (old_end - old_start))
13695 		    - submap_entry->vme_end;
13696 
13697 		old_start += start_delta;
13698 		old_end -= end_delta;
13699 
13700 		if (submap_entry->is_sub_map) {
13701 			entry = submap_entry;
13702 			vaddr = local_vaddr;
13703 			goto submap_recurse;
13704 		}
13705 
13706 		if (((fault_type & VM_PROT_WRITE) ||
13707 		    force_copy)
13708 		    && cow_sub_map_parent) {
13709 			vm_object_t     sub_object, copy_object;
13710 			vm_object_offset_t copy_offset;
13711 			vm_map_offset_t local_start;
13712 			vm_map_offset_t local_end;
13713 			boolean_t       object_copied = FALSE;
13714 			vm_object_offset_t object_copied_offset = 0;
13715 			boolean_t       object_copied_needs_copy = FALSE;
13716 			kern_return_t   kr = KERN_SUCCESS;
13717 
13718 			if (vm_map_lock_read_to_write(map)) {
13719 				vm_map_lock_read(map);
13720 				old_start -= start_delta;
13721 				old_end += end_delta;
13722 				goto RetrySubMap;
13723 			}
13724 
13725 
13726 			sub_object = VME_OBJECT(submap_entry);
13727 			if (sub_object == VM_OBJECT_NULL) {
13728 				sub_object =
13729 				    vm_object_allocate(
13730 					(vm_map_size_t)
13731 					(submap_entry->vme_end -
13732 					submap_entry->vme_start));
13733 				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
13734 				VME_OFFSET_SET(submap_entry, 0);
13735 				assert(!submap_entry->is_sub_map);
13736 				assert(submap_entry->use_pmap);
13737 			}
13738 			local_start =  local_vaddr -
13739 			    (cow_parent_vaddr - old_start);
13740 			local_end = local_vaddr +
13741 			    (old_end - cow_parent_vaddr);
13742 			vm_map_clip_start(map, submap_entry, local_start);
13743 			vm_map_clip_end(map, submap_entry, local_end);
13744 			if (submap_entry->is_sub_map) {
13745 				/* unnesting was done when clipping */
13746 				assert(!submap_entry->use_pmap);
13747 			}
13748 
13749 			/* This is the COW case, lets connect */
13750 			/* an entry in our space to the underlying */
13751 			/* object in the submap, bypassing the  */
13752 			/* submap. */
13753 			submap_entry_offset = VME_OFFSET(submap_entry);
13754 			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
13755 
13756 			if ((submap_entry->wired_count != 0 ||
13757 			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
13758 			    (submap_entry->protection & VM_PROT_EXECUTE) &&
13759 			    no_force_copy_if_executable) {
13760 //				printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
13761 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13762 					vm_map_unlock(cow_sub_map_parent);
13763 				}
13764 				if ((*real_map != map)
13765 				    && (*real_map != cow_sub_map_parent)) {
13766 					vm_map_unlock(*real_map);
13767 				}
13768 				*real_map = map;
13769 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
13770 				vm_map_lock_write_to_read(map);
13771 				kr = KERN_PROTECTION_FAILURE;
13772 				DTRACE_VM4(submap_no_copy_executable,
13773 				    vm_map_t, map,
13774 				    vm_object_offset_t, submap_entry_offset,
13775 				    vm_object_size_t, submap_entry_size,
13776 				    int, kr);
13777 				return kr;
13778 			}
13779 
13780 			if (submap_entry->wired_count != 0) {
13781 				vm_object_reference(sub_object);
13782 
13783 				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
13784 				    "submap_entry %p offset 0x%llx\n",
13785 				    submap_entry, VME_OFFSET(submap_entry));
13786 
13787 				DTRACE_VM6(submap_copy_slowly,
13788 				    vm_map_t, cow_sub_map_parent,
13789 				    vm_map_offset_t, vaddr,
13790 				    vm_map_t, map,
13791 				    vm_object_size_t, submap_entry_size,
13792 				    int, submap_entry->wired_count,
13793 				    int, sub_object->copy_strategy);
13794 
13795 				saved_submap_entry = submap_entry;
13796 				version.main_timestamp = map->timestamp;
13797 				vm_map_unlock(map); /* Increments timestamp by 1 */
13798 				submap_entry = VM_MAP_ENTRY_NULL;
13799 
13800 				vm_object_lock(sub_object);
13801 				kr = vm_object_copy_slowly(sub_object,
13802 				    submap_entry_offset,
13803 				    submap_entry_size,
13804 				    FALSE,
13805 				    &copy_object);
13806 				object_copied = TRUE;
13807 				object_copied_offset = 0;
13808 				/* 4k: account for extra offset in physical page */
13809 				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
13810 				object_copied_needs_copy = FALSE;
13811 				vm_object_deallocate(sub_object);
13812 
13813 				vm_map_lock(map);
13814 
13815 				if (kr != KERN_SUCCESS &&
13816 				    kr != KERN_MEMORY_RESTART_COPY) {
13817 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13818 						vm_map_unlock(cow_sub_map_parent);
13819 					}
13820 					if ((*real_map != map)
13821 					    && (*real_map != cow_sub_map_parent)) {
13822 						vm_map_unlock(*real_map);
13823 					}
13824 					*real_map = map;
13825 					vm_object_deallocate(copy_object);
13826 					copy_object = VM_OBJECT_NULL;
13827 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
13828 					vm_map_lock_write_to_read(map);
13829 					DTRACE_VM4(submap_copy_error_slowly,
13830 					    vm_object_t, sub_object,
13831 					    vm_object_offset_t, submap_entry_offset,
13832 					    vm_object_size_t, submap_entry_size,
13833 					    int, kr);
13834 					vm_map_lookup_and_lock_object_copy_slowly_error++;
13835 					return kr;
13836 				}
13837 
13838 				if ((kr == KERN_SUCCESS) &&
13839 				    (version.main_timestamp + 1) == map->timestamp) {
13840 					submap_entry = saved_submap_entry;
13841 				} else {
13842 					saved_submap_entry = NULL;
13843 					old_start -= start_delta;
13844 					old_end += end_delta;
13845 					vm_object_deallocate(copy_object);
13846 					copy_object = VM_OBJECT_NULL;
13847 					vm_map_lock_write_to_read(map);
13848 					vm_map_lookup_and_lock_object_copy_slowly_restart++;
13849 					goto RetrySubMap;
13850 				}
13851 				vm_map_lookup_and_lock_object_copy_slowly_count++;
13852 				vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
13853 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
13854 					vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
13855 				}
13856 			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
13857 				submap_entry_offset = VME_OFFSET(submap_entry);
13858 				copy_object = VM_OBJECT_NULL;
13859 				object_copied_offset = submap_entry_offset;
13860 				object_copied_needs_copy = FALSE;
13861 				DTRACE_VM6(submap_copy_strategically,
13862 				    vm_map_t, cow_sub_map_parent,
13863 				    vm_map_offset_t, vaddr,
13864 				    vm_map_t, map,
13865 				    vm_object_size_t, submap_entry_size,
13866 				    int, submap_entry->wired_count,
13867 				    int, sub_object->copy_strategy);
13868 				kr = vm_object_copy_strategically(
13869 					sub_object,
13870 					submap_entry_offset,
13871 					submap_entry->vme_end - submap_entry->vme_start,
13872 					&copy_object,
13873 					&object_copied_offset,
13874 					&object_copied_needs_copy);
13875 				if (kr == KERN_MEMORY_RESTART_COPY) {
13876 					old_start -= start_delta;
13877 					old_end += end_delta;
13878 					vm_object_deallocate(copy_object);
13879 					copy_object = VM_OBJECT_NULL;
13880 					vm_map_lock_write_to_read(map);
13881 					vm_map_lookup_and_lock_object_copy_strategically_restart++;
13882 					goto RetrySubMap;
13883 				}
13884 				if (kr != KERN_SUCCESS) {
13885 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13886 						vm_map_unlock(cow_sub_map_parent);
13887 					}
13888 					if ((*real_map != map)
13889 					    && (*real_map != cow_sub_map_parent)) {
13890 						vm_map_unlock(*real_map);
13891 					}
13892 					*real_map = map;
13893 					vm_object_deallocate(copy_object);
13894 					copy_object = VM_OBJECT_NULL;
13895 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
13896 					vm_map_lock_write_to_read(map);
13897 					DTRACE_VM4(submap_copy_error_strategically,
13898 					    vm_object_t, sub_object,
13899 					    vm_object_offset_t, submap_entry_offset,
13900 					    vm_object_size_t, submap_entry_size,
13901 					    int, kr);
13902 					vm_map_lookup_and_lock_object_copy_strategically_error++;
13903 					return kr;
13904 				}
13905 				assert(copy_object != VM_OBJECT_NULL);
13906 				assert(copy_object != sub_object);
13907 				object_copied = TRUE;
13908 				vm_map_lookup_and_lock_object_copy_strategically_count++;
13909 				vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
13910 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
13911 					vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
13912 				}
13913 			} else {
13914 				/* set up shadow object */
13915 				object_copied = FALSE;
13916 				copy_object = sub_object;
13917 				vm_object_lock(sub_object);
13918 				vm_object_reference_locked(sub_object);
13919 				sub_object->shadowed = TRUE;
13920 				vm_object_unlock(sub_object);
13921 
13922 				assert(submap_entry->wired_count == 0);
13923 				submap_entry->needs_copy = TRUE;
13924 
13925 				prot = submap_entry->protection;
13926 				assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
13927 				prot = prot & ~VM_PROT_WRITE;
13928 				assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
13929 
13930 				if (override_nx(old_map,
13931 				    VME_ALIAS(submap_entry))
13932 				    && prot) {
13933 					prot |= VM_PROT_EXECUTE;
13934 				}
13935 
13936 				vm_object_pmap_protect(
13937 					sub_object,
13938 					VME_OFFSET(submap_entry),
13939 					submap_entry->vme_end -
13940 					submap_entry->vme_start,
13941 					(submap_entry->is_shared
13942 					|| map->mapped_in_other_pmaps) ?
13943 					PMAP_NULL : map->pmap,
13944 					VM_MAP_PAGE_SIZE(map),
13945 					submap_entry->vme_start,
13946 					prot);
13947 				vm_map_lookup_and_lock_object_copy_shadow_count++;
13948 				vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
13949 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
13950 					vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
13951 				}
13952 			}
13953 
13954 			/*
13955 			 * Adjust the fault offset to the submap entry.
13956 			 */
13957 			copy_offset = (local_vaddr -
13958 			    submap_entry->vme_start +
13959 			    VME_OFFSET(submap_entry));
13960 
13961 			/* This works diffently than the   */
13962 			/* normal submap case. We go back  */
13963 			/* to the parent of the cow map and*/
13964 			/* clip out the target portion of  */
13965 			/* the sub_map, substituting the   */
13966 			/* new copy object,                */
13967 
13968 			subentry_protection = submap_entry->protection;
13969 			subentry_max_protection = submap_entry->max_protection;
13970 			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
13971 			subentry_permanent = submap_entry->vme_permanent;
13972 			subentry_pmap_cs_associated = submap_entry->pmap_cs_associated;
13973 
13974 			vm_map_unlock(map);
13975 			submap_entry = NULL; /* not valid after map unlock */
13976 
13977 			local_start = old_start;
13978 			local_end = old_end;
13979 			map = cow_sub_map_parent;
13980 			*var_map = cow_sub_map_parent;
13981 			vaddr = cow_parent_vaddr;
13982 			cow_sub_map_parent = NULL;
13983 
13984 			if (!vm_map_lookup_entry(map,
13985 			    vaddr, &entry)) {
13986 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13987 					vm_map_unlock(cow_sub_map_parent);
13988 				}
13989 				if ((*real_map != map)
13990 				    && (*real_map != cow_sub_map_parent)) {
13991 					vm_map_unlock(*real_map);
13992 				}
13993 				*real_map = map;
13994 				vm_object_deallocate(
13995 					copy_object);
13996 				copy_object = VM_OBJECT_NULL;
13997 				vm_map_lock_write_to_read(map);
13998 				DTRACE_VM4(submap_lookup_post_unlock,
13999 				    uint64_t, (uint64_t)entry->vme_start,
14000 				    uint64_t, (uint64_t)entry->vme_end,
14001 				    vm_map_offset_t, vaddr,
14002 				    int, object_copied);
14003 				return KERN_INVALID_ADDRESS;
14004 			}
14005 
14006 			/* clip out the portion of space */
14007 			/* mapped by the sub map which   */
14008 			/* corresponds to the underlying */
14009 			/* object */
14010 
14011 			/*
14012 			 * Clip (and unnest) the smallest nested chunk
14013 			 * possible around the faulting address...
14014 			 */
14015 			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14016 			local_end = local_start + pmap_shared_region_size_min(map->pmap);
14017 			/*
14018 			 * ... but don't go beyond the "old_start" to "old_end"
14019 			 * range, to avoid spanning over another VM region
14020 			 * with a possibly different VM object and/or offset.
14021 			 */
14022 			if (local_start < old_start) {
14023 				local_start = old_start;
14024 			}
14025 			if (local_end > old_end) {
14026 				local_end = old_end;
14027 			}
14028 			/*
14029 			 * Adjust copy_offset to the start of the range.
14030 			 */
14031 			copy_offset -= (vaddr - local_start);
14032 
14033 			vm_map_clip_start(map, entry, local_start);
14034 			vm_map_clip_end(map, entry, local_end);
14035 			if (entry->is_sub_map) {
14036 				/* unnesting was done when clipping */
14037 				assert(!entry->use_pmap);
14038 			}
14039 
14040 			/* substitute copy object for */
14041 			/* shared map entry           */
14042 			vm_map_deallocate(VME_SUBMAP(entry));
14043 			assert(!entry->iokit_acct);
14044 			entry->use_pmap = TRUE;
14045 			VME_OBJECT_SET(entry, copy_object, false, 0);
14046 
14047 			/* propagate the submap entry's protections */
14048 			if (entry->protection != VM_PROT_READ) {
14049 				/*
14050 				 * Someone has already altered the top entry's
14051 				 * protections via vm_protect(VM_PROT_COPY).
14052 				 * Respect these new values and ignore the
14053 				 * submap entry's protections.
14054 				 */
14055 			} else {
14056 				/*
14057 				 * Regular copy-on-write: propagate the submap
14058 				 * entry's protections to the top map entry.
14059 				 */
14060 				entry->protection |= subentry_protection;
14061 			}
14062 			entry->max_protection |= subentry_max_protection;
14063 			/* propagate some attributes from subentry */
14064 			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14065 			entry->vme_permanent = subentry_permanent;
14066 			entry->pmap_cs_associated = subentry_pmap_cs_associated;
14067 
14068 			if ((entry->protection & VM_PROT_WRITE) &&
14069 			    (entry->protection & VM_PROT_EXECUTE) &&
14070 #if XNU_TARGET_OS_OSX
14071 			    map->pmap != kernel_pmap &&
14072 			    (vm_map_cs_enforcement(map)
14073 #if __arm64__
14074 			    || !VM_MAP_IS_EXOTIC(map)
14075 #endif /* __arm64__ */
14076 			    ) &&
14077 #endif /* XNU_TARGET_OS_OSX */
14078 			    !(entry->used_for_jit) &&
14079 			    VM_MAP_POLICY_WX_STRIP_X(map)) {
14080 				DTRACE_VM3(cs_wx,
14081 				    uint64_t, (uint64_t)entry->vme_start,
14082 				    uint64_t, (uint64_t)entry->vme_end,
14083 				    vm_prot_t, entry->protection);
14084 				printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
14085 				    proc_selfpid(),
14086 				    (get_bsdtask_info(current_task())
14087 				    ? proc_name_address(get_bsdtask_info(current_task()))
14088 				    : "?"),
14089 				    __FUNCTION__);
14090 				entry->protection &= ~VM_PROT_EXECUTE;
14091 			}
14092 
14093 			if (object_copied) {
14094 				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14095 				entry->needs_copy = object_copied_needs_copy;
14096 				entry->is_shared = FALSE;
14097 			} else {
14098 				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14099 				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14100 				assert(entry->wired_count == 0);
14101 				VME_OFFSET_SET(entry, copy_offset);
14102 				entry->needs_copy = TRUE;
14103 				if (map != old_map) {
14104 					entry->is_shared = TRUE;
14105 				}
14106 			}
14107 			if (entry->inheritance == VM_INHERIT_SHARE) {
14108 				entry->inheritance = VM_INHERIT_COPY;
14109 			}
14110 
14111 			vm_map_lock_write_to_read(map);
14112 		} else {
14113 			if ((cow_sub_map_parent)
14114 			    && (cow_sub_map_parent != *real_map)
14115 			    && (cow_sub_map_parent != map)) {
14116 				vm_map_unlock(cow_sub_map_parent);
14117 			}
14118 			entry = submap_entry;
14119 			vaddr = local_vaddr;
14120 		}
14121 	}
14122 
14123 	/*
14124 	 *	Check whether this task is allowed to have
14125 	 *	this page.
14126 	 */
14127 
14128 	prot = entry->protection;
14129 
14130 	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14131 		/*
14132 		 * HACK -- if not a stack, then allow execution
14133 		 */
14134 		prot |= VM_PROT_EXECUTE;
14135 	}
14136 
14137 	if (mask_protections) {
14138 		fault_type &= prot;
14139 		if (fault_type == VM_PROT_NONE) {
14140 			goto protection_failure;
14141 		}
14142 	}
14143 	if (((fault_type & prot) != fault_type)
14144 #if __arm64__
14145 	    /* prefetch abort in execute-only page */
14146 	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14147 #elif defined(__x86_64__)
14148 	    /* Consider the UEXEC bit when handling an EXECUTE fault */
14149 	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14150 #endif
14151 	    ) {
14152 protection_failure:
14153 		if (*real_map != map) {
14154 			vm_map_unlock(*real_map);
14155 		}
14156 		*real_map = map;
14157 
14158 		if ((fault_type & VM_PROT_EXECUTE) && prot) {
14159 			log_stack_execution_failure((addr64_t)vaddr, prot);
14160 		}
14161 
14162 		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14163 		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14164 		/*
14165 		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14166 		 *
14167 		 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14168 		 */
14169 		return KERN_PROTECTION_FAILURE;
14170 	}
14171 
14172 	/*
14173 	 *	If this page is not pageable, we have to get
14174 	 *	it for all possible accesses.
14175 	 */
14176 
14177 	*wired = (entry->wired_count != 0);
14178 	if (*wired) {
14179 		fault_type = prot;
14180 	}
14181 
14182 	/*
14183 	 *	If the entry was copy-on-write, we either ...
14184 	 */
14185 
14186 	if (entry->needs_copy) {
14187 		/*
14188 		 *	If we want to write the page, we may as well
14189 		 *	handle that now since we've got the map locked.
14190 		 *
14191 		 *	If we don't need to write the page, we just
14192 		 *	demote the permissions allowed.
14193 		 */
14194 
14195 		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14196 			/*
14197 			 *	Make a new object, and place it in the
14198 			 *	object chain.  Note that no new references
14199 			 *	have appeared -- one just moved from the
14200 			 *	map to the new object.
14201 			 */
14202 
14203 			if (vm_map_lock_read_to_write(map)) {
14204 				vm_map_lock_read(map);
14205 				goto RetryLookup;
14206 			}
14207 
14208 			if (VME_OBJECT(entry)->shadowed == FALSE) {
14209 				vm_object_lock(VME_OBJECT(entry));
14210 				VME_OBJECT(entry)->shadowed = TRUE;
14211 				vm_object_unlock(VME_OBJECT(entry));
14212 			}
14213 			VME_OBJECT_SHADOW(entry,
14214 			    (vm_map_size_t) (entry->vme_end -
14215 			    entry->vme_start),
14216 			    vm_map_always_shadow(map));
14217 			entry->needs_copy = FALSE;
14218 
14219 			vm_map_lock_write_to_read(map);
14220 		}
14221 		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14222 			/*
14223 			 *	We're attempting to read a copy-on-write
14224 			 *	page -- don't allow writes.
14225 			 */
14226 
14227 			prot &= (~VM_PROT_WRITE);
14228 		}
14229 	}
14230 
14231 	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14232 		/*
14233 		 * We went through a "needs_copy" submap without triggering
14234 		 * a copy, so granting write access to the page would bypass
14235 		 * that submap's "needs_copy".
14236 		 */
14237 		assert(!(fault_type & VM_PROT_WRITE));
14238 		assert(!*wired);
14239 		assert(!force_copy);
14240 		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14241 		prot &= ~VM_PROT_WRITE;
14242 	}
14243 
14244 	/*
14245 	 *	Create an object if necessary.
14246 	 */
14247 	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14248 		if (vm_map_lock_read_to_write(map)) {
14249 			vm_map_lock_read(map);
14250 			goto RetryLookup;
14251 		}
14252 
14253 		VME_OBJECT_SET(entry,
14254 		    vm_object_allocate(
14255 			    (vm_map_size_t)(entry->vme_end -
14256 			    entry->vme_start)), false, 0);
14257 		VME_OFFSET_SET(entry, 0);
14258 		assert(entry->use_pmap);
14259 		vm_map_lock_write_to_read(map);
14260 	}
14261 
14262 	/*
14263 	 *	Return the object/offset from this entry.  If the entry
14264 	 *	was copy-on-write or empty, it has been fixed up.  Also
14265 	 *	return the protection.
14266 	 */
14267 
14268 	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14269 	*object = VME_OBJECT(entry);
14270 	*out_prot = prot;
14271 	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14272 
14273 	if (fault_info) {
14274 		fault_info->interruptible = THREAD_UNINT; /* for now... */
14275 		/* ... the caller will change "interruptible" if needed */
14276 		fault_info->cluster_size = 0;
14277 		fault_info->user_tag = VME_ALIAS(entry);
14278 		fault_info->pmap_options = 0;
14279 		if (entry->iokit_acct ||
14280 		    (!entry->is_sub_map && !entry->use_pmap)) {
14281 			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14282 		}
14283 		fault_info->behavior = entry->behavior;
14284 		fault_info->lo_offset = VME_OFFSET(entry);
14285 		fault_info->hi_offset =
14286 		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14287 		fault_info->no_cache  = entry->no_cache;
14288 		fault_info->stealth = FALSE;
14289 		fault_info->io_sync = FALSE;
14290 		if (entry->used_for_jit ||
14291 		    entry->vme_resilient_codesign) {
14292 			fault_info->cs_bypass = TRUE;
14293 		} else {
14294 			fault_info->cs_bypass = FALSE;
14295 		}
14296 		fault_info->pmap_cs_associated = FALSE;
14297 #if CONFIG_PMAP_CS
14298 		if (entry->pmap_cs_associated) {
14299 			/*
14300 			 * The pmap layer will validate this page
14301 			 * before allowing it to be executed from.
14302 			 */
14303 			fault_info->pmap_cs_associated = TRUE;
14304 		}
14305 #endif /* CONFIG_PMAP_CS */
14306 		fault_info->mark_zf_absent = FALSE;
14307 		fault_info->batch_pmap_op = FALSE;
14308 		fault_info->resilient_media = entry->vme_resilient_media;
14309 		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14310 		if (entry->translated_allow_execute) {
14311 			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14312 		}
14313 	}
14314 
14315 	/*
14316 	 *	Lock the object to prevent it from disappearing
14317 	 */
14318 	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14319 		if (contended == NULL) {
14320 			vm_object_lock(*object);
14321 		} else {
14322 			*contended = vm_object_lock_check_contended(*object);
14323 		}
14324 	} else {
14325 		vm_object_lock_shared(*object);
14326 	}
14327 
14328 	/*
14329 	 *	Save the version number
14330 	 */
14331 
14332 	out_version->main_timestamp = map->timestamp;
14333 
14334 	return KERN_SUCCESS;
14335 }
14336 
14337 
14338 /*
14339  *	vm_map_verify:
14340  *
14341  *	Verifies that the map in question has not changed
14342  *	since the given version. The map has to be locked
14343  *	("shared" mode is fine) before calling this function
14344  *	and it will be returned locked too.
14345  */
14346 boolean_t
vm_map_verify(vm_map_t map,vm_map_version_t * version)14347 vm_map_verify(
14348 	vm_map_t                map,
14349 	vm_map_version_t        *version)       /* REF */
14350 {
14351 	boolean_t       result;
14352 
14353 	vm_map_lock_assert_held(map);
14354 	result = (map->timestamp == version->main_timestamp);
14355 
14356 	return result;
14357 }
14358 
14359 /*
14360  *	TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14361  *	Goes away after regular vm_region_recurse function migrates to
14362  *	64 bits
14363  *	vm_region_recurse: A form of vm_region which follows the
14364  *	submaps in a target map
14365  *
14366  */
14367 
kern_return_t
vm_map_region_recurse_64(
	vm_map_t                 map,
	vm_map_offset_t *address,               /* IN/OUT */
	vm_map_size_t           *size,                  /* OUT */
	natural_t               *nesting_depth, /* IN/OUT */
	vm_region_submap_info_64_t      submap_info,    /* IN/OUT */
	mach_msg_type_number_t  *count) /* IN/OUT */
{
	mach_msg_type_number_t  original_count;
	vm_region_extended_info_data_t  extended;
	vm_map_entry_t                  tmp_entry;
	vm_map_offset_t                 user_address;
	unsigned int                    user_max_depth;

	/*
	 * "curr_entry" is the VM map entry preceding or including the
	 * address we're looking for.
	 * "curr_map" is the map or sub-map containing "curr_entry".
	 * "curr_address" is the equivalent of the top map's "user_address"
	 * in the current map.
	 * "curr_offset" is the cumulated offset of "curr_map" in the
	 * target task's address space.
	 * "curr_depth" is the depth of "curr_map" in the chain of
	 * sub-maps.
	 *
	 * "curr_max_below" and "curr_max_above" limit the range (around
	 * "curr_address") we should take into account in the current (sub)map.
	 * They limit the range to what's visible through the map entries
	 * we've traversed from the top map to the current map.
	 *
	 */
	vm_map_entry_t                  curr_entry;
	vm_map_address_t                curr_address;
	vm_map_offset_t                 curr_offset;
	vm_map_t                        curr_map;
	unsigned int                    curr_depth;
	vm_map_offset_t                 curr_max_below, curr_max_above;
	vm_map_offset_t                 curr_skip;

	/*
	 * "next_" is the same as "curr_" but for the VM region immediately
	 * after the address we're looking for.  We need to keep track of this
	 * too because we want to return info about that region if the
	 * address we're looking for is not mapped.
	 */
	vm_map_entry_t                  next_entry;
	vm_map_offset_t                 next_offset;
	vm_map_offset_t                 next_address;
	vm_map_t                        next_map;
	unsigned int                    next_depth;
	vm_map_offset_t                 next_max_below, next_max_above;
	vm_map_offset_t                 next_skip;

	boolean_t                       look_for_pages;
	vm_region_submap_short_info_64_t short_info;
	boolean_t                       do_region_footprint;
	int                             effective_page_size, effective_page_shift;
	boolean_t                       submap_needed_copy;

	if (map == VM_MAP_NULL) {
		/* no address space to work on */
		return KERN_INVALID_ARGUMENT;
	}

	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);

	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
		/*
		 * "info" structure is not big enough and
		 * would overflow
		 */
		return KERN_INVALID_ARGUMENT;
	}

	do_region_footprint = task_self_region_footprint();
	original_count = *count;

	/*
	 * Pick the output flavor based on the size of the caller's buffer:
	 * a short info structure, or the full (versioned) submap info.
	 * Exactly one of "short_info" / "submap_info" is non-NULL below.
	 */
	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
		look_for_pages = FALSE;
		short_info = (vm_region_submap_short_info_64_t) submap_info;
		submap_info = NULL;
	} else {
		look_for_pages = TRUE;
		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
		short_info = NULL;

		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
		}
		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
		}
	}

	user_address = *address;
	user_max_depth = *nesting_depth;
	submap_needed_copy = FALSE;

	/* only take map locks outside of the kernel-debugger context */
	if (not_in_kdp) {
		vm_map_lock_read(map);
	}

recurse_again:
	/* (re)start the traversal from the top map at "user_address" */
	curr_entry = NULL;
	curr_map = map;
	curr_address = user_address;
	curr_offset = 0;
	curr_skip = 0;
	curr_depth = 0;
	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
	curr_max_below = curr_address;

	next_entry = NULL;
	next_map = NULL;
	next_address = 0;
	next_offset = 0;
	next_skip = 0;
	next_depth = 0;
	next_max_above = (vm_map_offset_t) -1;
	next_max_below = (vm_map_offset_t) -1;

	for (;;) {
		if (vm_map_lookup_entry(curr_map,
		    curr_address,
		    &tmp_entry)) {
			/* tmp_entry contains the address we're looking for */
			curr_entry = tmp_entry;
		} else {
			vm_map_offset_t skip;
			/*
			 * The address is not mapped.  "tmp_entry" is the
			 * map entry preceding the address.  We want the next
			 * one, if it exists.
			 */
			curr_entry = tmp_entry->vme_next;

			if (curr_entry == vm_map_to_entry(curr_map) ||
			    (curr_entry->vme_start >=
			    curr_address + curr_max_above)) {
				/* no next entry at this level: stop looking */
				if (not_in_kdp) {
					vm_map_unlock_read(curr_map);
				}
				curr_entry = NULL;
				curr_map = NULL;
				curr_skip = 0;
				curr_offset = 0;
				curr_depth = 0;
				curr_max_above = 0;
				curr_max_below = 0;
				break;
			}

			/* adjust current address and offset */
			skip = curr_entry->vme_start - curr_address;
			curr_address = curr_entry->vme_start;
			curr_skip += skip;
			curr_offset += skip;
			curr_max_above -= skip;
			curr_max_below = 0;
		}

		/*
		 * Is the next entry at this level closer to the address (or
		 * deeper in the submap chain) than the one we had
		 * so far ?
		 */
		tmp_entry = curr_entry->vme_next;
		if (tmp_entry == vm_map_to_entry(curr_map)) {
			/* no next entry at this level */
		} else if (tmp_entry->vme_start >=
		    curr_address + curr_max_above) {
			/*
			 * tmp_entry is beyond the scope of what we mapped of
			 * this submap in the upper level: ignore it.
			 */
		} else if ((next_entry == NULL) ||
		    (tmp_entry->vme_start + curr_offset <=
		    next_entry->vme_start + next_offset)) {
			/*
			 * We didn't have a "next_entry" or this one is
			 * closer to the address we're looking for:
			 * use this "tmp_entry" as the new "next_entry".
			 */
			if (next_entry != NULL) {
				/* unlock the last "next_map" */
				if (next_map != curr_map && not_in_kdp) {
					vm_map_unlock_read(next_map);
				}
			}
			next_entry = tmp_entry;
			next_map = curr_map;
			next_depth = curr_depth;
			next_address = next_entry->vme_start;
			next_skip = curr_skip;
			next_skip += (next_address - curr_address);
			next_offset = curr_offset;
			next_offset += (next_address - curr_address);
			next_max_above = MIN(next_max_above, curr_max_above);
			next_max_above = MIN(next_max_above,
			    next_entry->vme_end - next_address);
			next_max_below = MIN(next_max_below, curr_max_below);
			next_max_below = MIN(next_max_below,
			    next_address - next_entry->vme_start);
		}

		/*
		 * "curr_max_{above,below}" allow us to keep track of the
		 * portion of the submap that is actually mapped at this level:
		 * the rest of that submap is irrelevant to us, since it's not
		 * mapped here.
		 * The relevant portion of the map starts at
		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
		 */
		curr_max_above = MIN(curr_max_above,
		    curr_entry->vme_end - curr_address);
		curr_max_below = MIN(curr_max_below,
		    curr_address - curr_entry->vme_start);

		if (!curr_entry->is_sub_map ||
		    curr_depth >= user_max_depth) {
			/*
			 * We hit a leaf map or we reached the maximum depth
			 * we could, so stop looking.  Keep the current map
			 * locked.
			 */
			break;
		}

		/*
		 * Get down to the next submap level.
		 */

		if (curr_entry->needs_copy) {
			/* everything below this is effectively copy-on-write */
			submap_needed_copy = TRUE;
		}

		/*
		 * Lock the next level and unlock the current level,
		 * unless we need to keep it locked to access the "next_entry"
		 * later.
		 */
		if (not_in_kdp) {
			vm_map_lock_read(VME_SUBMAP(curr_entry));
		}
		if (curr_map == next_map) {
			/* keep "next_map" locked in case we need it */
		} else {
			/* release this map */
			if (not_in_kdp) {
				vm_map_unlock_read(curr_map);
			}
		}

		/*
		 * Adjust the offset.  "curr_entry" maps the submap
		 * at relative address "curr_entry->vme_start" in the
		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
		 * bytes of the submap.
		 * "curr_offset" always represents the offset of a virtual
		 * address in the curr_map relative to the absolute address
		 * space (i.e. the top-level VM map).
		 */
		curr_offset +=
		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
		curr_address = user_address + curr_offset;
		/* switch to the submap */
		curr_map = VME_SUBMAP(curr_entry);
		curr_depth++;
		curr_entry = NULL;
	}

// LP64todo: all the current tools are 32bit, obviously never worked for 64b
// so probably should be a real 32b ID vs. ptr.
// Current users just check for equality

	if (curr_entry == NULL) {
		/* no VM region contains the address... */

		if (do_region_footprint && /* we want footprint numbers */
		    next_entry == NULL && /* & there are no more regions */
		    /* & we haven't already provided our fake region: */
		    user_address <= vm_map_last_entry(map)->vme_end) {
			ledger_amount_t ledger_resident, ledger_compressed;

			/*
			 * Add a fake memory region to account for
			 * purgeable and/or ledger-tagged memory that
			 * counts towards this task's memory footprint,
			 * i.e. the resident/compressed pages of non-volatile
			 * objects owned by that task.
			 */
			task_ledgers_footprint(map->pmap->ledger,
			    &ledger_resident,
			    &ledger_compressed);
			if (ledger_resident + ledger_compressed == 0) {
				/*
				 * no purgeable memory usage to report
				 * NOTE(review): with no "curr_entry" and no
				 * "next_entry", the loop above has already
				 * dropped its map locks — confirm no read
				 * lock is still held on this path.
				 */
				return KERN_INVALID_ADDRESS;
			}
			/* fake region to show nonvolatile footprint */
			if (look_for_pages) {
				submap_info->protection = VM_PROT_DEFAULT;
				submap_info->max_protection = VM_PROT_DEFAULT;
				submap_info->inheritance = VM_INHERIT_DEFAULT;
				submap_info->offset = 0;
				submap_info->user_tag = -1;
				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
				submap_info->pages_shared_now_private = 0;
				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
				submap_info->pages_dirtied = submap_info->pages_resident;
				submap_info->ref_count = 1;
				submap_info->shadow_depth = 0;
				submap_info->external_pager = 0;
				submap_info->share_mode = SM_PRIVATE;
				if (submap_needed_copy) {
					submap_info->share_mode = SM_COW;
				}
				submap_info->is_submap = 0;
				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				submap_info->user_wired_count = 0;
				submap_info->pages_reusable = 0;
			} else {
				short_info->user_tag = -1;
				short_info->offset = 0;
				short_info->protection = VM_PROT_DEFAULT;
				short_info->inheritance = VM_INHERIT_DEFAULT;
				short_info->max_protection = VM_PROT_DEFAULT;
				short_info->behavior = VM_BEHAVIOR_DEFAULT;
				short_info->user_wired_count = 0;
				short_info->is_submap = 0;
				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				short_info->external_pager = 0;
				short_info->shadow_depth = 0;
				short_info->share_mode = SM_PRIVATE;
				if (submap_needed_copy) {
					short_info->share_mode = SM_COW;
				}
				short_info->ref_count = 1;
			}
			*nesting_depth = 0;
			*size = (vm_map_size_t) (ledger_resident + ledger_compressed);
//			*address = user_address;
			*address = vm_map_last_entry(map)->vme_end;
			return KERN_SUCCESS;
		}

		if (next_entry == NULL) {
			/* ... and no VM region follows it either */
			return KERN_INVALID_ADDRESS;
		}
		/* ... gather info about the next VM region */
		curr_entry = next_entry;
		curr_map = next_map;    /* still locked ... */
		curr_address = next_address;
		curr_skip = next_skip;
		curr_offset = next_offset;
		curr_depth = next_depth;
		curr_max_above = next_max_above;
		curr_max_below = next_max_below;
	} else {
		/* we won't need "next_entry" after all */
		if (next_entry != NULL) {
			/* release "next_map" */
			if (next_map != curr_map && not_in_kdp) {
				vm_map_unlock_read(next_map);
			}
		}
	}
	next_entry = NULL;
	next_map = NULL;
	next_offset = 0;
	next_skip = 0;
	next_depth = 0;
	next_max_below = -1;
	next_max_above = -1;

	if (curr_entry->is_sub_map &&
	    curr_depth < user_max_depth) {
		/*
		 * We're not as deep as we could be:  we must have
		 * gone back up after not finding anything mapped
		 * below the original top-level map entry's.
		 * Let's move "curr_address" forward and recurse again.
		 */
		user_address = curr_address;
		goto recurse_again;
	}

	/* report the region we settled on; "curr_map" is still locked */
	*nesting_depth = curr_depth;
	*size = curr_max_above + curr_max_below;
	*address = user_address + curr_skip - curr_max_below;

	if (look_for_pages) {
		submap_info->user_tag = VME_ALIAS(curr_entry);
		submap_info->offset = VME_OFFSET(curr_entry);
		submap_info->protection = curr_entry->protection;
		submap_info->inheritance = curr_entry->inheritance;
		submap_info->max_protection = curr_entry->max_protection;
		submap_info->behavior = curr_entry->behavior;
		submap_info->user_wired_count = curr_entry->user_wired_count;
		submap_info->is_submap = curr_entry->is_sub_map;
		if (curr_entry->is_sub_map) {
			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
		} else {
			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
		}
	} else {
		short_info->user_tag = VME_ALIAS(curr_entry);
		short_info->offset = VME_OFFSET(curr_entry);
		short_info->protection = curr_entry->protection;
		short_info->inheritance = curr_entry->inheritance;
		short_info->max_protection = curr_entry->max_protection;
		short_info->behavior = curr_entry->behavior;
		short_info->user_wired_count = curr_entry->user_wired_count;
		short_info->is_submap = curr_entry->is_sub_map;
		if (curr_entry->is_sub_map) {
			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
		} else {
			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
		}
	}

	extended.pages_resident = 0;
	extended.pages_swapped_out = 0;
	extended.pages_shared_now_private = 0;
	extended.pages_dirtied = 0;
	extended.pages_reusable = 0;
	extended.external_pager = 0;
	extended.shadow_depth = 0;
	extended.share_mode = SM_EMPTY;
	extended.ref_count = 0;

	if (not_in_kdp) {
		if (!curr_entry->is_sub_map) {
			vm_map_offset_t range_start, range_end;
			/* only walk the part of the entry visible at this level */
			range_start = MAX((curr_address - curr_max_below),
			    curr_entry->vme_start);
			range_end = MIN((curr_address + curr_max_above),
			    curr_entry->vme_end);
			vm_map_region_walk(curr_map,
			    range_start,
			    curr_entry,
			    (VME_OFFSET(curr_entry) +
			    (range_start -
			    curr_entry->vme_start)),
			    range_end - range_start,
			    &extended,
			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
			if (extended.external_pager &&
			    extended.ref_count == 2 &&
			    extended.share_mode == SM_SHARED) {
				extended.share_mode = SM_PRIVATE;
			}
			if (submap_needed_copy) {
				extended.share_mode = SM_COW;
			}
		} else {
			if (curr_entry->use_pmap) {
				extended.share_mode = SM_TRUESHARED;
			} else {
				extended.share_mode = SM_PRIVATE;
			}
			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
		}
	}

	if (look_for_pages) {
		submap_info->pages_resident = extended.pages_resident;
		submap_info->pages_swapped_out = extended.pages_swapped_out;
		submap_info->pages_shared_now_private =
		    extended.pages_shared_now_private;
		submap_info->pages_dirtied = extended.pages_dirtied;
		submap_info->external_pager = extended.external_pager;
		submap_info->shadow_depth = extended.shadow_depth;
		submap_info->share_mode = extended.share_mode;
		submap_info->ref_count = extended.ref_count;

		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
			submap_info->pages_reusable = extended.pages_reusable;
		}
		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
			if (curr_entry->is_sub_map) {
				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_SUBMAP(curr_entry));
			} else if (VME_OBJECT(curr_entry)) {
				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_OBJECT(curr_entry));
			} else {
				submap_info->object_id_full = 0ull;
			}
		}
	} else {
		short_info->external_pager = extended.external_pager;
		short_info->shadow_depth = extended.shadow_depth;
		short_info->share_mode = extended.share_mode;
		short_info->ref_count = extended.ref_count;
	}

	if (not_in_kdp) {
		vm_map_unlock_read(curr_map);
	}

	return KERN_SUCCESS;
}
14875 
14876 /*
14877  *	vm_region:
14878  *
14879  *	User call to obtain information about a region in
14880  *	a task's address map. Currently, only one flavor is
14881  *	supported.
14882  *
14883  *	XXX The reserved and behavior fields cannot be filled
14884  *	    in until the vm merge from the IK is completed, and
14885  *	    vm_reserve is implemented.
14886  */
14887 
14888 kern_return_t
vm_map_region(vm_map_t map,vm_map_offset_t * address,vm_map_size_t * size,vm_region_flavor_t flavor,vm_region_info_t info,mach_msg_type_number_t * count,mach_port_t * object_name)14889 vm_map_region(
14890 	vm_map_t                 map,
14891 	vm_map_offset_t *address,               /* IN/OUT */
14892 	vm_map_size_t           *size,                  /* OUT */
14893 	vm_region_flavor_t       flavor,                /* IN */
14894 	vm_region_info_t         info,                  /* OUT */
14895 	mach_msg_type_number_t  *count, /* IN/OUT */
14896 	mach_port_t             *object_name)           /* OUT */
14897 {
14898 	vm_map_entry_t          tmp_entry;
14899 	vm_map_entry_t          entry;
14900 	vm_map_offset_t         start;
14901 
14902 	if (map == VM_MAP_NULL) {
14903 		return KERN_INVALID_ARGUMENT;
14904 	}
14905 
14906 	switch (flavor) {
14907 	case VM_REGION_BASIC_INFO:
14908 		/* legacy for old 32-bit objects info */
14909 	{
14910 		vm_region_basic_info_t  basic;
14911 
14912 		if (*count < VM_REGION_BASIC_INFO_COUNT) {
14913 			return KERN_INVALID_ARGUMENT;
14914 		}
14915 
14916 		basic = (vm_region_basic_info_t) info;
14917 		*count = VM_REGION_BASIC_INFO_COUNT;
14918 
14919 		vm_map_lock_read(map);
14920 
14921 		start = *address;
14922 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
14923 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
14924 				vm_map_unlock_read(map);
14925 				return KERN_INVALID_ADDRESS;
14926 			}
14927 		} else {
14928 			entry = tmp_entry;
14929 		}
14930 
14931 		start = entry->vme_start;
14932 
14933 		basic->offset = (uint32_t)VME_OFFSET(entry);
14934 		basic->protection = entry->protection;
14935 		basic->inheritance = entry->inheritance;
14936 		basic->max_protection = entry->max_protection;
14937 		basic->behavior = entry->behavior;
14938 		basic->user_wired_count = entry->user_wired_count;
14939 		basic->reserved = entry->is_sub_map;
14940 		*address = start;
14941 		*size = (entry->vme_end - start);
14942 
14943 		if (object_name) {
14944 			*object_name = IP_NULL;
14945 		}
14946 		if (entry->is_sub_map) {
14947 			basic->shared = FALSE;
14948 		} else {
14949 			basic->shared = entry->is_shared;
14950 		}
14951 
14952 		vm_map_unlock_read(map);
14953 		return KERN_SUCCESS;
14954 	}
14955 
14956 	case VM_REGION_BASIC_INFO_64:
14957 	{
14958 		vm_region_basic_info_64_t       basic;
14959 
14960 		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
14961 			return KERN_INVALID_ARGUMENT;
14962 		}
14963 
14964 		basic = (vm_region_basic_info_64_t) info;
14965 		*count = VM_REGION_BASIC_INFO_COUNT_64;
14966 
14967 		vm_map_lock_read(map);
14968 
14969 		start = *address;
14970 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
14971 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
14972 				vm_map_unlock_read(map);
14973 				return KERN_INVALID_ADDRESS;
14974 			}
14975 		} else {
14976 			entry = tmp_entry;
14977 		}
14978 
14979 		start = entry->vme_start;
14980 
14981 		basic->offset = VME_OFFSET(entry);
14982 		basic->protection = entry->protection;
14983 		basic->inheritance = entry->inheritance;
14984 		basic->max_protection = entry->max_protection;
14985 		basic->behavior = entry->behavior;
14986 		basic->user_wired_count = entry->user_wired_count;
14987 		basic->reserved = entry->is_sub_map;
14988 		*address = start;
14989 		*size = (entry->vme_end - start);
14990 
14991 		if (object_name) {
14992 			*object_name = IP_NULL;
14993 		}
14994 		if (entry->is_sub_map) {
14995 			basic->shared = FALSE;
14996 		} else {
14997 			basic->shared = entry->is_shared;
14998 		}
14999 
15000 		vm_map_unlock_read(map);
15001 		return KERN_SUCCESS;
15002 	}
15003 	case VM_REGION_EXTENDED_INFO:
15004 		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15005 			return KERN_INVALID_ARGUMENT;
15006 		}
15007 		OS_FALLTHROUGH;
15008 	case VM_REGION_EXTENDED_INFO__legacy:
15009 		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15010 			return KERN_INVALID_ARGUMENT;
15011 		}
15012 
15013 		{
15014 			vm_region_extended_info_t       extended;
15015 			mach_msg_type_number_t original_count;
15016 			int effective_page_size, effective_page_shift;
15017 
15018 			extended = (vm_region_extended_info_t) info;
15019 
15020 			effective_page_shift = vm_self_region_page_shift(map);
15021 			effective_page_size = (1 << effective_page_shift);
15022 
15023 			vm_map_lock_read(map);
15024 
15025 			start = *address;
15026 			if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15027 				if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15028 					vm_map_unlock_read(map);
15029 					return KERN_INVALID_ADDRESS;
15030 				}
15031 			} else {
15032 				entry = tmp_entry;
15033 			}
15034 			start = entry->vme_start;
15035 
15036 			extended->protection = entry->protection;
15037 			extended->user_tag = VME_ALIAS(entry);
15038 			extended->pages_resident = 0;
15039 			extended->pages_swapped_out = 0;
15040 			extended->pages_shared_now_private = 0;
15041 			extended->pages_dirtied = 0;
15042 			extended->external_pager = 0;
15043 			extended->shadow_depth = 0;
15044 
15045 			original_count = *count;
15046 			if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15047 				*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15048 			} else {
15049 				extended->pages_reusable = 0;
15050 				*count = VM_REGION_EXTENDED_INFO_COUNT;
15051 			}
15052 
15053 			vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15054 
15055 			if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
15056 				extended->share_mode = SM_PRIVATE;
15057 			}
15058 
15059 			if (object_name) {
15060 				*object_name = IP_NULL;
15061 			}
15062 			*address = start;
15063 			*size = (entry->vme_end - start);
15064 
15065 			vm_map_unlock_read(map);
15066 			return KERN_SUCCESS;
15067 		}
15068 	case VM_REGION_TOP_INFO:
15069 	{
15070 		vm_region_top_info_t    top;
15071 
15072 		if (*count < VM_REGION_TOP_INFO_COUNT) {
15073 			return KERN_INVALID_ARGUMENT;
15074 		}
15075 
15076 		top = (vm_region_top_info_t) info;
15077 		*count = VM_REGION_TOP_INFO_COUNT;
15078 
15079 		vm_map_lock_read(map);
15080 
15081 		start = *address;
15082 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15083 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15084 				vm_map_unlock_read(map);
15085 				return KERN_INVALID_ADDRESS;
15086 			}
15087 		} else {
15088 			entry = tmp_entry;
15089 		}
15090 		start = entry->vme_start;
15091 
15092 		top->private_pages_resident = 0;
15093 		top->shared_pages_resident = 0;
15094 
15095 		vm_map_region_top_walk(entry, top);
15096 
15097 		if (object_name) {
15098 			*object_name = IP_NULL;
15099 		}
15100 		*address = start;
15101 		*size = (entry->vme_end - start);
15102 
15103 		vm_map_unlock_read(map);
15104 		return KERN_SUCCESS;
15105 	}
15106 	default:
15107 		return KERN_INVALID_ARGUMENT;
15108 	}
15109 }
15110 
/*
 * Number of resident pages that "obj" contributes to a mapping spanning
 * "entry_size" pages, clamped to the mapping's size.  For an all-reusable
 * object only the wired pages count; otherwise reusable pages are
 * subtracted from the resident count.
 */
#define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
	MIN((entry_size),                                               \
	    ((obj)->all_reusable ?                                      \
	     (obj)->wired_page_count :                                  \
	     (obj)->resident_page_count - (obj)->reusable_page_count))
15116 
/*
 * vm_map_region_top_walk:
 *
 * Fill in a vm_region_top_info_t for "entry": classify the mapping's
 * share mode (SM_EMPTY / SM_COW / SM_LARGE_PAGE / SM_PRIVATE / SM_SHARED)
 * and count its private vs. shared resident pages by walking the entry's
 * VM object and its shadow chain.  The caller is expected to hold the
 * map lock; object locks are taken hand-over-hand here.
 */
void
vm_map_region_top_walk(
	vm_map_entry_t             entry,
	vm_region_top_info_t       top)
{
	/* Submaps and unbacked entries report as empty regions. */
	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
		top->share_mode = SM_EMPTY;
		top->ref_count = 0;
		top->obj_id = 0;
		return;
	}

	{
		struct  vm_object *obj, *tmp_obj;
		int             ref_count;
		uint32_t        entry_size;

		/* Size of the mapping in pages. */
		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);

		obj = VME_OBJECT(entry);

		vm_object_lock(obj);

		/*
		 * Discount the extra reference held by an in-progress
		 * paging operation so it doesn't look like a real sharer.
		 */
		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}

		assert(obj->reusable_page_count <= obj->resident_page_count);
		if (obj->shadow) {
			/*
			 * Shadowed object => copy-on-write mapping.  Pages in
			 * the top object are private if we're the only
			 * referencer, shared otherwise; everything further
			 * down the shadow chain is shared.
			 */
			if (ref_count == 1) {
				top->private_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			}
			top->ref_count  = ref_count;
			top->share_mode = SM_COW;

			/* Walk the shadow chain, locking child before unlocking parent. */
			while ((tmp_obj = obj->shadow)) {
				vm_object_lock(tmp_obj);
				vm_object_unlock(obj);
				obj = tmp_obj;

				if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
					ref_count--;
				}

				assert(obj->reusable_page_count <= obj->resident_page_count);
				top->shared_pages_resident +=
				    OBJ_RESIDENT_COUNT(obj, entry_size);
				/* -1: don't count the shadow reference we just followed. */
				top->ref_count += ref_count - 1;
			}
		} else {
			if (entry->superpage_size) {
				/* Superpage mappings are private by construction. */
				top->share_mode = SM_LARGE_PAGE;
				top->shared_pages_resident = 0;
				top->private_pages_resident = entry_size;
			} else if (entry->needs_copy) {
				/* Will shadow on first write: still COW. */
				top->share_mode = SM_COW;
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				/*
				 * ref_count == 2 with a named reference still
				 * counts as private: the namer isn't a mapper.
				 */
				if (ref_count == 1 ||
				    (ref_count == 2 && obj->named)) {
					top->share_mode = SM_PRIVATE;
					top->private_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				} else {
					top->share_mode = SM_SHARED;
					top->shared_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				}
			}
			top->ref_count = ref_count;
		}
		/* XXX K64: obj_id will be truncated */
		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRPERM(obj);

		vm_object_unlock(obj);
	}
}
15201 
/*
 * vm_map_region_walk:
 *
 * Accumulate extended region statistics for "entry" into "extended":
 * resident/dirtied/swapped/reusable page counts (when "look_for_pages"),
 * shadow chain depth, external pager presence, share mode and reference
 * counts.  "va"/"offset"/"range" describe the portion of the entry being
 * examined; "count" tells us whether the caller's buffer has room for
 * the pages_reusable field (VM_REGION_EXTENDED_INFO_COUNT vs. legacy).
 * Caller holds the map lock (read); object locks are taken here.
 */
void
vm_map_region_walk(
	vm_map_t                        map,
	vm_map_offset_t                 va,
	vm_map_entry_t                  entry,
	vm_object_offset_t              offset,
	vm_object_size_t                range,
	vm_region_extended_info_t       extended,
	boolean_t                       look_for_pages,
	mach_msg_type_number_t count)
{
	struct vm_object *obj, *tmp_obj;
	vm_map_offset_t       last_offset;
	int               i;
	int               ref_count;
	struct vm_object        *shadow_object;
	unsigned short          shadow_depth;
	boolean_t         do_region_footprint;
	int                     effective_page_size, effective_page_shift;
	vm_map_offset_t         effective_page_mask;

	/* Footprint mode: report per-page disposition from pmap/corpse data. */
	do_region_footprint = task_self_region_footprint();

	/* Submaps, unbacked entries and (non-superpage) physically
	 * contiguous objects are reported as empty. */
	if ((entry->is_sub_map) ||
	    (VME_OBJECT(entry) == 0) ||
	    (VME_OBJECT(entry)->phys_contiguous &&
	    !entry->superpage_size)) {
		extended->share_mode = SM_EMPTY;
		extended->ref_count = 0;
		return;
	}

	if (entry->superpage_size) {
		/* Superpages: fully resident, private, no shadow chain. */
		extended->shadow_depth = 0;
		extended->share_mode = SM_LARGE_PAGE;
		extended->ref_count = 1;
		extended->external_pager = 0;

		/* TODO4K: Superpage in 4k mode? */
		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
		extended->shadow_depth = 0;
		return;
	}

	/* Use the page size the caller's address space operates at. */
	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);
	effective_page_mask = effective_page_size - 1;

	offset = vm_map_trunc_page(offset, effective_page_mask);

	obj = VME_OBJECT(entry);

	vm_object_lock(obj);

	/* Discount the reference held by an in-progress paging operation. */
	if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
		ref_count--;
	}

	if (look_for_pages) {
		/* Visit each page of the requested range. */
		for (last_offset = offset + range;
		    offset < last_offset;
		    offset += effective_page_size, va += effective_page_size) {
			if (do_region_footprint) {
				int disp;

				disp = 0;
				if (map->has_corpse_footprint) {
					/*
					 * Query the page info data we saved
					 * while forking the corpse.
					 */
					vm_map_corpse_footprint_query_page_info(
						map,
						va,
						&disp);
				} else {
					/*
					 * Query the pmap.
					 */
					vm_map_footprint_query_page_info(
						map,
						entry,
						va,
						&disp);
				}
				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
					extended->pages_resident++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
					extended->pages_reusable++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
					extended->pages_dirtied++;
				}
				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
					extended->pages_swapped_out++;
				}
				continue;
			}

			/* Walks the shadow chain for this page; "obj" stays locked. */
			vm_map_region_look_for_page(map, va, obj,
			    vm_object_trunc_page(offset), ref_count,
			    0, extended, count);
		}

		if (do_region_footprint) {
			/* Footprint mode skipped the per-page shadow walk, so
			 * the object-level info still needs collecting. */
			goto collect_object_info;
		}
	} else {
collect_object_info:
		/*
		 * Compute shadow depth and external-pager presence by walking
		 * the shadow chain once, lock hand-over-hand.
		 */
		shadow_object = obj->shadow;
		shadow_depth = 0;

		if (!(obj->internal)) {
			extended->external_pager = 1;
		}

		if (shadow_object != VM_OBJECT_NULL) {
			vm_object_lock(shadow_object);
			for (;
			    shadow_object != VM_OBJECT_NULL;
			    shadow_depth++) {
				vm_object_t     next_shadow;

				if (!(shadow_object->internal)) {
					extended->external_pager = 1;
				}

				next_shadow = shadow_object->shadow;
				if (next_shadow) {
					vm_object_lock(next_shadow);
				}
				vm_object_unlock(shadow_object);
				shadow_object = next_shadow;
			}
		}
		extended->shadow_depth = shadow_depth;
	}

	/* Classify the share mode from shadowing / refcount / true_share. */
	if (extended->shadow_depth || entry->needs_copy) {
		extended->share_mode = SM_COW;
	} else {
		if (ref_count == 1) {
			extended->share_mode = SM_PRIVATE;
		} else {
			if (obj->true_share) {
				extended->share_mode = SM_TRUESHARED;
			} else {
				extended->share_mode = SM_SHARED;
			}
		}
	}
	/* Subtract the shadow references, then add back each shadow
	 * object's own (adjusted) refcount below. */
	extended->ref_count = ref_count - extended->shadow_depth;

	for (i = 0; i < extended->shadow_depth; i++) {
		if ((tmp_obj = obj->shadow) == 0) {
			break;
		}
		vm_object_lock(tmp_obj);
		vm_object_unlock(obj);

		if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
			ref_count--;
		}

		extended->ref_count += ref_count;
		obj = tmp_obj;
	}
	vm_object_unlock(obj);

	if (extended->share_mode == SM_SHARED) {
		/*
		 * Refine SM_SHARED: if all of the object's references come
		 * from this map, the sharing is really self-aliasing.
		 * NOTE(review): obj->ref_count is read here without the
		 * object lock — presumably acceptable for advisory info;
		 * confirm against other readers.
		 */
		vm_map_entry_t       cur;
		vm_map_entry_t       last;
		int      my_refs;

		obj = VME_OBJECT(entry);
		last = vm_map_to_entry(map);
		my_refs = 0;

		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}
		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
			my_refs += vm_map_region_count_obj_refs(cur, obj);
		}

		if (my_refs == ref_count) {
			extended->share_mode = SM_PRIVATE_ALIASED;
		} else if (my_refs > 1) {
			extended->share_mode = SM_SHARED_ALIASED;
		}
	}
}
15395 
15396 
15397 /* object is locked on entry and locked on return */
15398 
15399 
/*
 * vm_map_region_look_for_page:
 *
 * Account for one page at ("object", "offset") in the extended region
 * statistics: find the page in the object or its shadow chain, and bump
 * pages_resident / pages_dirtied / pages_reusable / pages_swapped_out /
 * pages_shared_now_private as appropriate, tracking the deepest shadow
 * level reached in extended->shadow_depth.
 *
 * "object" (== caller_object) is locked by the caller on entry and is
 * still locked on return; intermediate shadow objects are locked
 * hand-over-hand and unlocked before returning.
 */
static void
vm_map_region_look_for_page(
	__unused vm_map_t               map,
	__unused vm_map_offset_t        va,
	vm_object_t                     object,
	vm_object_offset_t              offset,
	int                             max_refcnt,
	unsigned short                  depth,
	vm_region_extended_info_t       extended,
	mach_msg_type_number_t count)
{
	vm_page_t       p;
	vm_object_t     shadow;
	int             ref_count;
	vm_object_t     caller_object;

	shadow = object->shadow;
	caller_object = object;


	while (TRUE) {
		if (!(object->internal)) {
			extended->external_pager = 1;
		}

		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
			/*
			 * Page found at this level.  If this level is
			 * shadowed and nothing else references the chain,
			 * a write would privatize it.
			 */
			if (shadow && (max_refcnt == 1)) {
				extended->pages_shared_now_private++;
			}

			if (!p->vmp_fictitious &&
			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
				extended->pages_dirtied++;
			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
				/* Caller's buffer has the pages_reusable field. */
				if (p->vmp_reusable || object->all_reusable) {
					extended->pages_reusable++;
				}
			}

			extended->pages_resident++;

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			return;
		}
		if (object->internal &&
		    object->alive &&
		    !object->terminating &&
		    object->pager_ready) {
			if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
			    == VM_EXTERNAL_STATE_EXISTS) {
				/* the pager has that page */
				extended->pages_swapped_out++;
				if (object != caller_object) {
					vm_object_unlock(object);
				}
				return;
			}
		}

		if (shadow) {
			/* Descend one level, locking child before unlocking parent. */
			vm_object_lock(shadow);

			if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
				ref_count--;
			}

			if (++depth > extended->shadow_depth) {
				extended->shadow_depth = depth;
			}

			if (ref_count > max_refcnt) {
				max_refcnt = ref_count;
			}

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			/* Translate the lookup offset into the shadow's space. */
			offset = offset + object->vo_shadow_offset;
			object = shadow;
			shadow = object->shadow;
			continue;
		}
		/* Bottom of the chain: page is absent everywhere. */
		if (object != caller_object) {
			vm_object_unlock(object);
		}
		break;
	}
}
15492 
15493 static int
vm_map_region_count_obj_refs(vm_map_entry_t entry,vm_object_t object)15494 vm_map_region_count_obj_refs(
15495 	vm_map_entry_t    entry,
15496 	vm_object_t       object)
15497 {
15498 	int ref_count;
15499 	vm_object_t chk_obj;
15500 	vm_object_t tmp_obj;
15501 
15502 	if (entry->is_sub_map || VME_OBJECT(entry) == VM_OBJECT_NULL) {
15503 		return 0;
15504 	}
15505 
15506 	ref_count = 0;
15507 	chk_obj = VME_OBJECT(entry);
15508 	vm_object_lock(chk_obj);
15509 
15510 	while (chk_obj) {
15511 		if (chk_obj == object) {
15512 			ref_count++;
15513 		}
15514 		tmp_obj = chk_obj->shadow;
15515 		if (tmp_obj) {
15516 			vm_object_lock(tmp_obj);
15517 		}
15518 		vm_object_unlock(chk_obj);
15519 
15520 		chk_obj = tmp_obj;
15521 	}
15522 
15523 	return ref_count;
15524 }
15525 
15526 
15527 /*
15528  *	Routine:	vm_map_simplify
15529  *
15530  *	Description:
15531  *		Attempt to simplify the map representation in
15532  *		the vicinity of the given starting address.
15533  *	Note:
15534  *		This routine is intended primarily to keep the
15535  *		kernel maps more compact -- they generally don't
15536  *		benefit from the "expand a map entry" technology
15537  *		at allocation time because the adjacent entry
15538  *		is often wired down.
15539  */
/*
 * vm_map_simplify_entry:
 *
 * Attempt to merge "this_entry" with its predecessor.  The two entries
 * are coalesced only if they are virtually contiguous, back the same
 * object at contiguous offsets, and agree on every attribute that could
 * make the merged mapping behave differently.  On success the previous
 * entry is unlinked and disposed, and "this_entry" is extended downward
 * to cover it.  Caller must hold the map lock for writing.
 */
void
vm_map_simplify_entry(
	vm_map_t        map,
	vm_map_entry_t  this_entry)
{
	vm_map_entry_t  prev_entry;

	prev_entry = this_entry->vme_prev;

	if ((this_entry != vm_map_to_entry(map)) &&
	    (prev_entry != vm_map_to_entry(map)) &&

	    /* virtually contiguous */
	    (prev_entry->vme_end == this_entry->vme_start) &&

	    /* same backing object, at contiguous offsets */
	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
	    prev_entry->vme_start))
	    == VME_OFFSET(this_entry)) &&

	    /* identical mapping attributes */
	    (prev_entry->behavior == this_entry->behavior) &&
	    (prev_entry->needs_copy == this_entry->needs_copy) &&
	    (prev_entry->protection == this_entry->protection) &&
	    (prev_entry->max_protection == this_entry->max_protection) &&
	    (prev_entry->inheritance == this_entry->inheritance) &&
	    (prev_entry->use_pmap == this_entry->use_pmap) &&
	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
	    (prev_entry->no_cache == this_entry->no_cache) &&
	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
	    (prev_entry->map_aligned == this_entry->map_aligned) &&
	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
	    (prev_entry->pmap_cs_associated == this_entry->pmap_cs_associated) &&
	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
	    (prev_entry->vme_resilient_codesign ==
	    this_entry->vme_resilient_codesign) &&
	    (prev_entry->vme_resilient_media ==
	    this_entry->vme_resilient_media) &&
	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&

	    /* identical wiring */
	    (prev_entry->wired_count == this_entry->wired_count) &&
	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&

	    /* neither entry in a state that forbids merging */
	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
	    (prev_entry->in_transition == FALSE) &&
	    (this_entry->in_transition == FALSE) &&
	    (prev_entry->needs_wakeup == FALSE) &&
	    (this_entry->needs_wakeup == FALSE) &&
	    (prev_entry->is_shared == this_entry->is_shared) &&
	    (prev_entry->superpage_size == FALSE) &&
	    (this_entry->superpage_size == FALSE)
	    ) {
		if (prev_entry->vme_permanent) {
			/* transfer "permanent" to the surviving entry */
			assert(this_entry->vme_permanent);
			prev_entry->vme_permanent = false;
		}
		vm_map_store_entry_unlink(map, prev_entry, true);
		assert(prev_entry->vme_start < this_entry->vme_end);
		if (prev_entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
			    VM_MAP_PAGE_MASK(map)));
		}
		/* grow this_entry downward over the merged range */
		this_entry->vme_start = prev_entry->vme_start;
		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));

		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, this_entry, TRUE);
		}

		/* drop the reference prev_entry held on its backing store */
		if (prev_entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(prev_entry));
		} else {
			vm_object_deallocate(VME_OBJECT(prev_entry));
		}
		vm_map_entry_dispose(prev_entry);
		SAVE_HINT_MAP_WRITE(map, this_entry);
	}
}
15619 
15620 void
vm_map_simplify(vm_map_t map,vm_map_offset_t start)15621 vm_map_simplify(
15622 	vm_map_t        map,
15623 	vm_map_offset_t start)
15624 {
15625 	vm_map_entry_t  this_entry;
15626 
15627 	vm_map_lock(map);
15628 	if (vm_map_lookup_entry(map, start, &this_entry)) {
15629 		vm_map_simplify_entry(map, this_entry);
15630 		vm_map_simplify_entry(map, this_entry->vme_next);
15631 	}
15632 	vm_map_unlock(map);
15633 }
15634 
15635 static void
vm_map_simplify_range(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15636 vm_map_simplify_range(
15637 	vm_map_t        map,
15638 	vm_map_offset_t start,
15639 	vm_map_offset_t end)
15640 {
15641 	vm_map_entry_t  entry;
15642 
15643 	/*
15644 	 * The map should be locked (for "write") by the caller.
15645 	 */
15646 
15647 	if (start >= end) {
15648 		/* invalid address range */
15649 		return;
15650 	}
15651 
15652 	start = vm_map_trunc_page(start,
15653 	    VM_MAP_PAGE_MASK(map));
15654 	end = vm_map_round_page(end,
15655 	    VM_MAP_PAGE_MASK(map));
15656 
15657 	if (!vm_map_lookup_entry(map, start, &entry)) {
15658 		/* "start" is not mapped and "entry" ends before "start" */
15659 		if (entry == vm_map_to_entry(map)) {
15660 			/* start with first entry in the map */
15661 			entry = vm_map_first_entry(map);
15662 		} else {
15663 			/* start with next entry */
15664 			entry = entry->vme_next;
15665 		}
15666 	}
15667 
15668 	while (entry != vm_map_to_entry(map) &&
15669 	    entry->vme_start <= end) {
15670 		/* try and coalesce "entry" with its previous entry */
15671 		vm_map_simplify_entry(map, entry);
15672 		entry = entry->vme_next;
15673 	}
15674 }
15675 
15676 
15677 /*
15678  *	Routine:	vm_map_machine_attribute
15679  *	Purpose:
15680  *		Provide machine-specific attributes to mappings,
15681  *		such as cachability etc. for machines that provide
15682  *		them.  NUMA architectures and machines with big/strange
15683  *		caches will use this.
15684  *	Note:
15685  *		Responsibilities for locking and checking are handled here,
15686  *		everything else in the pmap module. If any non-volatile
15687  *		information must be kept, the pmap module should handle
15688  *		it itself. [This assumes that attributes do not
15689  *		need to be inherited, which seems ok to me]
15690  */
/*
 * vm_map_machine_attribute:
 *
 * Apply a machine-specific attribute operation to [start, end) in "map".
 * Non-cache attributes are handed straight to the pmap layer; MATTR_CACHE
 * requires walking the entries, finding each resident physical page (down
 * the shadow chains) and syncing it individually.
 * "value" is IN/OUT per pmap_attribute() semantics.
 */
kern_return_t
vm_map_machine_attribute(
	vm_map_t                        map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_machine_attribute_t  attribute,
	vm_machine_attribute_val_t* value)              /* IN/OUT */
{
	kern_return_t   ret;
	vm_map_size_t sync_size;
	vm_map_entry_t entry;

	if (start < vm_map_min(map) || end > vm_map_max(map)) {
		return KERN_INVALID_ADDRESS;
	}

	/* Figure how much memory we need to flush (in page increments) */
	sync_size = end - start;

	vm_map_lock(map);

	if (attribute != MATTR_CACHE) {
		/* If we don't have to find physical addresses, we */
		/* don't have to do an explicit traversal here.    */
		ret = pmap_attribute(map->pmap, start, end - start,
		    attribute, value);
		vm_map_unlock(map);
		return ret;
	}

	ret = KERN_SUCCESS;                                                                             /* Assume it all worked */

	while (sync_size) {
		if (vm_map_lookup_entry(map, start, &entry)) {
			vm_map_size_t   sub_size;
			/* portion of this entry covered by the remaining range */
			if ((entry->vme_end - start) > sync_size) {
				sub_size = sync_size;
				sync_size = 0;
			} else {
				sub_size = entry->vme_end - start;
				sync_size -= sub_size;
			}
			if (entry->is_sub_map) {
				vm_map_offset_t sub_start;
				vm_map_offset_t sub_end;

				/* recurse into the submap's address space */
				sub_start = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				sub_end = sub_start + sub_size;
				/*
				 * NOTE(review): the recursive call's return
				 * value is discarded — a failure inside a
				 * submap is not reported to the caller.
				 * Confirm this is intentional.
				 */
				vm_map_machine_attribute(
					VME_SUBMAP(entry),
					sub_start,
					sub_end,
					attribute, value);
			} else if (VME_OBJECT(entry)) {
				vm_page_t               m;
				vm_object_t             object;
				vm_object_t             base_object;
				vm_object_t             last_object;
				vm_object_offset_t      offset;
				vm_object_offset_t      base_offset;
				vm_map_size_t           range;
				range = sub_size;
				offset = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				offset = vm_object_trunc_page(offset);
				base_offset = offset;
				object = VME_OBJECT(entry);
				base_object = object;
				last_object = NULL;

				vm_object_lock(object);

				/* sync each resident page of the covered range */
				while (range) {
					m = vm_page_lookup(
						object, offset);

					if (m && !m->vmp_fictitious) {
						ret =
						    pmap_attribute_cache_sync(
							VM_PAGE_GET_PHYS_PAGE(m),
							PAGE_SIZE,
							attribute, value);
					} else if (object->shadow) {
						/* page not here: look one shadow level down */
						offset = offset + object->vo_shadow_offset;
						last_object = object;
						object = object->shadow;
						vm_object_lock(last_object->shadow);
						vm_object_unlock(last_object);
						continue;
					}
					if (range < PAGE_SIZE) {
						range = 0;
					} else {
						range -= PAGE_SIZE;
					}

					/* back to the top object for the next page */
					if (base_object != object) {
						vm_object_unlock(object);
						vm_object_lock(base_object);
						object = base_object;
					}
					/* Bump to the next page */
					base_offset += PAGE_SIZE;
					offset = base_offset;
				}
				vm_object_unlock(object);
			}
			start += sub_size;
		} else {
			/* hole in the address range */
			vm_map_unlock(map);
			return KERN_FAILURE;
		}
	}

	vm_map_unlock(map);

	return ret;
}
15810 
15811 /*
15812  *	vm_map_behavior_set:
15813  *
15814  *	Sets the paging reference behavior of the specified address
15815  *	range in the target map.  Paging reference behavior affects
15816  *	how pagein operations resulting from faults on the map will be
15817  *	clustered.
15818  */
/*
 * vm_map_behavior_set:
 *
 * Set the paging reference behavior for [start, end) in "map".
 * Persistent behaviors (DEFAULT/RANDOM/SEQUENTIAL/RSEQNTL/
 * ZERO_WIRED_PAGES) are recorded on the clipped map entries; the
 * madvise-style behaviors (WILLNEED/DONTNEED/FREE/REUSABLE/REUSE/
 * CAN_REUSE/PAGEOUT) trigger an immediate operation instead.
 * Returns KERN_NO_SPACE for an out-of-range request,
 * KERN_INVALID_ADDRESS if the range contains a hole.
 */
kern_return_t
vm_map_behavior_set(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_behavior_t   new_behavior)
{
	vm_map_entry_t  entry;
	vm_map_entry_t  temp_entry;

	if (start > end ||
	    start < vm_map_min(map) ||
	    end > vm_map_max(map)) {
		return KERN_NO_SPACE;
	}

	switch (new_behavior) {
	/*
	 * This first block of behaviors all set a persistent state on the specified
	 * memory range.  All we have to do here is to record the desired behavior
	 * in the vm_map_entry_t's.
	 */

	case VM_BEHAVIOR_DEFAULT:
	case VM_BEHAVIOR_RANDOM:
	case VM_BEHAVIOR_SEQUENTIAL:
	case VM_BEHAVIOR_RSEQNTL:
	case VM_BEHAVIOR_ZERO_WIRED_PAGES:
		vm_map_lock(map);

		/*
		 *	The entire address range must be valid for the map.
		 *      Note that vm_map_range_check() does a
		 *	vm_map_lookup_entry() internally and returns the
		 *	entry containing the start of the address range if
		 *	the entire range is valid.
		 */
		if (vm_map_range_check(map, start, end, &temp_entry)) {
			entry = temp_entry;
			vm_map_clip_start(map, entry, start);
		} else {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		/* stamp the behavior on every entry overlapping the range */
		while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
			vm_map_clip_end(map, entry, end);
			if (entry->is_sub_map) {
				assert(!entry->use_pmap);
			}

			if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
				/* recorded as a separate flag, not a behavior code */
				entry->zero_wired_pages = TRUE;
			} else {
				entry->behavior = new_behavior;
			}
			entry = entry->vme_next;
		}

		vm_map_unlock(map);
		break;

	/*
	 * The rest of these are different from the above in that they cause
	 * an immediate action to take place as opposed to setting a behavior that
	 * affects future actions.
	 */

	case VM_BEHAVIOR_WILLNEED:
		return vm_map_willneed(map, start, end);

	case VM_BEHAVIOR_DONTNEED:
		return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);

	case VM_BEHAVIOR_FREE:
		return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);

	case VM_BEHAVIOR_REUSABLE:
		return vm_map_reusable_pages(map, start, end);

	case VM_BEHAVIOR_REUSE:
		return vm_map_reuse_pages(map, start, end);

	case VM_BEHAVIOR_CAN_REUSE:
		return vm_map_can_reuse(map, start, end);

#if MACH_ASSERT
	case VM_BEHAVIOR_PAGEOUT:
		return vm_map_pageout(map, start, end);
#endif /* MACH_ASSERT */

	default:
		return KERN_INVALID_ARGUMENT;
	}

	return KERN_SUCCESS;
}
15916 
15917 
15918 /*
15919  * Internals for madvise(MADV_WILLNEED) system call.
15920  *
15921  * The implementation is to do:-
15922  * a) read-ahead if the mapping corresponds to a mapped regular file
15923  * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping
15924  */
15925 
15926 
/*
 * vm_map_willneed:
 *
 * Implementation of madvise(MADV_WILLNEED) for [start, end) in "map":
 * pre-fault anonymous memory, or issue asynchronous read-ahead via the
 * pager for file-backed mappings.  The range must be fully mapped with
 * no holes (checked both up front and after each time the map lock is
 * dropped and reacquired).  I/O failures are swallowed: madvise is
 * advisory and still reports success.
 */
static kern_return_t
vm_map_willneed(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end
	)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	memory_object_t                 pager;
	struct vm_object_fault_info     fault_info = {};
	kern_return_t                   kr;
	vm_object_size_t                len;
	vm_object_offset_t              offset;

	fault_info.interruptible = THREAD_UNINT;        /* ignored value */
	fault_info.behavior      = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.stealth       = TRUE;        /* read-ahead shouldn't disturb LRU state */

	/*
	 * The MADV_WILLNEED operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && start < end;) {
		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.  After that, the offset will always be zero to
		 * correspond to the beginning of the current vm_map_entry.
		 */
		offset = (start - entry->vme_start) + VME_OFFSET(entry);

		/*
		 * Set the length so we don't go beyond the end of the
		 * map_entry or beyond the end of the range we were given.
		 * This range could span also multiple map entries all of which
		 * map different files, so make sure we only do the right amount
		 * of I/O for each object.  Note that it's possible for there
		 * to be multiple map entries all referring to the same object
		 * but with different page permissions, but it's not worth
		 * trying to optimize that case.
		 */
		len = MIN(entry->vme_end - start, end - start);

		if ((vm_size_t) len != len) {
			/* 32-bit overflow */
			len = (vm_size_t) (0 - PAGE_SIZE);
		}
		fault_info.cluster_size = (vm_size_t) len;
		fault_info.lo_offset    = offset;
		fault_info.hi_offset    = offset + len;
		fault_info.user_tag     = VME_ALIAS(entry);
		fault_info.pmap_options = 0;
		if (entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap)) {
			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
		}

		/*
		 * If the entry is a submap OR there's no read permission
		 * to this mapping, then just skip it.
		 */
		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
			entry = entry->vme_next;
			start = entry->vme_start;
			continue;
		}

		object = VME_OBJECT(entry);

		if (object == NULL ||
		    (object && object->internal)) {
			/*
			 * Memory range backed by anonymous memory.
			 * Fault the pages in directly (zero-fill /
			 * decompress), one effective page at a time.
			 */
			vm_size_t region_size = 0, effective_page_size = 0;
			vm_map_offset_t addr = 0, effective_page_mask = 0;

			region_size = len;
			addr = start;

			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
			effective_page_size = effective_page_mask + 1;

			/* drop the map lock: vm_pre_fault takes its own locks */
			vm_map_unlock_read(map);

			while (region_size) {
				vm_pre_fault(
					vm_map_trunc_page(addr, effective_page_mask),
					VM_PROT_READ | VM_PROT_WRITE);

				region_size -= effective_page_size;
				addr += effective_page_size;
			}
		} else {
			/*
			 * Find the file object backing this map entry.  If there is
			 * none, then we simply ignore the "will need" advice for this
			 * entry and go on to the next one.
			 */
			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
				entry = entry->vme_next;
				start = entry->vme_start;
				continue;
			}

			/* keep the pager alive while we issue the request */
			vm_object_paging_begin(object);
			pager = object->pager;
			vm_object_unlock(object);

			/*
			 * The data_request() could take a long time, so let's
			 * release the map lock to avoid blocking other threads.
			 */
			vm_map_unlock_read(map);

			/*
			 * Get the data from the object asynchronously.
			 *
			 * Note that memory_object_data_request() places limits on the
			 * amount of I/O it will do.  Regardless of the len we
			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
			 * silently truncates the len to that size.  This isn't
			 * necessarily bad since madvise shouldn't really be used to
			 * page in unlimited amounts of data.  Other Unix variants
			 * limit the willneed case as well.  If this turns out to be an
			 * issue for developers, then we can always adjust the policy
			 * here and still be backwards compatible since this is all
			 * just "advice".
			 */
			kr = memory_object_data_request(
				pager,
				vm_object_trunc_page(offset) + object->paging_offset,
				0,      /* ignored */
				VM_PROT_READ,
				(memory_object_fault_info_t)&fault_info);

			vm_object_lock(object);
			vm_object_paging_end(object);
			vm_object_unlock(object);

			/*
			 * If we couldn't do the I/O for some reason, just give up on
			 * the madvise.  We still return success to the user since
			 * madvise isn't supposed to fail when the advice can't be
			 * taken.
			 */

			if (kr != KERN_SUCCESS) {
				return KERN_SUCCESS;
			}
		}

		start += len;
		if (start >= end) {
			/* done */
			return KERN_SUCCESS;
		}

		/* look up next entry; the map lock was dropped above, so
		 * the range may have changed underneath us */
		vm_map_lock_read(map);
		if (!vm_map_lookup_entry(map, start, &entry)) {
			/*
			 * There's a new hole in the address range.
			 */
			vm_map_unlock_read(map);
			return KERN_INVALID_ADDRESS;
		}
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
16117 
16118 static boolean_t
vm_map_entry_is_reusable(vm_map_entry_t entry)16119 vm_map_entry_is_reusable(
16120 	vm_map_entry_t entry)
16121 {
16122 	/* Only user map entries */
16123 
16124 	vm_object_t object;
16125 
16126 	if (entry->is_sub_map) {
16127 		return FALSE;
16128 	}
16129 
16130 	switch (VME_ALIAS(entry)) {
16131 	case VM_MEMORY_MALLOC:
16132 	case VM_MEMORY_MALLOC_SMALL:
16133 	case VM_MEMORY_MALLOC_LARGE:
16134 	case VM_MEMORY_REALLOC:
16135 	case VM_MEMORY_MALLOC_TINY:
16136 	case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16137 	case VM_MEMORY_MALLOC_LARGE_REUSED:
16138 		/*
16139 		 * This is a malloc() memory region: check if it's still
16140 		 * in its original state and can be re-used for more
16141 		 * malloc() allocations.
16142 		 */
16143 		break;
16144 	default:
16145 		/*
16146 		 * Not a malloc() memory region: let the caller decide if
16147 		 * it's re-usable.
16148 		 */
16149 		return TRUE;
16150 	}
16151 
16152 	if (/*entry->is_shared ||*/
16153 		entry->is_sub_map ||
16154 		entry->in_transition ||
16155 		entry->protection != VM_PROT_DEFAULT ||
16156 		entry->max_protection != VM_PROT_ALL ||
16157 		entry->inheritance != VM_INHERIT_DEFAULT ||
16158 		entry->no_cache ||
16159 		entry->vme_permanent ||
16160 		entry->superpage_size != FALSE ||
16161 		entry->zero_wired_pages ||
16162 		entry->wired_count != 0 ||
16163 		entry->user_wired_count != 0) {
16164 		return FALSE;
16165 	}
16166 
16167 	object = VME_OBJECT(entry);
16168 	if (object == VM_OBJECT_NULL) {
16169 		return TRUE;
16170 	}
16171 	if (
16172 #if 0
16173 		/*
16174 		 * Let's proceed even if the VM object is potentially
16175 		 * shared.
16176 		 * We check for this later when processing the actual
16177 		 * VM pages, so the contents will be safe if shared.
16178 		 *
16179 		 * But we can still mark this memory region as "reusable" to
16180 		 * acknowledge that the caller did let us know that the memory
16181 		 * could be re-used and should not be penalized for holding
16182 		 * on to it.  This allows its "resident size" to not include
16183 		 * the reusable range.
16184 		 */
16185 		object->ref_count == 1 &&
16186 #endif
16187 		object->wired_page_count == 0 &&
16188 		object->copy == VM_OBJECT_NULL &&
16189 		object->shadow == VM_OBJECT_NULL &&
16190 		object->internal &&
16191 		object->purgable == VM_PURGABLE_DENY &&
16192 		object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16193 		!object->code_signed) {
16194 		return TRUE;
16195 	}
16196 	return FALSE;
16197 }
16198 
/*
 *	Routine:	vm_map_reuse_pages
 *
 *	Description:	Handle the MADV_REUSE advice for the range
 *		[start, end) of "map": tell the VM that previously "reusable"
 *		pages in the range are being used again, by calling
 *		vm_object_reuse_pages() on each backing object and flipping
 *		the MALLOC_LARGE_REUSABLE alias back to MALLOC_LARGE_REUSED.
 *
 *		The range must be fully allocated (no holes) and every entry
 *		must pass vm_map_entry_is_reusable(), otherwise
 *		KERN_INVALID_ADDRESS is returned.
 *
 *		Only the map's read lock is taken; no vm_map_entry_t fields
 *		other than the (non-critical) alias are modified.
 */
static kern_return_t
vm_map_reuse_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	vm_object_offset_t              start_offset, end_offset;

	/*
	 * The MADV_REUSE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a
		 * portion of a native page.
		 */
		return KERN_SUCCESS;
	}

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap);       /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reuse_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reuse_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
		} else {
			start_offset = 0;
		}
		/* Clip the end to this entry, then convert to object offsets. */
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object != VM_OBJECT_NULL) {
			/* Mark the covered pages as re-used in the object. */
			vm_object_lock(object);
			vm_object_reuse_pages(object, start_offset, end_offset,
			    TRUE);
			vm_object_unlock(object);
		}

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reuse_pages_success++;
	return KERN_SUCCESS;
}
16291 
16292 
/*
 *	Routine:	vm_map_reusable_pages
 *
 *	Description:	Handle the MADV_REUSABLE advice for the range
 *		[start, end) of "map": mark the pages backing the range as
 *		"reusable" so the VM may reclaim them cheaply, via
 *		vm_object_deactivate_pages(..., reusable_pages=TRUE).
 *
 *		The range must be fully allocated (no holes), every entry must
 *		pass vm_map_entry_is_reusable(), and every entry must be
 *		writable (or used_for_jit) since the contents may be discarded.
 *		Entries whose object is shared in a way we can't account for
 *		are skipped (only a stat is bumped).  MALLOC_LARGE aliases are
 *		flipped to MALLOC_LARGE_REUSABLE to record the state change.
 *
 *	Returns:	KERN_SUCCESS, KERN_INVALID_ADDRESS on holes or
 *		non-reusable entries, KERN_PROTECTION_FAILURE on
 *		non-writable entries.
 */
static kern_return_t
vm_map_reusable_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	vm_object_offset_t              start_offset, end_offset;
	vm_map_offset_t                 pmap_offset;

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a portion
		 * of a native page.
		 */
		return KERN_SUCCESS;
	}

	/*
	 * The MADV_REUSABLE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap);       /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reusable_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/* 1: deactivate+discard, -1: can't discard (shared object) */
		int kill_pages = 0;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit) {
			/* not writable: can't discard contents */
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_nonwritable++;
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
			pmap_offset = start;
		} else {
			start_offset = 0;
			pmap_offset = entry->vme_start;
		}
		/* Clip the end to this entry, then convert to object offsets. */
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			/* nothing mapped here yet: nothing to make reusable */
			continue;
		}


		vm_object_lock(object);
		/*
		 * Pages can only be discarded (kill_pages = 1) when the
		 * object's contents won't be needed by another mapping:
		 * sole reference, or a non-symmetric copy strategy with no
		 * pending copy object, and no shadow chain.
		 */
		if (((object->ref_count == 1) ||
		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
		    object->copy == VM_OBJECT_NULL)) &&
		    object->shadow == VM_OBJECT_NULL &&
		    /*
		     * "iokit_acct" entries are billed for their virtual size
		     * (rather than for their resident pages only), so they
		     * wouldn't benefit from making pages reusable, and it
		     * would be hard to keep track of pages that are both
		     * "iokit_acct" and "reusable" in the pmap stats and
		     * ledgers.
		     */
		    !(entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap))) {
			if (object->ref_count != 1) {
				vm_page_stats_reusable.reusable_shared++;
			}
			kill_pages = 1;
		} else {
			kill_pages = -1;
		}
		if (kill_pages != -1) {
			vm_object_deactivate_pages(object,
			    start_offset,
			    end_offset - start_offset,
			    kill_pages,
			    TRUE /*reusable_pages*/,
			    map->pmap,
			    pmap_offset);
		} else {
			/* shared object: skip it, just record the event */
			vm_page_stats_reusable.reusable_pages_shared++;
			DTRACE_VM4(vm_map_reusable_pages_shared,
			    unsigned int, VME_ALIAS(entry),
			    vm_map_t, map,
			    vm_map_entry_t, entry,
			    vm_object_t, object);
		}
		vm_object_unlock(object);

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reusable_pages_success++;
	return KERN_SUCCESS;
}
16437 
16438 
16439 static kern_return_t
vm_map_can_reuse(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16440 vm_map_can_reuse(
16441 	vm_map_t        map,
16442 	vm_map_offset_t start,
16443 	vm_map_offset_t end)
16444 {
16445 	vm_map_entry_t                  entry;
16446 
16447 	/*
16448 	 * The MADV_REUSABLE operation doesn't require any changes to the
16449 	 * vm_map_entry_t's, so the read lock is sufficient.
16450 	 */
16451 
16452 	vm_map_lock_read(map);
16453 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16454 
16455 	/*
16456 	 * The madvise semantics require that the address range be fully
16457 	 * allocated with no holes.  Otherwise, we're required to return
16458 	 * an error.
16459 	 */
16460 
16461 	if (!vm_map_range_check(map, start, end, &entry)) {
16462 		vm_map_unlock_read(map);
16463 		vm_page_stats_reusable.can_reuse_failure++;
16464 		return KERN_INVALID_ADDRESS;
16465 	}
16466 
16467 	/*
16468 	 * Examine each vm_map_entry_t in the range.
16469 	 */
16470 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16471 	    entry = entry->vme_next) {
16472 		/*
16473 		 * Sanity check on the VM map entry.
16474 		 */
16475 		if (!vm_map_entry_is_reusable(entry)) {
16476 			vm_map_unlock_read(map);
16477 			vm_page_stats_reusable.can_reuse_failure++;
16478 			return KERN_INVALID_ADDRESS;
16479 		}
16480 	}
16481 
16482 	vm_map_unlock_read(map);
16483 	vm_page_stats_reusable.can_reuse_success++;
16484 	return KERN_SUCCESS;
16485 }
16486 
16487 
#if MACH_ASSERT
/*
 *	Routine:	vm_map_pageout
 *
 *	Description:	Handle the MADV_PAGEOUT advice (MACH_ASSERT kernels
 *		only) for the range [start, end) of "map": call
 *		vm_object_pageout() on the internal (anonymous) object backing
 *		each entry in the range.  File-backed (non-internal) objects
 *		and entries with no object are silently skipped.  For submap
 *		entries, the corresponding range of the submap is checked and
 *		paged out one level deep; nested submap entries are skipped.
 *
 *	Returns:	KERN_SUCCESS, or KERN_INVALID_ADDRESS if the range
 *		(or a submap's corresponding range) has holes.
 */
static kern_return_t
vm_map_pageout(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;

	/*
	 * The MADV_PAGEOUT operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		vm_object_t     object;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (entry->is_sub_map) {
			vm_map_t submap;
			vm_map_offset_t submap_start;
			vm_map_offset_t submap_end;
			vm_map_entry_t submap_entry;

			/* compute the submap range covered by this entry */
			submap = VME_SUBMAP(entry);
			submap_start = VME_OFFSET(entry);
			submap_end = submap_start + (entry->vme_end -
			    entry->vme_start);

			vm_map_lock_read(submap);

			if (!vm_map_range_check(submap,
			    submap_start,
			    submap_end,
			    &submap_entry)) {
				vm_map_unlock_read(submap);
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}

			if (submap_entry->is_sub_map) {
				/* don't recurse into nested submaps */
				vm_map_unlock_read(submap);
				continue;
			}

			object = VME_OBJECT(submap_entry);
			if (object == VM_OBJECT_NULL || !object->internal) {
				/* no anonymous memory to page out here */
				vm_map_unlock_read(submap);
				continue;
			}

			vm_object_pageout(object);

			vm_map_unlock_read(submap);
			submap = VM_MAP_NULL;
			submap_entry = VM_MAP_ENTRY_NULL;
			continue;
		}

		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL || !object->internal) {
			/* no anonymous memory to page out here */
			continue;
		}

		vm_object_pageout(object);
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
#endif /* MACH_ASSERT */
16578 
16579 
16580 /*
16581  *	Routine:	vm_map_entry_insert
16582  *
16583  *	Description:	This routine inserts a new vm_entry in a locked map.
16584  */
16585 static vm_map_entry_t
vm_map_entry_insert(vm_map_t map,vm_map_entry_t insp_entry,vm_map_offset_t start,vm_map_offset_t end,vm_object_t object,vm_object_offset_t offset,vm_map_kernel_flags_t vmk_flags,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,boolean_t no_cache,boolean_t permanent,unsigned int superpage_size,boolean_t clear_map_aligned,int alias)16586 vm_map_entry_insert(
16587 	vm_map_t                map,
16588 	vm_map_entry_t          insp_entry,
16589 	vm_map_offset_t         start,
16590 	vm_map_offset_t         end,
16591 	vm_object_t             object,
16592 	vm_object_offset_t      offset,
16593 	vm_map_kernel_flags_t   vmk_flags,
16594 	boolean_t               needs_copy,
16595 	vm_prot_t               cur_protection,
16596 	vm_prot_t               max_protection,
16597 	vm_inherit_t            inheritance,
16598 	boolean_t               no_cache,
16599 	boolean_t               permanent,
16600 	unsigned int            superpage_size,
16601 	boolean_t               clear_map_aligned,
16602 	int                     alias)
16603 {
16604 	vm_map_entry_t  new_entry;
16605 	boolean_t map_aligned = FALSE;
16606 
16607 	assert(insp_entry != (vm_map_entry_t)0);
16608 	vm_map_lock_assert_exclusive(map);
16609 
16610 #if DEVELOPMENT || DEBUG
16611 	vm_object_offset_t      end_offset = 0;
16612 	assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
16613 #endif /* DEVELOPMENT || DEBUG */
16614 
16615 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
16616 		map_aligned = TRUE;
16617 	}
16618 	if (clear_map_aligned &&
16619 	    (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
16620 	    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
16621 		map_aligned = FALSE;
16622 	}
16623 	if (map_aligned) {
16624 		assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
16625 		assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
16626 	} else {
16627 		assert(page_aligned(start));
16628 		assert(page_aligned(end));
16629 	}
16630 	assert(start < end);
16631 
16632 	new_entry = vm_map_entry_create(map);
16633 
16634 	new_entry->vme_start = start;
16635 	new_entry->vme_end = end;
16636 
16637 	if (vmk_flags.vmkf_submap) {
16638 		new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
16639 		VME_SUBMAP_SET(new_entry, (vm_map_t)object);
16640 	} else {
16641 		VME_OBJECT_SET(new_entry, object, false, 0);
16642 	}
16643 	VME_OFFSET_SET(new_entry, offset);
16644 	VME_ALIAS_SET(new_entry, alias);
16645 
16646 	new_entry->map_aligned = map_aligned;
16647 	new_entry->needs_copy = needs_copy;
16648 	new_entry->inheritance = inheritance;
16649 	new_entry->protection = cur_protection;
16650 	new_entry->max_protection = max_protection;
16651 	/*
16652 	 * submap: "use_pmap" means "nested".
16653 	 * default: false.
16654 	 *
16655 	 * object: "use_pmap" means "use pmap accounting" for footprint.
16656 	 * default: true.
16657 	 */
16658 	new_entry->use_pmap = !vmk_flags.vmkf_submap;
16659 	new_entry->no_cache = no_cache;
16660 	new_entry->vme_permanent = permanent;
16661 	new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
16662 	new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
16663 	new_entry->superpage_size = (superpage_size != 0);
16664 
16665 	if (vmk_flags.vmkf_map_jit) {
16666 		if (!(map->jit_entry_exists) ||
16667 		    VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
16668 			new_entry->used_for_jit = TRUE;
16669 			map->jit_entry_exists = TRUE;
16670 		}
16671 	}
16672 
16673 	/*
16674 	 *	Insert the new entry into the list.
16675 	 */
16676 
16677 	vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
16678 	map->size += end - start;
16679 
16680 	/*
16681 	 *	Update the free space hint and the lookup hint.
16682 	 */
16683 
16684 	SAVE_HINT_MAP_WRITE(map, new_entry);
16685 	return new_entry;
16686 }
16687 
16688 /*
16689  *	Routine:	vm_map_remap_extract
16690  *
16691  *	Description:	This routine returns a vm_entry list from a map.
16692  */
16693 static kern_return_t
vm_map_remap_extract(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size,boolean_t copy,struct vm_map_header * map_header,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)16694 vm_map_remap_extract(
16695 	vm_map_t                map,
16696 	vm_map_offset_t         addr,
16697 	vm_map_size_t           size,
16698 	boolean_t               copy,
16699 	struct vm_map_header    *map_header,
16700 	vm_prot_t               *cur_protection,   /* IN/OUT */
16701 	vm_prot_t               *max_protection,   /* IN/OUT */
16702 	/* What, no behavior? */
16703 	vm_inherit_t            inheritance,
16704 	vm_map_kernel_flags_t   vmk_flags)
16705 {
16706 	kern_return_t           result;
16707 	vm_map_size_t           mapped_size;
16708 	vm_map_size_t           tmp_size;
16709 	vm_map_entry_t          src_entry;     /* result of last map lookup */
16710 	vm_map_entry_t          new_entry;
16711 	vm_object_offset_t      offset;
16712 	vm_map_offset_t         map_address;
16713 	vm_map_offset_t         src_start;     /* start of entry to map */
16714 	vm_map_offset_t         src_end;       /* end of region to be mapped */
16715 	vm_object_t             object;
16716 	vm_map_version_t        version;
16717 	boolean_t               src_needs_copy;
16718 	boolean_t               new_entry_needs_copy;
16719 	vm_map_entry_t          saved_src_entry;
16720 	boolean_t               src_entry_was_wired;
16721 	vm_prot_t               max_prot_for_prot_copy;
16722 	vm_map_offset_t         effective_page_mask;
16723 	boolean_t               pageable, same_map;
16724 	boolean_t               vm_remap_legacy;
16725 	vm_prot_t               required_cur_prot, required_max_prot;
16726 	vm_object_t             new_copy_object;     /* vm_object_copy_* result */
16727 	boolean_t               saved_used_for_jit;     /* Saved used_for_jit. */
16728 
16729 	pageable = vmk_flags.vmkf_copy_pageable;
16730 	same_map = vmk_flags.vmkf_copy_same_map;
16731 
16732 	effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
16733 
16734 	assert(map != VM_MAP_NULL);
16735 	assert(size != 0);
16736 	assert(size == vm_map_round_page(size, effective_page_mask));
16737 	assert(inheritance == VM_INHERIT_NONE ||
16738 	    inheritance == VM_INHERIT_COPY ||
16739 	    inheritance == VM_INHERIT_SHARE);
16740 	assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
16741 	assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
16742 	assert((*cur_protection & *max_protection) == *cur_protection);
16743 
16744 	/*
16745 	 *	Compute start and end of region.
16746 	 */
16747 	src_start = vm_map_trunc_page(addr, effective_page_mask);
16748 	src_end = vm_map_round_page(src_start + size, effective_page_mask);
16749 
16750 	/*
16751 	 *	Initialize map_header.
16752 	 */
16753 	map_header->links.next = CAST_TO_VM_MAP_ENTRY(&map_header->links);
16754 	map_header->links.prev = CAST_TO_VM_MAP_ENTRY(&map_header->links);
16755 	map_header->nentries = 0;
16756 	map_header->entries_pageable = pageable;
16757 //	map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
16758 	map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
16759 	map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
16760 
16761 	vm_map_store_init( map_header );
16762 
16763 	if (copy && vmk_flags.vmkf_remap_prot_copy) {
16764 		/*
16765 		 * Special case for vm_map_protect(VM_PROT_COPY):
16766 		 * we want to set the new mappings' max protection to the
16767 		 * specified *max_protection...
16768 		 */
16769 		max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
16770 		/* ... but we want to use the vm_remap() legacy mode */
16771 		*max_protection = VM_PROT_NONE;
16772 		*cur_protection = VM_PROT_NONE;
16773 	} else {
16774 		max_prot_for_prot_copy = VM_PROT_NONE;
16775 	}
16776 
16777 	if (*cur_protection == VM_PROT_NONE &&
16778 	    *max_protection == VM_PROT_NONE) {
16779 		/*
16780 		 * vm_remap() legacy mode:
16781 		 * Extract all memory regions in the specified range and
16782 		 * collect the strictest set of protections allowed on the
16783 		 * entire range, so the caller knows what they can do with
16784 		 * the remapped range.
16785 		 * We start with VM_PROT_ALL and we'll remove the protections
16786 		 * missing from each memory region.
16787 		 */
16788 		vm_remap_legacy = TRUE;
16789 		*cur_protection = VM_PROT_ALL;
16790 		*max_protection = VM_PROT_ALL;
16791 		required_cur_prot = VM_PROT_NONE;
16792 		required_max_prot = VM_PROT_NONE;
16793 	} else {
16794 		/*
16795 		 * vm_remap_new() mode:
16796 		 * Extract all memory regions in the specified range and
16797 		 * ensure that they have at least the protections specified
16798 		 * by the caller via *cur_protection and *max_protection.
16799 		 * The resulting mapping should have these protections.
16800 		 */
16801 		vm_remap_legacy = FALSE;
16802 		if (copy) {
16803 			required_cur_prot = VM_PROT_NONE;
16804 			required_max_prot = VM_PROT_READ;
16805 		} else {
16806 			required_cur_prot = *cur_protection;
16807 			required_max_prot = *max_protection;
16808 		}
16809 	}
16810 
16811 	map_address = 0;
16812 	mapped_size = 0;
16813 	result = KERN_SUCCESS;
16814 
16815 	/*
16816 	 *	The specified source virtual space might correspond to
16817 	 *	multiple map entries, need to loop on them.
16818 	 */
16819 	vm_map_lock(map);
16820 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16821 		/*
16822 		 * This address space uses sub-pages so the range might
16823 		 * not be re-mappable in an address space with larger
16824 		 * pages. Re-assemble any broken-up VM map entries to
16825 		 * improve our chances of making it work.
16826 		 */
16827 		vm_map_simplify_range(map, src_start, src_end);
16828 	}
16829 	while (mapped_size != size) {
16830 		vm_map_size_t   entry_size;
16831 
16832 		/*
16833 		 *	Find the beginning of the region.
16834 		 */
16835 		if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
16836 			result = KERN_INVALID_ADDRESS;
16837 			break;
16838 		}
16839 
16840 		if (src_start < src_entry->vme_start ||
16841 		    (mapped_size && src_start != src_entry->vme_start)) {
16842 			result = KERN_INVALID_ADDRESS;
16843 			break;
16844 		}
16845 
16846 		tmp_size = size - mapped_size;
16847 		if (src_end > src_entry->vme_end) {
16848 			tmp_size -= (src_end - src_entry->vme_end);
16849 		}
16850 
16851 		entry_size = (vm_map_size_t)(src_entry->vme_end -
16852 		    src_entry->vme_start);
16853 
16854 		if (src_entry->is_sub_map &&
16855 		    vmk_flags.vmkf_copy_single_object) {
16856 			vm_map_t submap;
16857 			vm_map_offset_t submap_start;
16858 			vm_map_size_t submap_size;
16859 			boolean_t submap_needs_copy;
16860 
16861 			/*
16862 			 * No check for "required protection" on "src_entry"
16863 			 * because the protections that matter are the ones
16864 			 * on the submap's VM map entry, which will be checked
16865 			 * during the call to vm_map_remap_extract() below.
16866 			 */
16867 			submap_size = src_entry->vme_end - src_start;
16868 			if (submap_size > size) {
16869 				submap_size = size;
16870 			}
16871 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
16872 			submap = VME_SUBMAP(src_entry);
16873 			if (copy) {
16874 				/*
16875 				 * The caller wants a copy-on-write re-mapping,
16876 				 * so let's extract from the submap accordingly.
16877 				 */
16878 				submap_needs_copy = TRUE;
16879 			} else if (src_entry->needs_copy) {
16880 				/*
16881 				 * The caller wants a shared re-mapping but the
16882 				 * submap is mapped with "needs_copy", so its
16883 				 * contents can't be shared as is. Extract the
16884 				 * contents of the submap as "copy-on-write".
16885 				 * The re-mapping won't be shared with the
16886 				 * original mapping but this is equivalent to
16887 				 * what happened with the original "remap from
16888 				 * submap" code.
16889 				 * The shared region is mapped "needs_copy", for
16890 				 * example.
16891 				 */
16892 				submap_needs_copy = TRUE;
16893 			} else {
16894 				/*
16895 				 * The caller wants a shared re-mapping and
16896 				 * this mapping can be shared (no "needs_copy"),
16897 				 * so let's extract from the submap accordingly.
16898 				 * Kernel submaps are mapped without
16899 				 * "needs_copy", for example.
16900 				 */
16901 				submap_needs_copy = FALSE;
16902 			}
16903 			vm_map_reference(submap);
16904 			vm_map_unlock(map);
16905 			src_entry = NULL;
16906 			if (vm_remap_legacy) {
16907 				*cur_protection = VM_PROT_NONE;
16908 				*max_protection = VM_PROT_NONE;
16909 			}
16910 
16911 			DTRACE_VM7(remap_submap_recurse,
16912 			    vm_map_t, map,
16913 			    vm_map_offset_t, addr,
16914 			    vm_map_size_t, size,
16915 			    boolean_t, copy,
16916 			    vm_map_offset_t, submap_start,
16917 			    vm_map_size_t, submap_size,
16918 			    boolean_t, submap_needs_copy);
16919 
16920 			result = vm_map_remap_extract(submap,
16921 			    submap_start,
16922 			    submap_size,
16923 			    submap_needs_copy,
16924 			    map_header,
16925 			    cur_protection,
16926 			    max_protection,
16927 			    inheritance,
16928 			    vmk_flags);
16929 			vm_map_deallocate(submap);
16930 			return result;
16931 		}
16932 
16933 		if (src_entry->is_sub_map) {
16934 			/* protections for submap mapping are irrelevant here */
16935 		} else if (((src_entry->protection & required_cur_prot) !=
16936 		    required_cur_prot) ||
16937 		    ((src_entry->max_protection & required_max_prot) !=
16938 		    required_max_prot)) {
16939 			if (vmk_flags.vmkf_copy_single_object &&
16940 			    mapped_size != 0) {
16941 				/*
16942 				 * Single object extraction.
16943 				 * We can't extract more with the required
16944 				 * protection but we've extracted some, so
16945 				 * stop there and declare success.
16946 				 * The caller should check the size of
16947 				 * the copy entry we've extracted.
16948 				 */
16949 				result = KERN_SUCCESS;
16950 			} else {
16951 				/*
16952 				 * VM range extraction.
16953 				 * Required proctection is not available
16954 				 * for this part of the range: fail.
16955 				 */
16956 				result = KERN_PROTECTION_FAILURE;
16957 			}
16958 			break;
16959 		}
16960 
16961 		if (src_entry->is_sub_map) {
16962 			vm_map_t submap;
16963 			vm_map_offset_t submap_start;
16964 			vm_map_size_t submap_size;
16965 			vm_map_copy_t submap_copy;
16966 			vm_prot_t submap_curprot, submap_maxprot;
16967 			boolean_t submap_needs_copy;
16968 
16969 			/*
16970 			 * No check for "required protection" on "src_entry"
16971 			 * because the protections that matter are the ones
16972 			 * on the submap's VM map entry, which will be checked
16973 			 * during the call to vm_map_copy_extract() below.
16974 			 */
16975 			object = VM_OBJECT_NULL;
16976 			submap_copy = VM_MAP_COPY_NULL;
16977 
16978 			/* find equivalent range in the submap */
16979 			submap = VME_SUBMAP(src_entry);
16980 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
16981 			submap_size = tmp_size;
16982 			if (copy) {
16983 				/*
16984 				 * The caller wants a copy-on-write re-mapping,
16985 				 * so let's extract from the submap accordingly.
16986 				 */
16987 				submap_needs_copy = TRUE;
16988 			} else if (src_entry->needs_copy) {
16989 				/*
16990 				 * The caller wants a shared re-mapping but the
16991 				 * submap is mapped with "needs_copy", so its
16992 				 * contents can't be shared as is. Extract the
16993 				 * contents of the submap as "copy-on-write".
16994 				 * The re-mapping won't be shared with the
16995 				 * original mapping but this is equivalent to
16996 				 * what happened with the original "remap from
16997 				 * submap" code.
16998 				 * The shared region is mapped "needs_copy", for
16999 				 * example.
17000 				 */
17001 				submap_needs_copy = TRUE;
17002 			} else {
17003 				/*
17004 				 * The caller wants a shared re-mapping and
17005 				 * this mapping can be shared (no "needs_copy"),
17006 				 * so let's extract from the submap accordingly.
17007 				 * Kernel submaps are mapped without
17008 				 * "needs_copy", for example.
17009 				 */
17010 				submap_needs_copy = FALSE;
17011 			}
17012 			/* extra ref to keep submap alive */
17013 			vm_map_reference(submap);
17014 
17015 			DTRACE_VM7(remap_submap_recurse,
17016 			    vm_map_t, map,
17017 			    vm_map_offset_t, addr,
17018 			    vm_map_size_t, size,
17019 			    boolean_t, copy,
17020 			    vm_map_offset_t, submap_start,
17021 			    vm_map_size_t, submap_size,
17022 			    boolean_t, submap_needs_copy);
17023 
17024 			/*
17025 			 * The map can be safely unlocked since we
17026 			 * already hold a reference on the submap.
17027 			 *
17028 			 * No timestamp since we don't care if the map
17029 			 * gets modified while we're down in the submap.
17030 			 * We'll resume the extraction at src_start + tmp_size
17031 			 * anyway.
17032 			 */
17033 			vm_map_unlock(map);
17034 			src_entry = NULL; /* not valid once map is unlocked */
17035 
17036 			if (vm_remap_legacy) {
17037 				submap_curprot = VM_PROT_NONE;
17038 				submap_maxprot = VM_PROT_NONE;
17039 				if (max_prot_for_prot_copy) {
17040 					submap_maxprot = max_prot_for_prot_copy;
17041 				}
17042 			} else {
17043 				assert(!max_prot_for_prot_copy);
17044 				submap_curprot = *cur_protection;
17045 				submap_maxprot = *max_protection;
17046 			}
17047 			result = vm_map_copy_extract(submap,
17048 			    submap_start,
17049 			    submap_size,
17050 			    submap_needs_copy,
17051 			    &submap_copy,
17052 			    &submap_curprot,
17053 			    &submap_maxprot,
17054 			    inheritance,
17055 			    vmk_flags);
17056 
17057 			/* release extra ref on submap */
17058 			vm_map_deallocate(submap);
17059 			submap = VM_MAP_NULL;
17060 
17061 			if (result != KERN_SUCCESS) {
17062 				vm_map_lock(map);
17063 				break;
17064 			}
17065 
17066 			/* transfer submap_copy entries to map_header */
17067 			while (vm_map_copy_first_entry(submap_copy) !=
17068 			    vm_map_copy_to_entry(submap_copy)) {
17069 				vm_map_entry_t copy_entry;
17070 				vm_map_size_t copy_entry_size;
17071 
17072 				copy_entry = vm_map_copy_first_entry(submap_copy);
17073 
17074 				/*
17075 				 * Prevent kernel_object from being exposed to
17076 				 * user space.
17077 				 */
17078 				if (__improbable(copy_entry->vme_kernel_object)) {
17079 					printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17080 					    proc_selfpid(),
17081 					    (get_bsdtask_info(current_task())
17082 					    ? proc_name_address(get_bsdtask_info(current_task()))
17083 					    : "?"));
17084 					DTRACE_VM(extract_kernel_only);
17085 					result = KERN_INVALID_RIGHT;
17086 					vm_map_copy_discard(submap_copy);
17087 					submap_copy = VM_MAP_COPY_NULL;
17088 					vm_map_lock(map);
17089 					break;
17090 				}
17091 
17092 				vm_map_copy_entry_unlink(submap_copy, copy_entry);
17093 				copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17094 				copy_entry->vme_start = map_address;
17095 				copy_entry->vme_end = map_address + copy_entry_size;
17096 				map_address += copy_entry_size;
17097 				mapped_size += copy_entry_size;
17098 				src_start += copy_entry_size;
17099 				assert(src_start <= src_end);
17100 				_vm_map_store_entry_link(map_header,
17101 				    map_header->links.prev,
17102 				    copy_entry);
17103 			}
17104 			/* done with submap_copy */
17105 			vm_map_copy_discard(submap_copy);
17106 
17107 			if (vm_remap_legacy) {
17108 				*cur_protection &= submap_curprot;
17109 				*max_protection &= submap_maxprot;
17110 			}
17111 
17112 			/* re-acquire the map lock and continue to next entry */
17113 			vm_map_lock(map);
17114 			continue;
17115 		} else {
17116 			object = VME_OBJECT(src_entry);
17117 
17118 			/*
17119 			 * Prevent kernel_object from being exposed to
17120 			 * user space.
17121 			 */
17122 			if (__improbable(object == kernel_object)) {
17123 				printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17124 				    proc_selfpid(),
17125 				    (get_bsdtask_info(current_task())
17126 				    ? proc_name_address(get_bsdtask_info(current_task()))
17127 				    : "?"));
17128 				DTRACE_VM(extract_kernel_only);
17129 				result = KERN_INVALID_RIGHT;
17130 				break;
17131 			}
17132 
17133 			if (src_entry->iokit_acct) {
17134 				/*
17135 				 * This entry uses "IOKit accounting".
17136 				 */
17137 			} else if (object != VM_OBJECT_NULL &&
17138 			    (object->purgable != VM_PURGABLE_DENY ||
17139 			    object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17140 				/*
17141 				 * Purgeable objects have their own accounting:
17142 				 * no pmap accounting for them.
17143 				 */
17144 				assertf(!src_entry->use_pmap,
17145 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17146 				    map,
17147 				    src_entry,
17148 				    (uint64_t)src_entry->vme_start,
17149 				    (uint64_t)src_entry->vme_end,
17150 				    src_entry->protection,
17151 				    src_entry->max_protection,
17152 				    VME_ALIAS(src_entry));
17153 			} else {
17154 				/*
17155 				 * Not IOKit or purgeable:
17156 				 * must be accounted by pmap stats.
17157 				 */
17158 				assertf(src_entry->use_pmap,
17159 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17160 				    map,
17161 				    src_entry,
17162 				    (uint64_t)src_entry->vme_start,
17163 				    (uint64_t)src_entry->vme_end,
17164 				    src_entry->protection,
17165 				    src_entry->max_protection,
17166 				    VME_ALIAS(src_entry));
17167 			}
17168 
17169 			if (object == VM_OBJECT_NULL) {
17170 				assert(!src_entry->needs_copy);
17171 				if (src_entry->max_protection == VM_PROT_NONE) {
17172 					assert(src_entry->protection == VM_PROT_NONE);
17173 					/*
17174 					 * No VM object and no permissions:
17175 					 * this must be a reserved range with
17176 					 * nothing to share or copy.
17177 					 * There could also be all sorts of
17178 					 * pmap shenanigans within that reserved
17179 					 * range, so let's just copy the map
17180 					 * entry as is to remap a similar
17181 					 * reserved range.
17182 					 */
17183 					offset = 0; /* no object => no offset */
17184 					goto copy_src_entry;
17185 				}
17186 				object = vm_object_allocate(entry_size);
17187 				VME_OFFSET_SET(src_entry, 0);
17188 				VME_OBJECT_SET(src_entry, object, false, 0);
17189 				assert(src_entry->use_pmap);
17190 				assert(!map->mapped_in_other_pmaps);
17191 			} else if (src_entry->wired_count ||
17192 			    object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17193 				/*
17194 				 * A wired memory region should not have
17195 				 * any pending copy-on-write and needs to
17196 				 * keep pointing at the VM object that
17197 				 * contains the wired pages.
17198 				 * If we're sharing this memory (copy=false),
17199 				 * we'll share this VM object.
17200 				 * If we're copying this memory (copy=true),
17201 				 * we'll call vm_object_copy_slowly() below
17202 				 * and use the new VM object for the remapping.
17203 				 *
17204 				 * Or, we are already using an asymmetric
17205 				 * copy, and therefore we already have
17206 				 * the right object.
17207 				 */
17208 				assert(!src_entry->needs_copy);
17209 			} else if (src_entry->needs_copy || object->shadowed ||
17210 			    (object->internal && !object->true_share &&
17211 			    !src_entry->is_shared &&
17212 			    object->vo_size > entry_size)) {
17213 				VME_OBJECT_SHADOW(src_entry, entry_size,
17214 				    vm_map_always_shadow(map));
17215 				assert(src_entry->use_pmap);
17216 
17217 				if (!src_entry->needs_copy &&
17218 				    (src_entry->protection & VM_PROT_WRITE)) {
17219 					vm_prot_t prot;
17220 
17221 					assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
17222 
17223 					prot = src_entry->protection & ~VM_PROT_WRITE;
17224 
17225 					if (override_nx(map,
17226 					    VME_ALIAS(src_entry))
17227 					    && prot) {
17228 						prot |= VM_PROT_EXECUTE;
17229 					}
17230 
17231 					assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
17232 
17233 					if (map->mapped_in_other_pmaps) {
17234 						vm_object_pmap_protect(
17235 							VME_OBJECT(src_entry),
17236 							VME_OFFSET(src_entry),
17237 							entry_size,
17238 							PMAP_NULL,
17239 							PAGE_SIZE,
17240 							src_entry->vme_start,
17241 							prot);
17242 #if MACH_ASSERT
17243 					} else if (__improbable(map->pmap == PMAP_NULL)) {
17244 						extern boolean_t vm_tests_in_progress;
17245 						assert(vm_tests_in_progress);
17246 						/*
17247 						 * Some VM tests (in vm_tests.c)
17248 						 * sometimes want to use a VM
17249 						 * map without a pmap.
17250 						 * Otherwise, this should never
17251 						 * happen.
17252 						 */
17253 #endif /* MACH_ASSERT */
17254 					} else {
17255 						pmap_protect(vm_map_pmap(map),
17256 						    src_entry->vme_start,
17257 						    src_entry->vme_end,
17258 						    prot);
17259 					}
17260 				}
17261 
17262 				object = VME_OBJECT(src_entry);
17263 				src_entry->needs_copy = FALSE;
17264 			}
17265 
17266 
17267 			vm_object_lock(object);
17268 			vm_object_reference_locked(object); /* object ref. for new entry */
17269 			assert(!src_entry->needs_copy);
17270 			if (object->copy_strategy ==
17271 			    MEMORY_OBJECT_COPY_SYMMETRIC) {
17272 				/*
17273 				 * If we want to share this object (copy==0),
17274 				 * it needs to be COPY_DELAY.
17275 				 * If we want to copy this object (copy==1),
17276 				 * we can't just set "needs_copy" on our side
17277 				 * and expect the other side to do the same
17278 				 * (symmetrically), so we can't let the object
17279 				 * stay COPY_SYMMETRIC.
17280 				 * So we always switch from COPY_SYMMETRIC to
17281 				 * COPY_DELAY.
17282 				 */
17283 				object->copy_strategy =
17284 				    MEMORY_OBJECT_COPY_DELAY;
17285 				object->true_share = TRUE;
17286 			}
17287 			vm_object_unlock(object);
17288 		}
17289 
17290 		offset = (VME_OFFSET(src_entry) +
17291 		    (src_start - src_entry->vme_start));
17292 
17293 copy_src_entry:
17294 		new_entry = _vm_map_entry_create(map_header);
17295 		vm_map_entry_copy(map, new_entry, src_entry);
17296 		if (new_entry->is_sub_map) {
17297 			/* clr address space specifics */
17298 			new_entry->use_pmap = FALSE;
17299 		} else if (copy) {
17300 			/*
17301 			 * We're dealing with a copy-on-write operation,
17302 			 * so the resulting mapping should not inherit the
17303 			 * original mapping's accounting settings.
17304 			 * "use_pmap" should be reset to its default (TRUE)
17305 			 * so that the new mapping gets accounted for in
17306 			 * the task's memory footprint.
17307 			 */
17308 			new_entry->use_pmap = TRUE;
17309 		}
17310 		/* "iokit_acct" was cleared in vm_map_entry_copy() */
17311 		assert(!new_entry->iokit_acct);
17312 
17313 		new_entry->map_aligned = FALSE;
17314 
17315 		new_entry->vme_start = map_address;
17316 		new_entry->vme_end = map_address + tmp_size;
17317 		assert(new_entry->vme_start < new_entry->vme_end);
17318 		if (copy && vmk_flags.vmkf_remap_prot_copy) {
17319 			/* security: keep "permanent" and "pmap_cs_associated" */
17320 			new_entry->vme_permanent = src_entry->vme_permanent;
17321 			new_entry->pmap_cs_associated = src_entry->pmap_cs_associated;
17322 			/*
17323 			 * Remapping for vm_map_protect(VM_PROT_COPY)
17324 			 * to convert a read-only mapping into a
17325 			 * copy-on-write version of itself but
17326 			 * with write access:
17327 			 * keep the original inheritance but let's not
17328 			 * add VM_PROT_WRITE to the max protection yet
17329 			 * since we want to do more security checks against
17330 			 * the target map.
17331 			 */
17332 			new_entry->inheritance = src_entry->inheritance;
17333 			new_entry->protection &= max_prot_for_prot_copy;
17334 		} else {
17335 			new_entry->inheritance = inheritance;
17336 			if (!vm_remap_legacy) {
17337 				new_entry->protection = *cur_protection;
17338 				new_entry->max_protection = *max_protection;
17339 			}
17340 		}
17341 		VME_OFFSET_SET(new_entry, offset);
17342 
17343 		/*
17344 		 * The new region has to be copied now if required.
17345 		 */
17346 RestartCopy:
17347 		if (!copy) {
17348 			if (src_entry->used_for_jit == TRUE) {
17349 				if (same_map) {
17350 				} else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
17351 					/*
17352 					 * Cannot allow an entry describing a JIT
17353 					 * region to be shared across address spaces.
17354 					 */
17355 					result = KERN_INVALID_ARGUMENT;
17356 					vm_object_deallocate(object);
17357 					vm_map_entry_dispose(new_entry);
17358 					new_entry = VM_MAP_ENTRY_NULL;
17359 					break;
17360 				}
17361 			}
17362 
17363 			src_entry->is_shared = TRUE;
17364 			new_entry->is_shared = TRUE;
17365 			if (!(new_entry->is_sub_map)) {
17366 				new_entry->needs_copy = FALSE;
17367 			}
17368 		} else if (src_entry->is_sub_map) {
17369 			/* make this a COW sub_map if not already */
17370 			assert(new_entry->wired_count == 0);
17371 			new_entry->needs_copy = TRUE;
17372 			object = VM_OBJECT_NULL;
17373 		} else if (src_entry->wired_count == 0 &&
17374 		    !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
17375 		    vm_object_copy_quickly(VME_OBJECT(new_entry),
17376 		    VME_OFFSET(new_entry),
17377 		    (new_entry->vme_end -
17378 		    new_entry->vme_start),
17379 		    &src_needs_copy,
17380 		    &new_entry_needs_copy)) {
17381 			new_entry->needs_copy = new_entry_needs_copy;
17382 			new_entry->is_shared = FALSE;
17383 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17384 
17385 			/*
17386 			 * Handle copy_on_write semantics.
17387 			 */
17388 			if (src_needs_copy && !src_entry->needs_copy) {
17389 				vm_prot_t prot;
17390 
17391 				assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
17392 
17393 				prot = src_entry->protection & ~VM_PROT_WRITE;
17394 
17395 				if (override_nx(map,
17396 				    VME_ALIAS(src_entry))
17397 				    && prot) {
17398 					prot |= VM_PROT_EXECUTE;
17399 				}
17400 
17401 				assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
17402 
17403 				vm_object_pmap_protect(object,
17404 				    offset,
17405 				    entry_size,
17406 				    ((src_entry->is_shared
17407 				    || map->mapped_in_other_pmaps) ?
17408 				    PMAP_NULL : map->pmap),
17409 				    VM_MAP_PAGE_SIZE(map),
17410 				    src_entry->vme_start,
17411 				    prot);
17412 
17413 				assert(src_entry->wired_count == 0);
17414 				src_entry->needs_copy = TRUE;
17415 			}
17416 			/*
17417 			 * Throw away the old object reference of the new entry.
17418 			 */
17419 			vm_object_deallocate(object);
17420 		} else {
17421 			new_entry->is_shared = FALSE;
17422 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17423 
17424 			src_entry_was_wired = (src_entry->wired_count > 0);
17425 			saved_src_entry = src_entry;
17426 			src_entry = VM_MAP_ENTRY_NULL;
17427 
17428 			/*
17429 			 * The map can be safely unlocked since we
17430 			 * already hold a reference on the object.
17431 			 *
17432 			 * Record the timestamp of the map for later
17433 			 * verification, and unlock the map.
17434 			 */
17435 			version.main_timestamp = map->timestamp;
17436 			vm_map_unlock(map);     /* Increments timestamp once! */
17437 
17438 			/*
17439 			 * Perform the copy.
17440 			 */
17441 			if (src_entry_was_wired > 0 ||
17442 			    (debug4k_no_cow_copyin &&
17443 			    VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
17444 				vm_object_lock(object);
17445 				result = vm_object_copy_slowly(
17446 					object,
17447 					offset,
17448 					(new_entry->vme_end -
17449 					new_entry->vme_start),
17450 					THREAD_UNINT,
17451 					&new_copy_object);
17452 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
17453 				saved_used_for_jit = new_entry->used_for_jit;
17454 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
17455 				new_entry->used_for_jit = saved_used_for_jit;
17456 				VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
17457 				new_entry->needs_copy = FALSE;
17458 			} else {
17459 				vm_object_offset_t new_offset;
17460 
17461 				new_offset = VME_OFFSET(new_entry);
17462 				result = vm_object_copy_strategically(
17463 					object,
17464 					offset,
17465 					(new_entry->vme_end -
17466 					new_entry->vme_start),
17467 					&new_copy_object,
17468 					&new_offset,
17469 					&new_entry_needs_copy);
17470 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
17471 				saved_used_for_jit = new_entry->used_for_jit;
17472 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
17473 				new_entry->used_for_jit = saved_used_for_jit;
17474 				if (new_offset != VME_OFFSET(new_entry)) {
17475 					VME_OFFSET_SET(new_entry, new_offset);
17476 				}
17477 
17478 				new_entry->needs_copy = new_entry_needs_copy;
17479 			}
17480 
17481 			/*
17482 			 * Throw away the old object reference of the new entry.
17483 			 */
17484 			vm_object_deallocate(object);
17485 
17486 			if (result != KERN_SUCCESS &&
17487 			    result != KERN_MEMORY_RESTART_COPY) {
17488 				vm_map_entry_dispose(new_entry);
17489 				vm_map_lock(map);
17490 				break;
17491 			}
17492 
17493 			/*
17494 			 * Verify that the map has not substantially
17495 			 * changed while the copy was being made.
17496 			 */
17497 
17498 			vm_map_lock(map);
17499 			if (version.main_timestamp + 1 != map->timestamp) {
17500 				/*
17501 				 * Simple version comparison failed.
17502 				 *
17503 				 * Retry the lookup and verify that the
17504 				 * same object/offset are still present.
17505 				 */
17506 				saved_src_entry = VM_MAP_ENTRY_NULL;
17507 				vm_object_deallocate(VME_OBJECT(new_entry));
17508 				vm_map_entry_dispose(new_entry);
17509 				if (result == KERN_MEMORY_RESTART_COPY) {
17510 					result = KERN_SUCCESS;
17511 				}
17512 				continue;
17513 			}
17514 			/* map hasn't changed: src_entry is still valid */
17515 			src_entry = saved_src_entry;
17516 			saved_src_entry = VM_MAP_ENTRY_NULL;
17517 
17518 			if (result == KERN_MEMORY_RESTART_COPY) {
17519 				vm_object_reference(object);
17520 				goto RestartCopy;
17521 			}
17522 		}
17523 
17524 		_vm_map_store_entry_link(map_header,
17525 		    map_header->links.prev, new_entry);
17526 
17527 		/* protections for submap mapping are irrelevant here */
17528 		if (vm_remap_legacy && !src_entry->is_sub_map) {
17529 			*cur_protection &= src_entry->protection;
17530 			*max_protection &= src_entry->max_protection;
17531 		}
17532 
17533 		map_address += tmp_size;
17534 		mapped_size += tmp_size;
17535 		src_start += tmp_size;
17536 
17537 		if (vmk_flags.vmkf_copy_single_object) {
17538 			if (mapped_size != size) {
17539 				DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
17540 				    map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
17541 				if (src_entry->vme_next != vm_map_to_entry(map) &&
17542 				    src_entry->vme_next->vme_object_value ==
17543 				    src_entry->vme_object_value) {
17544 					/* XXX TODO4K */
17545 					DEBUG4K_ERROR("could have extended copy to next entry...\n");
17546 				}
17547 			}
17548 			break;
17549 		}
17550 	} /* end while */
17551 
17552 	vm_map_unlock(map);
17553 	if (result != KERN_SUCCESS) {
17554 		/*
17555 		 * Free all allocated elements.
17556 		 */
17557 		for (src_entry = map_header->links.next;
17558 		    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
17559 		    src_entry = new_entry) {
17560 			new_entry = src_entry->vme_next;
17561 			_vm_map_store_entry_unlink(map_header, src_entry, false);
17562 			if (src_entry->is_sub_map) {
17563 				vm_map_deallocate(VME_SUBMAP(src_entry));
17564 			} else {
17565 				vm_object_deallocate(VME_OBJECT(src_entry));
17566 			}
17567 			vm_map_entry_dispose(src_entry);
17568 		}
17569 	}
17570 	return result;
17571 }
17572 
17573 bool
vm_map_is_exotic(vm_map_t map)17574 vm_map_is_exotic(
17575 	vm_map_t map)
17576 {
17577 	return VM_MAP_IS_EXOTIC(map);
17578 }
17579 
17580 bool
vm_map_is_alien(vm_map_t map)17581 vm_map_is_alien(
17582 	vm_map_t map)
17583 {
17584 	return VM_MAP_IS_ALIEN(map);
17585 }
17586 
17587 #if XNU_TARGET_OS_OSX
/*
 * Mark "map" as "alien" (queried via vm_map_is_alien() /
 * VM_MAP_IS_ALIEN()).  The map lock is taken exclusively so the
 * update is serialized with other modifications of the map.
 */
void
vm_map_mark_alien(
	vm_map_t map)
{
	vm_map_lock(map);
	map->is_alien = true;
	vm_map_unlock(map);
}
17596 
/*
 * Set the "single_jit" flag on "map".  The map lock is taken
 * exclusively so the update is serialized with other modifications
 * of the map.
 */
void
vm_map_single_jit(
	vm_map_t map)
{
	vm_map_lock(map);
	map->single_jit = true;
	vm_map_unlock(map);
}
17605 #endif /* XNU_TARGET_OS_OSX */
17606 
17607 /*
17608  * Callers of this function must call vm_map_copy_require on
17609  * previously created vm_map_copy_t or pass a newly created
17610  * one to ensure that it hasn't been forged.
17611  */
/*
 * Convert "copy_map" (whose page size differs from "target_map"'s) into a
 * physical copy: allocate a fresh VM object sized/aligned for the kernel
 * page size, copy the bytes of the original mappings into it page by page
 * via a temporary map, then replace all of "copy_map"'s entries with a
 * single entry backed by the new object, with "copy_map"'s page_shift
 * switched to match "target_map".
 */
static kern_return_t
vm_map_copy_to_physcopy(
	vm_map_copy_t   copy_map,
	vm_map_t        target_map)
{
	vm_map_size_t           size;
	vm_map_entry_t          entry;
	vm_map_entry_t          new_entry;
	vm_object_t             new_object;
	unsigned int            pmap_flags;
	pmap_t                  new_pmap;
	vm_map_t                new_map;
	vm_map_address_t        src_start, src_end, src_cur;
	vm_map_address_t        dst_start, dst_end, dst_cur;
	kern_return_t           kr;
	void                    *kbuf;

	/*
	 * Perform the equivalent of vm_allocate() and memcpy().
	 * Replace the mappings in "copy_map" with the newly allocated mapping.
	 */
	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);

	/* only needed when the page sizes actually disagree */
	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));

	/*
	 * Create a new pmap to map "copy_map": the temporary map below must
	 * use copy_map's (4K) page size so copyout of the original entries
	 * is possible at their native granularity.
	 */
	pmap_flags = 0;
	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
#if PMAP_CREATE_FORCE_4K_PAGES
	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
#endif /* PMAP_CREATE_FORCE_4K_PAGES */
	pmap_flags |= PMAP_CREATE_64BIT;
	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
	if (new_pmap == NULL) {
		return KERN_RESOURCE_SHORTAGE;
	}

	/* allocate new VM object, rounded up to the kernel page size */
	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
	new_object = vm_object_allocate(size);
	assert(new_object);

	/* allocate new VM map entry */
	new_entry = vm_map_copy_entry_create(copy_map);
	assert(new_entry);

	/* finish initializing new VM map entry */
	new_entry->protection = VM_PROT_DEFAULT;
	new_entry->max_protection = VM_PROT_DEFAULT;
	new_entry->use_pmap = TRUE;

	/* make new VM map entry point to new VM object */
	new_entry->vme_start = 0;
	new_entry->vme_end = size;
	VME_OBJECT_SET(new_entry, new_object, false, 0);
	VME_OFFSET_SET(new_entry, 0);

	/* create a new pageable VM map to map "copy_map" */
	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
	    VM_MAP_CREATE_PAGEABLE);
	assert(new_map);
	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);

	/*
	 * Map "copy_map" in the new VM map, without consuming it, so the
	 * original entries remain valid until we dispose of them below.
	 */
	src_start = 0;
	kr = vm_map_copyout_internal(
		new_map,
		&src_start,
		copy_map,
		copy_map->size,
		FALSE, /* consume_on_success */
		VM_PROT_DEFAULT,
		VM_PROT_DEFAULT,
		VM_INHERIT_DEFAULT);
	assert(kr == KERN_SUCCESS);
	src_end = src_start + copy_map->size;

	/* map "new_object" in the new VM map */
	vm_object_reference(new_object);
	dst_start = 0;
	kr = vm_map_enter(new_map,
	    &dst_start,
	    size,
	    0,               /* mask */
	    VM_FLAGS_ANYWHERE,
	    VM_MAP_KERNEL_FLAGS_NONE,
	    VM_KERN_MEMORY_OSFMK,
	    new_object,
	    0,               /* offset */
	    FALSE,               /* needs copy */
	    VM_PROT_DEFAULT,
	    VM_PROT_DEFAULT,
	    VM_INHERIT_DEFAULT);
	assert(kr == KERN_SUCCESS);
	dst_end = dst_start + size;

	/* get a kernel buffer used to shuttle one page at a time */
	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);

	/*
	 * Physically copy "copy_map" mappings to new VM object.
	 * NOTE(review): copyinmap/copyoutmap failures are only logged here
	 * and do not change the return value; this function still returns
	 * KERN_SUCCESS — confirm this best-effort behavior is intended.
	 */
	for (src_cur = src_start, dst_cur = dst_start;
	    src_cur < src_end;
	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
		vm_size_t bytes;

		bytes = PAGE_SIZE;
		if (src_cur + PAGE_SIZE > src_end) {
			/* partial copy for last page */
			bytes = src_end - src_cur;
			assert(bytes > 0 && bytes < PAGE_SIZE);
			/* rest of dst page should be zero-filled */
		}
		/* get bytes from src mapping */
		kr = copyinmap(new_map, src_cur, kbuf, bytes);
		if (kr != KERN_SUCCESS) {
			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
		}
		/* put bytes in dst mapping */
		assert(dst_cur < dst_end);
		assert(dst_cur + bytes <= dst_end);
		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
		if (kr != KERN_SUCCESS) {
			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
		}
	}

	/* free kernel buffer */
	kfree_data(kbuf, PAGE_SIZE);

	/* destroy new map (also drops the temporary mappings created above) */
	vm_map_destroy(new_map);
	new_map = VM_MAP_NULL;

	/* dispose of the old map entries in "copy_map" */
	while (vm_map_copy_first_entry(copy_map) !=
	    vm_map_copy_to_entry(copy_map)) {
		entry = vm_map_copy_first_entry(copy_map);
		vm_map_copy_entry_unlink(copy_map, entry);
		if (entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(entry));
		} else {
			vm_object_deallocate(VME_OBJECT(entry));
		}
		vm_map_copy_entry_dispose(entry);
	}

	/* change "copy_map"'s page_size to match "target_map" */
	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
	copy_map->offset = 0;
	copy_map->size = size;

	/* insert new map entry in "copy_map" (now its only entry) */
	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);

	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
	return KERN_SUCCESS;
}
17770 
17771 void
17772 vm_map_copy_adjust_get_target_copy_map(
17773 	vm_map_copy_t   copy_map,
17774 	vm_map_copy_t   *target_copy_map_p);
17775 void
vm_map_copy_adjust_get_target_copy_map(vm_map_copy_t copy_map,vm_map_copy_t * target_copy_map_p)17776 vm_map_copy_adjust_get_target_copy_map(
17777 	vm_map_copy_t   copy_map,
17778 	vm_map_copy_t   *target_copy_map_p)
17779 {
17780 	vm_map_copy_t   target_copy_map;
17781 	vm_map_entry_t  entry, target_entry;
17782 
17783 	if (*target_copy_map_p != VM_MAP_COPY_NULL) {
17784 		/* the caller already has a "target_copy_map": use it */
17785 		return;
17786 	}
17787 
17788 	/* the caller wants us to create a new copy of "copy_map" */
17789 	target_copy_map = vm_map_copy_allocate();
17790 	target_copy_map->type = copy_map->type;
17791 	assert(target_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
17792 	target_copy_map->offset = copy_map->offset;
17793 	target_copy_map->size = copy_map->size;
17794 	target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
17795 	vm_map_store_init(&target_copy_map->cpy_hdr);
17796 	for (entry = vm_map_copy_first_entry(copy_map);
17797 	    entry != vm_map_copy_to_entry(copy_map);
17798 	    entry = entry->vme_next) {
17799 		target_entry = vm_map_copy_entry_create(target_copy_map);
17800 		vm_map_entry_copy_full(target_entry, entry);
17801 		if (target_entry->is_sub_map) {
17802 			vm_map_reference(VME_SUBMAP(target_entry));
17803 		} else {
17804 			vm_object_reference(VME_OBJECT(target_entry));
17805 		}
17806 		vm_map_copy_entry_link(
17807 			target_copy_map,
17808 			vm_map_copy_last_entry(target_copy_map),
17809 			target_entry);
17810 	}
17811 	entry = VM_MAP_ENTRY_NULL;
17812 	*target_copy_map_p = target_copy_map;
17813 }
17814 
17815 /*
17816  * Callers of this function must call vm_map_copy_require on
17817  * previously created vm_map_copy_t or pass a newly created
17818  * one to ensure that it hasn't been forged.
17819  */
/*
 * Remove the range [trim_start, trim_end) from "copy_map", where both
 * offsets are relative to the start of the copy map's first entry.
 * Entries straddling a boundary are clipped (at "new_page_shift"
 * granularity) so only the overlapping portion is removed; each removed
 * entry drops its submap/object reference and "copy_map"'s size shrinks
 * accordingly.
 */
static void
vm_map_copy_trim(
	vm_map_copy_t   copy_map,
	uint16_t        new_page_shift,
	vm_map_offset_t trim_start,
	vm_map_offset_t trim_end)
{
	uint16_t        copy_page_shift;
	vm_map_entry_t  entry, next_entry;

	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
	assert(copy_map->cpy_hdr.nentries > 0);

	/* convert relative offsets to the entries' address space */
	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;

	/*
	 * Use the new page_shift to do the clipping; the original shift
	 * is saved and restored at the end.
	 */
	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_map->cpy_hdr.page_shift = new_page_shift;

	for (entry = vm_map_copy_first_entry(copy_map);
	    entry != vm_map_copy_to_entry(copy_map);
	    entry = next_entry) {
		/* grab the next link before "entry" is unlinked/disposed */
		next_entry = entry->vme_next;
		if (entry->vme_end <= trim_start) {
			/* entry fully before trim range: skip */
			continue;
		}
		if (entry->vme_start >= trim_end) {
			/* entry fully after trim range: done */
			break;
		}
		/* clip entry if needed */
		vm_map_copy_clip_start(copy_map, entry, trim_start);
		vm_map_copy_clip_end(copy_map, entry, trim_end);
		/* dispose of entry (now fully inside the trim range) */
		copy_map->size -= entry->vme_end - entry->vme_start;
		vm_map_copy_entry_unlink(copy_map, entry);
		if (entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(entry));
		} else {
			vm_object_deallocate(VME_OBJECT(entry));
		}
		vm_map_copy_entry_dispose(entry);
		entry = VM_MAP_ENTRY_NULL;
	}

	/* restore copy_map's original page_shift */
	copy_map->cpy_hdr.page_shift = copy_page_shift;
}
17870 
17871 /*
17872  * Make any necessary adjustments to "copy_map" to allow it to be
17873  * mapped into "target_map".
17874  * If no changes were necessary, "target_copy_map" points to the
17875  * untouched "copy_map".
17876  * If changes are necessary, changes will be made to "target_copy_map".
17877  * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
17878  * copy the original "copy_map" to it before applying the changes.
17879  * The caller should discard "target_copy_map" if it's not the same as
17880  * the original "copy_map".
17881  */
17882 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
kern_return_t
vm_map_copy_adjust_to_target(
	vm_map_copy_t           src_copy_map,
	vm_map_offset_t         offset,
	vm_map_size_t           size,
	vm_map_t                target_map,
	boolean_t               copy,
	vm_map_copy_t           *target_copy_map_p,
	vm_map_offset_t         *overmap_start_p,
	vm_map_offset_t         *overmap_end_p,
	vm_map_offset_t         *trimmed_start_p)
{
	vm_map_copy_t           copy_map, target_copy_map;
	vm_map_size_t           target_size;
	vm_map_size_t           src_copy_map_size;
	vm_map_size_t           overmap_start, overmap_end;
	int                     misalignments;
	vm_map_entry_t          entry, target_entry;
	vm_map_offset_t         addr_adjustment;
	vm_map_offset_t         new_start, new_end;
	int                     copy_page_mask, target_page_mask;
	uint16_t                copy_page_shift, target_page_shift;
	vm_map_offset_t         trimmed_end;

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(src_copy_map);
	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);

	/*
	 * Start working with "src_copy_map" but we'll switch
	 * to "target_copy_map" as soon as we start making adjustments.
	 */
	copy_map = src_copy_map;
	src_copy_map_size = src_copy_map->size;

	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
	target_page_mask = VM_MAP_PAGE_MASK(target_map);

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p);

	target_copy_map = *target_copy_map_p;
	if (target_copy_map != VM_MAP_COPY_NULL) {
		vm_map_copy_require(target_copy_map);
	}

	/* the requested sub-range must fit within the copy */
	if (offset + size > copy_map->size) {
		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size);
		return KERN_INVALID_ARGUMENT;
	}

	/* trim the end */
	trimmed_end = 0;
	/* round the requested end up to a target-map page boundary */
	new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask);
	if (new_end < copy_map->size) {
		trimmed_end = src_copy_map_size - new_end;
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		/* from here on, only the (possibly new) target copy is mutated */
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    new_end, copy_map->size);
	}

	/* trim the start */
	/* truncate the requested start down to a target-map page boundary */
	new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask);
	if (new_start != 0) {
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    0, new_start);
	}
	*trimmed_start_p = new_start;

	/* target_size starts with what's left after trimming */
	target_size = copy_map->size;
	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);

	/* check for misalignments but don't adjust yet */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	if (copy_page_shift < target_page_shift) {
		/*
		 * Remapping from 4K to 16K: check the VM object alignments
		 * throughout the range.
		 * If the start and end of the range are mis-aligned, we can
		 * over-map to re-align, and adjust the "overmap" start/end
		 * and "target_size" of the range accordingly.
		 * If there is any mis-alignment within the range:
		 *     if "copy":
		 *         we can do immediate-copy instead of copy-on-write,
		 *     else:
		 *         no way to remap and share; fail.
		 */
		for (entry = vm_map_copy_first_entry(copy_map);
		    entry != vm_map_copy_to_entry(copy_map);
		    entry = entry->vme_next) {
			vm_object_offset_t object_offset_start, object_offset_end;

			object_offset_start = VME_OFFSET(entry);
			object_offset_end = object_offset_start;
			object_offset_end += entry->vme_end - entry->vme_start;
			if (object_offset_start & target_page_mask) {
				/* only the first entry may fix this by over-mapping */
				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
					overmap_start++;
				} else {
					misalignments++;
				}
			}
			if (object_offset_end & target_page_mask) {
				/* only the last entry may fix this by over-mapping */
				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
					overmap_end++;
				} else {
					misalignments++;
				}
			}
		}
	}
	entry = VM_MAP_ENTRY_NULL;

	/* decide how to deal with misalignments */
	assert(overmap_start <= 1);
	assert(overmap_end <= 1);
	if (!overmap_start && !overmap_end && !misalignments) {
		/* copy_map is properly aligned for target_map ... */
		if (*trimmed_start_p) {
			/* ... but we trimmed it, so still need to adjust */
		} else {
			/* ... and we didn't trim anything: we're done */
			if (target_copy_map == VM_MAP_COPY_NULL) {
				target_copy_map = copy_map;
			}
			*target_copy_map_p = target_copy_map;
			*overmap_start_p = 0;
			*overmap_end_p = 0;
			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
			return KERN_SUCCESS;
		}
	} else if (misalignments && !copy) {
		/* can't "share" if misaligned */
		DEBUG4K_ADJUST("unsupported sharing\n");
#if MACH_ASSERT
		if (debug4k_panic_on_misaligned_sharing) {
			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
		}
#endif /* MACH_ASSERT */
		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
		return KERN_NOT_SUPPORTED;
	} else {
		/* can't virtual-copy if misaligned (but can physical-copy) */
		DEBUG4K_ADJUST("mis-aligned copying\n");
	}

	/* get a "target_copy_map" if needed and switch to it */
	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
	copy_map = target_copy_map;

	if (misalignments && copy) {
		vm_map_size_t target_copy_map_size;

		/*
		 * Can't do copy-on-write with misaligned mappings.
		 * Replace the mappings with a physical copy of the original
		 * mappings' contents.
		 */
		target_copy_map_size = target_copy_map->size;
		kern_return_t kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		*target_copy_map_p = target_copy_map;
		*overmap_start_p = 0;
		/* physcopy may have grown the copy to page-aligned size */
		*overmap_end_p = target_copy_map->size - target_copy_map_size;
		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
		return KERN_SUCCESS;
	}

	/* apply the adjustments */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	/* remove copy_map->offset, so that everything starts at offset 0 */
	addr_adjustment = copy_map->offset;
	/* also remove whatever we trimmed from the start */
	addr_adjustment += *trimmed_start_p;
	for (target_entry = vm_map_copy_first_entry(target_copy_map);
	    target_entry != vm_map_copy_to_entry(target_copy_map);
	    target_entry = target_entry->vme_next) {
		vm_object_offset_t object_offset_start, object_offset_end;

		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
		object_offset_start = VME_OFFSET(target_entry);
		if (object_offset_start & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
				/*
				 * start of 1st entry is mis-aligned:
				 * re-adjust by over-mapping.
				 */
				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				assert(copy);
			}
		}

		/*
		 * Shift every entry after the first one up by "overmap_start";
		 * the first entry only grows (its end moves, not its start).
		 */
		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
			target_size += overmap_start;
		} else {
			target_entry->vme_start += overmap_start;
		}
		target_entry->vme_end += overmap_start;

		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
		if (object_offset_end & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
				/*
				 * end of last entry is mis-aligned: re-adjust by over-mapping.
				 */
				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
				target_entry->vme_end += overmap_end;
				target_size += overmap_end;
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				assert(copy);
			}
		}
		/* rebase the entry so the copy's range starts at address 0 */
		target_entry->vme_start -= addr_adjustment;
		target_entry->vme_end -= addr_adjustment;
		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
	}

	target_copy_map->size = target_size;
	target_copy_map->offset += overmap_start;
	target_copy_map->offset -= addr_adjustment;
	/* the copy now uses the target map's page size */
	target_copy_map->cpy_hdr.page_shift = target_page_shift;

//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));

	*target_copy_map_p = target_copy_map;
	*overmap_start_p = overmap_start;
	*overmap_end_p = overmap_end;

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
	return KERN_SUCCESS;
}
18150 
/*
 * Compute, in *phys_size, the size of the page-aligned span that
 * [start, start+size) in "map" occupies once expressed at the kernel's
 * native page size.  For maps already using the native page size this
 * is just the map-page-rounded size; for smaller-page maps the range is
 * extracted and adjusted to kernel_map granularity to account for
 * over-mapping at the edges.
 */
kern_return_t
vm_map_range_physical_size(
	vm_map_t         map,
	vm_map_address_t start,
	mach_vm_size_t   size,
	mach_vm_size_t * phys_size)
{
	kern_return_t   kr;
	vm_map_copy_t   copy_map, target_copy_map;
	vm_map_offset_t adjusted_start, adjusted_end;
	vm_map_size_t   adjusted_size;
	vm_prot_t       cur_prot, max_prot;
	vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
	vm_map_kernel_flags_t vmk_flags;

	/* empty range: trivially zero physical size */
	if (size == 0) {
		DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
		*phys_size = 0;
		return KERN_SUCCESS;
	}

	/* expand the range to the map's own page boundaries */
	adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
	adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
	if (__improbable(os_add_overflow(start, size, &end) ||
	    adjusted_end <= adjusted_start)) {
		/* wraparound */
		printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
		*phys_size = 0;
		return KERN_INVALID_ARGUMENT;
	}
	assert(adjusted_end > adjusted_start);
	adjusted_size = adjusted_end - adjusted_start;
	*phys_size = adjusted_size;
	if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
		/* map already uses the native page size: no adjustment needed */
		return KERN_SUCCESS;
	}
	if (start == 0) {
		/*
		 * NOTE(review): start==0 is special-cased to a pure
		 * native-page-size rounding, skipping the extract/adjust
		 * path below — presumably because address 0 can't be
		 * extracted; confirm against callers.
		 */
		adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
		adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
		if (__improbable(adjusted_end <= adjusted_start)) {
			/* wraparound */
			printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
			*phys_size = 0;
			return KERN_INVALID_ARGUMENT;
		}
		assert(adjusted_end > adjusted_start);
		adjusted_size = adjusted_end - adjusted_start;
		*phys_size = adjusted_size;
		return KERN_SUCCESS;
	}

	/* extract a copy of the range (shared, not copied) from "map" ... */
	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	vmk_flags.vmkf_copy_pageable = TRUE;
	vmk_flags.vmkf_copy_same_map = TRUE;
	assert(adjusted_size != 0);
	cur_prot = VM_PROT_NONE; /* legacy mode */
	max_prot = VM_PROT_NONE; /* legacy mode */
	kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
	    FALSE /* copy */,
	    &copy_map,
	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
	    vmk_flags);
	if (kr != KERN_SUCCESS) {
		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
		//assert(0);
		*phys_size = 0;
		return kr;
	}
	assert(copy_map != VM_MAP_COPY_NULL);
	target_copy_map = copy_map;
	DEBUG4K_ADJUST("adjusting...\n");
	/* ... and adjust it to the kernel map's page size to learn its size */
	kr = vm_map_copy_adjust_to_target(
		copy_map,
		start - adjusted_start, /* offset */
		size, /* size */
		kernel_map,
		FALSE,                          /* copy */
		&target_copy_map,
		&overmap_start,
		&overmap_end,
		&trimmed_start);
	if (kr == KERN_SUCCESS) {
		if (target_copy_map->size != *phys_size) {
			DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
		}
		*phys_size = target_copy_map->size;
	} else {
		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
		//assert(0);
		*phys_size = 0;
	}
	/* the copy was only needed to compute the size: discard it */
	vm_map_copy_discard(copy_map);
	copy_map = VM_MAP_COPY_NULL;

	return kr;
}
18247 
18248 
18249 kern_return_t
memory_entry_check_for_adjustment(vm_map_t src_map,ipc_port_t port,vm_map_offset_t * overmap_start,vm_map_offset_t * overmap_end)18250 memory_entry_check_for_adjustment(
18251 	vm_map_t                        src_map,
18252 	ipc_port_t                      port,
18253 	vm_map_offset_t         *overmap_start,
18254 	vm_map_offset_t         *overmap_end)
18255 {
18256 	kern_return_t kr = KERN_SUCCESS;
18257 	vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL;
18258 
18259 	assert(port);
18260 	assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port));
18261 
18262 	vm_named_entry_t        named_entry;
18263 
18264 	named_entry = mach_memory_entry_from_port(port);
18265 	named_entry_lock(named_entry);
18266 	copy_map = named_entry->backing.copy;
18267 	target_copy_map = copy_map;
18268 
18269 	if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) {
18270 		vm_map_offset_t trimmed_start;
18271 
18272 		trimmed_start = 0;
18273 		DEBUG4K_ADJUST("adjusting...\n");
18274 		kr = vm_map_copy_adjust_to_target(
18275 			copy_map,
18276 			0, /* offset */
18277 			copy_map->size, /* size */
18278 			src_map,
18279 			FALSE, /* copy */
18280 			&target_copy_map,
18281 			overmap_start,
18282 			overmap_end,
18283 			&trimmed_start);
18284 		assert(trimmed_start == 0);
18285 	}
18286 	named_entry_unlock(named_entry);
18287 
18288 	return kr;
18289 }
18290 
18291 
18292 /*
18293  *	Routine:	vm_remap
18294  *
18295  *			Map portion of a task's address space.
18296  *			Mapped region must not overlap more than
18297  *			one vm memory object. Protections and
18298  *			inheritance attributes remain the same
18299  *			as in the original task and are	out parameters.
18300  *			Source and Target task can be identical
18301  *			Other attributes are identical as for vm_map()
18302  */
kern_return_t
vm_map_remap(
	vm_map_t                target_map,
	vm_map_address_t        *address,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	int                     flags,
	vm_map_kernel_flags_t   vmk_flags,
	vm_tag_t                tag,
	vm_map_t                src_map,
	vm_map_offset_t         memory_address,
	boolean_t               copy,
	vm_prot_t               *cur_protection, /* IN/OUT */
	vm_prot_t               *max_protection, /* IN/OUT */
	vm_inherit_t            inheritance)
{
	kern_return_t           result;
	vm_map_entry_t          entry;
	vm_map_entry_t          insp_entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t          new_entry;
	vm_map_copy_t           copy_map;
	vm_map_offset_t         offset_in_mapping;
	vm_map_size_t           target_size = 0;
	vm_map_size_t           src_page_mask, target_page_mask;
	vm_map_offset_t         overmap_start, overmap_end, trimmed_start;
	vm_map_offset_t         initial_memory_address;
	vm_map_size_t           initial_size;
	VM_MAP_ZAP_DECLARE(zap_list);

	if (target_map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/* remember the caller's un-rounded request */
	initial_memory_address = memory_address;
	initial_size = size;
	src_page_mask = VM_MAP_PAGE_MASK(src_map);
	target_page_mask = VM_MAP_PAGE_MASK(target_map);

	/* validate "inheritance" and reject zero size / NULL source early */
	switch (inheritance) {
	case VM_INHERIT_NONE:
	case VM_INHERIT_COPY:
	case VM_INHERIT_SHARE:
		if (size != 0 && src_map != VM_MAP_NULL) {
			break;
		}
		OS_FALLTHROUGH;
	default:
		return KERN_INVALID_ARGUMENT;
	}

	if (src_page_mask != target_page_mask) {
		if (copy) {
			DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
		} else {
			DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
		}
	}

	/*
	 * If the user is requesting that we return the address of the
	 * first byte of the data (rather than the base of the page),
	 * then we use different rounding semantics: specifically,
	 * we assume that (memory_address, size) describes a region
	 * all of whose pages we must cover, rather than a base to be truncated
	 * down and a size to be added to that base.  So we figure out
	 * the highest page that the requested region includes and make
	 * sure that the size will cover it.
	 *
	 * The key example we're worried about it is of the form:
	 *
	 *              memory_address = 0x1ff0, size = 0x20
	 *
	 * With the old semantics, we round down the memory_address to 0x1000
	 * and round up the size to 0x1000, resulting in our covering *only*
	 * page 0x1000.  With the new semantics, we'd realize that the region covers
	 * 0x1ff0-0x2010, and compute a size of 0x2000.  Thus, we cover both page
	 * 0x1000 and page 0x2000 in the region we remap.
	 */
	if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) {
		vm_map_offset_t range_start, range_end;

		range_start = vm_map_trunc_page(memory_address, src_page_mask);
		range_end = vm_map_round_page(memory_address + size, src_page_mask);
		memory_address = range_start;
		size = range_end - range_start;
		/* how far into the first page the requested data starts */
		offset_in_mapping = initial_memory_address - memory_address;
	} else {
		/*
		 * IMPORTANT:
		 * This legacy code path is broken: for the range mentioned
		 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
		 * two 4k pages, it yields [ memory_address = 0x1000,
		 * size = 0x1000 ], which covers only the first 4k page.
		 * BUT some code unfortunately depends on this bug, so we
		 * can't fix it without breaking something.
		 * New code should get automatically opted in the new
		 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
		 */
		offset_in_mapping = 0;
		memory_address = vm_map_trunc_page(memory_address, src_page_mask);
		size = vm_map_round_page(size, src_page_mask);
		initial_memory_address = memory_address;
		initial_size = size;
	}


	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	if (flags & VM_FLAGS_RESILIENT_MEDIA) {
		/* must be copy-on-write to be "media resilient" */
		if (!copy) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
	vmk_flags.vmkf_copy_same_map = (src_map == target_map);

	/* extract the source range as a list of map entries */
	assert(size != 0);
	result = vm_map_copy_extract(src_map,
	    memory_address,
	    size,
	    copy, &copy_map,
	    cur_protection, /* IN/OUT */
	    max_protection, /* IN/OUT */
	    inheritance,
	    vmk_flags);
	if (result != KERN_SUCCESS) {
		return result;
	}
	assert(copy_map != VM_MAP_COPY_NULL);

	overmap_start = 0;
	overmap_end = 0;
	trimmed_start = 0;
	target_size = size;
	if (src_page_mask != target_page_mask) {
		/* page sizes differ: adjust the copy to the target's page size */
		vm_map_copy_t target_copy_map;

		target_copy_map = copy_map; /* can modify "copy_map" itself */
		DEBUG4K_ADJUST("adjusting...\n");
		result = vm_map_copy_adjust_to_target(
			copy_map,
			offset_in_mapping, /* offset */
			initial_size,
			target_map,
			copy,
			&target_copy_map,
			&overmap_start,
			&overmap_end,
			&trimmed_start);
		if (result != KERN_SUCCESS) {
			DEBUG4K_COPY("failed to adjust 0x%x\n", result);
			vm_map_copy_discard(copy_map);
			return result;
		}
		/* keep offset_in_mapping in sync with what was trimmed/overmapped */
		if (trimmed_start == 0) {
			/* nothing trimmed: no adjustment needed */
		} else if (trimmed_start >= offset_in_mapping) {
			/* trimmed more than offset_in_mapping: nothing left */
			assert(overmap_start == 0);
			assert(overmap_end == 0);
			offset_in_mapping = 0;
		} else {
			/* trimmed some of offset_in_mapping: adjust */
			assert(overmap_start == 0);
			assert(overmap_end == 0);
			offset_in_mapping -= trimmed_start;
		}
		offset_in_mapping += overmap_start;
		target_size = target_copy_map->size;
	}

	/*
	 * Allocate/check a range of free virtual address
	 * space for the target
	 */
	*address = vm_map_trunc_page(*address, target_page_mask);
	vm_map_lock(target_map);
	target_size = vm_map_round_page(target_size, target_page_mask);
	result = vm_map_remap_range_allocate(target_map, address,
	    target_size, mask, flags, vmk_flags, tag,
	    &insp_entry, &zap_list);

	/*
	 * Move each entry out of the copy and, on success, link it into
	 * target_map at its new address; on failure, dispose of it.
	 */
	for (entry = vm_map_copy_first_entry(copy_map);
	    entry != vm_map_copy_to_entry(copy_map);
	    entry = new_entry) {
		new_entry = entry->vme_next;
		vm_map_copy_entry_unlink(copy_map, entry);
		if (result == KERN_SUCCESS) {
			if (vmk_flags.vmkf_remap_prot_copy) {
				/*
				 * This vm_map_remap() is for a
				 * vm_protect(VM_PROT_COPY), so the caller
				 * expects to be allowed to add write access
				 * to this new mapping.  This is done by
				 * adding VM_PROT_WRITE to each entry's
				 * max_protection... unless some security
				 * settings disallow it.
				 */
				bool allow_write = false;
				if (entry->vme_permanent) {
					/* immutable mapping... */
					if ((entry->max_protection & VM_PROT_EXECUTE) &&
					    developer_mode_state()) {
						/*
						 * ... but executable and
						 * possibly being debugged,
						 * so let's allow it to become
						 * writable, for breakpoints
						 * and dtrace probes, for
						 * example.
						 */
						allow_write = true;
					} else {
						printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
						    proc_selfpid(),
						    (get_bsdtask_info(current_task())
						    ? proc_name_address(get_bsdtask_info(current_task()))
						    : "?"),
						    (uint64_t)memory_address,
						    (uint64_t)size,
						    entry->protection,
						    entry->max_protection,
						    developer_mode_state());
						DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
						    vm_map_entry_t, entry,
						    vm_map_offset_t, entry->vme_start,
						    vm_map_offset_t, entry->vme_end,
						    vm_prot_t, entry->protection,
						    vm_prot_t, entry->max_protection,
						    int, VME_ALIAS(entry));
					}
				} else {
					allow_write = true;
				}

				/*
				 * VM_PROT_COPY: allow this mapping to become
				 * writable, unless it was "permanent".
				 */
				if (allow_write) {
					entry->max_protection |= VM_PROT_WRITE;
				}
			}
			if (flags & VM_FLAGS_RESILIENT_CODESIGN) {
				/* no codesigning -> read-only access */
				entry->max_protection = VM_PROT_READ;
				entry->protection = VM_PROT_READ;
				entry->vme_resilient_codesign = TRUE;
			}
			/* relocate the entry to its address in the target map */
			entry->vme_start += *address;
			entry->vme_end += *address;
			assert(!entry->map_aligned);
			if ((flags & VM_FLAGS_RESILIENT_MEDIA) &&
			    !entry->is_sub_map &&
			    (VME_OBJECT(entry) == VM_OBJECT_NULL ||
			    VME_OBJECT(entry)->internal)) {
				entry->vme_resilient_media = TRUE;
			}
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
			assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
			vm_map_store_entry_link(target_map, insp_entry, entry,
			    vmk_flags);
			insp_entry = entry;
		} else {
			/* allocation failed: drop the entry's reference and free it */
			if (!entry->is_sub_map) {
				vm_object_deallocate(VME_OBJECT(entry));
			} else {
				vm_map_deallocate(VME_SUBMAP(entry));
			}
			vm_map_copy_entry_dispose(entry);
		}
	}

	if (flags & VM_FLAGS_RESILIENT_CODESIGN) {
		*cur_protection = VM_PROT_READ;
		*max_protection = VM_PROT_READ;
	}

	if (result == KERN_SUCCESS) {
		target_map->size += target_size;
		SAVE_HINT_MAP_WRITE(target_map, insp_entry);

	}
	vm_map_unlock(target_map);

	vm_map_zap_dispose(&zap_list);

	if (result == KERN_SUCCESS && target_map->wiring_required) {
		/*
		 * NOTE(review): wiring uses the source-rounded "size", not
		 * "target_size"; on mismatched page sizes these may differ —
		 * confirm intended.
		 */
		result = vm_map_wire_kernel(target_map, *address,
		    *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
		    TRUE);
	}

	/*
	 * If requested, return the address of the data pointed to by the
	 * request, rather than the base of the resulting page.
	 */
	if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) {
		*address += offset_in_mapping;
	}

	if (src_page_mask != target_page_mask) {
		DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx  result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result);
	}
	/* the copy's entries were all unlinked above; discard the shell */
	vm_map_copy_discard(copy_map);
	copy_map = VM_MAP_COPY_NULL;

	return result;
}
18617 
18618 /*
18619  *	Routine:	vm_map_remap_range_allocate
18620  *
18621  *	Description:
18622  *		Allocate a range in the specified virtual address map.
18623  *		returns the address and the map entry just before the allocated
18624  *		range
18625  *
18626  *	Map must be locked.
18627  */
18628 
static kern_return_t
vm_map_remap_range_allocate(
	vm_map_t                map,
	vm_map_address_t        *address,       /* IN/OUT */
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	int                     flags,
	vm_map_kernel_flags_t   vmk_flags,
	__unused vm_tag_t       tag,
	vm_map_entry_t          *map_entry,     /* OUT */
	vm_map_zap_t            zap_list)
{
	vm_map_entry_t  entry;
	vm_map_offset_t start;
	kern_return_t   kr;

	start = *address;

	if (flags & VM_FLAGS_ANYWHERE) {
		/*
		 * The caller lets us pick the address: delegate to
		 * vm_map_locate_space() to find a free range, honoring
		 * the alignment "mask" and any randomization request.
		 */
		if (flags & VM_FLAGS_RANDOM_ADDR) {
			vmk_flags.vmkf_random_address = true;
		}

		if (start) {
			/* override the target range if a hint has been provided */
			vmk_flags.vmkf_range_id = (map == kernel_map ?
			    kmem_addr_get_range(start, size) :
			    VM_MAP_REMAP_RANGE_ID(map, NULL, start, size));
		}

		kr = vm_map_locate_space(map, size, mask, vmk_flags,
		    &start, &entry);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		/* report the chosen start address back to the caller */
		*address = start;
	} else {
		vm_map_entry_t  temp_entry;
		vm_map_offset_t end;

		/*
		 *	Verify that:
		 *		the address doesn't itself violate
		 *		the mask requirement.
		 */

		if ((start & mask) != 0) {
			return KERN_NO_SPACE;
		}


		/*
		 *	...	the address is within bounds
		 */

		end = start + size;

		if ((start < map->min_offset) ||
		    (end > map->max_offset) ||
		    (start >= end)) {
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * If we're asked to overwrite whatever was mapped in that
		 * range, first deallocate that range.
		 */
		if (flags & VM_FLAGS_OVERWRITE) {
			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;

			/*
			 * We use a "zap_list" to avoid having to unlock
			 * the "map" in vm_map_delete(), which would compromise
			 * the atomicity of the "deallocate" and then "remap"
			 * combination.
			 */
			remove_flags |= VM_MAP_REMOVE_NO_YIELD;

			if (vmk_flags.vmkf_overwrite_immutable) {
				/* caller is explicitly allowed to replace immutable mappings */
				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
			}
			if (vmk_flags.vmkf_remap_prot_copy) {
				remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
			}
			kr = vm_map_delete(map, start, end, remove_flags,
			    KMEM_GUARD_NONE, zap_list).kmr_return;
			if (kr != KERN_SUCCESS) {
				/* XXX FBDP restore zap_list? */
				return kr;
			}
		}

		/*
		 *	...	the starting address isn't allocated
		 */

		if (vm_map_lookup_entry(map, start, &temp_entry)) {
			return KERN_NO_SPACE;
		}

		entry = temp_entry;

		/*
		 *	...	the next region doesn't overlap the
		 *		end point.
		 */

		if ((entry->vme_next != vm_map_to_entry(map)) &&
		    (entry->vme_next->vme_start < end)) {
			return KERN_NO_SPACE;
		}
	}
	/* on success, "entry" is the entry just before the allocated range */
	*map_entry = entry;
	return KERN_SUCCESS;
}
18744 
18745 /*
18746  *	vm_map_switch:
18747  *
18748  *	Set the address map for the current thread to the specified map
18749  */
18750 
18751 vm_map_t
vm_map_switch(vm_map_t map)18752 vm_map_switch(
18753 	vm_map_t        map)
18754 {
18755 	int             mycpu;
18756 	thread_t        thread = current_thread();
18757 	vm_map_t        oldmap = thread->map;
18758 
18759 	mp_disable_preemption();
18760 	mycpu = cpu_number();
18761 
18762 	/*
18763 	 *	Deactivate the current map and activate the requested map
18764 	 */
18765 	PMAP_SWITCH_USER(thread, map, mycpu);
18766 
18767 	mp_enable_preemption();
18768 	return oldmap;
18769 }
18770 
18771 
18772 /*
18773  *	Routine:	vm_map_write_user
18774  *
18775  *	Description:
18776  *		Copy out data from a kernel space into space in the
18777  *		destination map. The space must already exist in the
18778  *		destination map.
18779  *		NOTE:  This routine should only be called by threads
18780  *		which can block on a page fault. i.e. kernel mode user
18781  *		threads.
18782  *
18783  */
18784 kern_return_t
vm_map_write_user(vm_map_t map,void * src_p,vm_map_address_t dst_addr,vm_size_t size)18785 vm_map_write_user(
18786 	vm_map_t                map,
18787 	void                    *src_p,
18788 	vm_map_address_t        dst_addr,
18789 	vm_size_t               size)
18790 {
18791 	kern_return_t   kr = KERN_SUCCESS;
18792 
18793 	if (current_map() == map) {
18794 		if (copyout(src_p, dst_addr, size)) {
18795 			kr = KERN_INVALID_ADDRESS;
18796 		}
18797 	} else {
18798 		vm_map_t        oldmap;
18799 
18800 		/* take on the identity of the target map while doing */
18801 		/* the transfer */
18802 
18803 		vm_map_reference(map);
18804 		oldmap = vm_map_switch(map);
18805 		if (copyout(src_p, dst_addr, size)) {
18806 			kr = KERN_INVALID_ADDRESS;
18807 		}
18808 		vm_map_switch(oldmap);
18809 		vm_map_deallocate(map);
18810 	}
18811 	return kr;
18812 }
18813 
18814 /*
18815  *	Routine:	vm_map_read_user
18816  *
18817  *	Description:
18818  *		Copy in data from a user space source map into the
18819  *		kernel map. The space must already exist in the
18820  *		kernel map.
18821  *		NOTE:  This routine should only be called by threads
18822  *		which can block on a page fault. i.e. kernel mode user
18823  *		threads.
18824  *
18825  */
18826 kern_return_t
vm_map_read_user(vm_map_t map,vm_map_address_t src_addr,void * dst_p,vm_size_t size)18827 vm_map_read_user(
18828 	vm_map_t                map,
18829 	vm_map_address_t        src_addr,
18830 	void                    *dst_p,
18831 	vm_size_t               size)
18832 {
18833 	kern_return_t   kr = KERN_SUCCESS;
18834 
18835 	if (current_map() == map) {
18836 		if (copyin(src_addr, dst_p, size)) {
18837 			kr = KERN_INVALID_ADDRESS;
18838 		}
18839 	} else {
18840 		vm_map_t        oldmap;
18841 
18842 		/* take on the identity of the target map while doing */
18843 		/* the transfer */
18844 
18845 		vm_map_reference(map);
18846 		oldmap = vm_map_switch(map);
18847 		if (copyin(src_addr, dst_p, size)) {
18848 			kr = KERN_INVALID_ADDRESS;
18849 		}
18850 		vm_map_switch(oldmap);
18851 		vm_map_deallocate(map);
18852 	}
18853 	return kr;
18854 }
18855 
18856 
18857 /*
18858  *	vm_map_check_protection:
18859  *
18860  *	Assert that the target map allows the specified
18861  *	privilege on the entire address region given.
18862  *	The entire region must be allocated.
18863  */
18864 boolean_t
vm_map_check_protection(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t protection)18865 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
18866     vm_map_offset_t end, vm_prot_t protection)
18867 {
18868 	vm_map_entry_t entry;
18869 	vm_map_entry_t tmp_entry;
18870 
18871 	vm_map_lock(map);
18872 
18873 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
18874 		vm_map_unlock(map);
18875 		return FALSE;
18876 	}
18877 
18878 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
18879 		vm_map_unlock(map);
18880 		return FALSE;
18881 	}
18882 
18883 	entry = tmp_entry;
18884 
18885 	while (start < end) {
18886 		if (entry == vm_map_to_entry(map)) {
18887 			vm_map_unlock(map);
18888 			return FALSE;
18889 		}
18890 
18891 		/*
18892 		 *	No holes allowed!
18893 		 */
18894 
18895 		if (start < entry->vme_start) {
18896 			vm_map_unlock(map);
18897 			return FALSE;
18898 		}
18899 
18900 		/*
18901 		 * Check protection associated with entry.
18902 		 */
18903 
18904 		if ((entry->protection & protection) != protection) {
18905 			vm_map_unlock(map);
18906 			return FALSE;
18907 		}
18908 
18909 		/* go to next entry */
18910 
18911 		start = entry->vme_end;
18912 		entry = entry->vme_next;
18913 	}
18914 	vm_map_unlock(map);
18915 	return TRUE;
18916 }
18917 
/*
 *	Routine:	vm_map_purgable_control
 *
 *	Description:
 *		Apply a purgeability operation ("control") to the VM object
 *		mapped at "address" in "map":  get/set the purgeable state
 *		(through "state", IN/OUT) or purge all volatile objects.
 *		The mapping must cover a writable, already-purgeable object
 *		(except for VM_PURGABLE_GET_STATE, which only needs read
 *		access).
 */
kern_return_t
vm_map_purgable_control(
	vm_map_t                map,
	vm_map_offset_t         address,
	vm_purgable_t           control,
	int                     *state)
{
	vm_map_entry_t          entry;
	vm_object_t             object;
	kern_return_t           kr;
	boolean_t               was_nonvolatile;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlaying object.  Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control != VM_PURGABLE_SET_STATE &&
	    control != VM_PURGABLE_GET_STATE &&
	    control != VM_PURGABLE_PURGE_ALL &&
	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control == VM_PURGABLE_PURGE_ALL) {
		/* purge every volatile purgeable object; no lookup needed */
		vm_purgeable_object_purge_all();
		return KERN_SUCCESS;
	}

	/* for "set" operations, validate the requested state bits */
	if ((control == VM_PURGABLE_SET_STATE ||
	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->protection & VM_PROT_WRITE) == 0 &&
	    control != VM_PURGABLE_GET_STATE) {
		/*
		 * Can't apply purgable controls to something you can't write.
		 */
		vm_map_unlock_read(map);
		return KERN_PROTECTION_FAILURE;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL ||
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Object must already be present and be purgeable.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	vm_object_lock(object);

#if 00
	if (VME_OFFSET(entry) != 0 ||
	    entry->vme_end - entry->vme_start != object->vo_size) {
		/*
		 * Can only apply purgable controls to the whole (existing)
		 * object at once.
		 */
		vm_map_unlock_read(map);
		vm_object_unlock(object);
		return KERN_INVALID_ARGUMENT;
	}
#endif

	assert(!entry->is_sub_map);
	assert(!entry->use_pmap); /* purgeable has its own accounting */

	/*
	 * The object lock is sufficient from here on; drop the map lock
	 * before the (potentially slow) purgeable state change.
	 */
	vm_map_unlock_read(map);

	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);

	kr = vm_object_purgable_control(object, control, state);

	if (was_nonvolatile &&
	    object->purgable != VM_PURGABLE_NONVOLATILE &&
	    map->pmap == kernel_pmap) {
#if DEBUG
		/* record who made this object volatile, for debugging */
		object->vo_purgeable_volatilizer = kernel_task;
#endif /* DEBUG */
	}

	vm_object_unlock(object);

	return kr;
}
19022 
/*
 *	Routine:	vm_map_footprint_query_page_info
 *
 *	Description:
 *		Compute the footprint "disposition" (VM_PAGE_QUERY_PAGE_*
 *		bits) of the page at "curr_s_offset" within "map_entry",
 *		by querying the live pmap and then applying the various
 *		"alternate accounting" rules (no-footprint objects,
 *		purgeable objects owned by this task, non-purgeable IOKit
 *		mappings).  The result is returned through "disposition_p";
 *		0 means the page does not contribute to the footprint.
 *
 *	Map must be locked (asserted below); must not be a corpse map.
 */
void
vm_map_footprint_query_page_info(
	vm_map_t        map,
	vm_map_entry_t  map_entry,
	vm_map_offset_t curr_s_offset,
	int             *disposition_p)
{
	int             pmap_disp;
	vm_object_t     object = VM_OBJECT_NULL;
	int             disposition;
	int             effective_page_size;

	vm_map_lock_assert_held(map);
	assert(!map->has_corpse_footprint);
	assert(curr_s_offset >= map_entry->vme_start);
	assert(curr_s_offset < map_entry->vme_end);

	if (map_entry->is_sub_map) {
		if (!map_entry->use_pmap) {
			/* nested pmap: no footprint */
			*disposition_p = 0;
			return;
		}
	} else {
		object = VME_OBJECT(map_entry);
		if (object == VM_OBJECT_NULL) {
			/* nothing mapped here: no need to ask */
			*disposition_p = 0;
			return;
		}
	}

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

	pmap_disp = 0;

	/*
	 * Query the pmap.
	 */
	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);

	/*
	 * Compute this page's disposition.
	 */
	disposition = 0;

	/* deal with "alternate accounting" first */
	if (!map_entry->is_sub_map &&
	    object->vo_no_footprint) {
		/* does not count in footprint */
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
	} else if (!map_entry->is_sub_map &&
	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
	    (object->purgable == VM_PURGABLE_DENY &&
	    object->vo_ledger_tag)) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    (object->resident_page_count +
		    vm_compressor_pager_get_count(object->pager)))) {
			/*
			 * Non-volatile purgeable object owned
			 * by this task: report the first
			 * "#resident + #compressed" pages as
			 * "resident" (to show that they
			 * contribute to the footprint) but not
			 * "dirty" (to avoid double-counting
			 * with the fake "non-volatile" region
			 * we'll report at the end of the
			 * address space to account for all
			 * (mapped or not) non-volatile memory
			 * owned by this task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    object->wired_page_count)) {
			/*
			 * Volatile|empty purgeable object owned
			 * by this task: report the first
			 * "#wired" pages as "resident" (to
			 * show that they contribute to the
			 * footprint) but not "dirty" (to avoid
			 * double-counting with the fake
			 * "non-volatile" region we'll report
			 * at the end of the address space to
			 * account for all (mapped or not)
			 * non-volatile memory owned by this
			 * task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    map_entry->iokit_acct &&
	    object->internal &&
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Non-purgeable IOKit memory: phys_footprint
		 * includes the entire virtual mapping.
		 */
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
		/* alternate accounting */
#if __arm64__ && (DEVELOPMENT || DEBUG)
		if (map->pmap->footprint_was_suspended) {
			/*
			 * The assertion below can fail if dyld
			 * suspended footprint accounting
			 * while doing some adjustments to
			 * this page;  the mapping would say
			 * "use pmap accounting" but the page
			 * would be marked "alternate
			 * accounting".
			 */
		} else
#endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
		{
			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		}
		disposition = 0;
	} else {
		/* regular pmap accounting: translate the pmap's answer */
		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
			disposition |= VM_PAGE_QUERY_PAGE_REF;
			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
			} else {
				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
			}
			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
			}
		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
		}
	}

	*disposition_p = disposition;
}
19180 
19181 kern_return_t
vm_map_page_query_internal(vm_map_t target_map,vm_map_offset_t offset,int * disposition,int * ref_count)19182 vm_map_page_query_internal(
19183 	vm_map_t        target_map,
19184 	vm_map_offset_t offset,
19185 	int             *disposition,
19186 	int             *ref_count)
19187 {
19188 	kern_return_t                   kr;
19189 	vm_page_info_basic_data_t       info;
19190 	mach_msg_type_number_t          count;
19191 
19192 	count = VM_PAGE_INFO_BASIC_COUNT;
19193 	kr = vm_map_page_info(target_map,
19194 	    offset,
19195 	    VM_PAGE_INFO_BASIC,
19196 	    (vm_page_info_t) &info,
19197 	    &count);
19198 	if (kr == KERN_SUCCESS) {
19199 		*disposition = info.disposition;
19200 		*ref_count = info.ref_count;
19201 	} else {
19202 		*disposition = 0;
19203 		*ref_count = 0;
19204 	}
19205 
19206 	return kr;
19207 }
19208 
19209 kern_return_t
vm_map_page_info(vm_map_t map,vm_map_offset_t offset,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)19210 vm_map_page_info(
19211 	vm_map_t                map,
19212 	vm_map_offset_t         offset,
19213 	vm_page_info_flavor_t   flavor,
19214 	vm_page_info_t          info,
19215 	mach_msg_type_number_t  *count)
19216 {
19217 	return vm_map_page_range_info_internal(map,
19218 	           offset, /* start of range */
19219 	           (offset + 1), /* this will get rounded in the call to the page boundary */
19220 	           (int)-1, /* effective_page_shift: unspecified */
19221 	           flavor,
19222 	           info,
19223 	           count);
19224 }
19225 
19226 kern_return_t
vm_map_page_range_info_internal(vm_map_t map,vm_map_offset_t start_offset,vm_map_offset_t end_offset,int effective_page_shift,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)19227 vm_map_page_range_info_internal(
19228 	vm_map_t                map,
19229 	vm_map_offset_t         start_offset,
19230 	vm_map_offset_t         end_offset,
19231 	int                     effective_page_shift,
19232 	vm_page_info_flavor_t   flavor,
19233 	vm_page_info_t          info,
19234 	mach_msg_type_number_t  *count)
19235 {
19236 	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
19237 	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
19238 	vm_page_t               m = VM_PAGE_NULL;
19239 	kern_return_t           retval = KERN_SUCCESS;
19240 	int                     disposition = 0;
19241 	int                     ref_count = 0;
19242 	int                     depth = 0, info_idx = 0;
19243 	vm_page_info_basic_t    basic_info = 0;
19244 	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
19245 	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
19246 	boolean_t               do_region_footprint;
19247 	ledger_amount_t         ledger_resident, ledger_compressed;
19248 	int                     effective_page_size;
19249 	vm_map_offset_t         effective_page_mask;
19250 
19251 	switch (flavor) {
19252 	case VM_PAGE_INFO_BASIC:
19253 		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
19254 			/*
19255 			 * The "vm_page_info_basic_data" structure was not
19256 			 * properly padded, so allow the size to be off by
19257 			 * one to maintain backwards binary compatibility...
19258 			 */
19259 			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
19260 				return KERN_INVALID_ARGUMENT;
19261 			}
19262 		}
19263 		break;
19264 	default:
19265 		return KERN_INVALID_ARGUMENT;
19266 	}
19267 
19268 	if (effective_page_shift == -1) {
19269 		effective_page_shift = vm_self_region_page_shift_safely(map);
19270 		if (effective_page_shift == -1) {
19271 			return KERN_INVALID_ARGUMENT;
19272 		}
19273 	}
19274 	effective_page_size = (1 << effective_page_shift);
19275 	effective_page_mask = effective_page_size - 1;
19276 
19277 	do_region_footprint = task_self_region_footprint();
19278 	disposition = 0;
19279 	ref_count = 0;
19280 	depth = 0;
19281 	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
19282 	retval = KERN_SUCCESS;
19283 
19284 	offset_in_page = start_offset & effective_page_mask;
19285 	start = vm_map_trunc_page(start_offset, effective_page_mask);
19286 	end = vm_map_round_page(end_offset, effective_page_mask);
19287 
19288 	if (end < start) {
19289 		return KERN_INVALID_ARGUMENT;
19290 	}
19291 
19292 	assert((end - start) <= MAX_PAGE_RANGE_QUERY);
19293 
19294 	vm_map_lock_read(map);
19295 
19296 	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
19297 
19298 	for (curr_s_offset = start; curr_s_offset < end;) {
19299 		/*
19300 		 * New lookup needs reset of these variables.
19301 		 */
19302 		curr_object = object = VM_OBJECT_NULL;
19303 		offset_in_object = 0;
19304 		ref_count = 0;
19305 		depth = 0;
19306 
19307 		if (do_region_footprint &&
19308 		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
19309 			/*
19310 			 * Request for "footprint" info about a page beyond
19311 			 * the end of address space: this must be for
19312 			 * the fake region vm_map_region_recurse_64()
19313 			 * reported to account for non-volatile purgeable
19314 			 * memory owned by this task.
19315 			 */
19316 			disposition = 0;
19317 
19318 			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
19319 			    (unsigned) ledger_compressed) {
19320 				/*
19321 				 * We haven't reported all the "non-volatile
19322 				 * compressed" pages yet, so report this fake
19323 				 * page as "compressed".
19324 				 */
19325 				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19326 			} else {
19327 				/*
19328 				 * We've reported all the non-volatile
19329 				 * compressed page but not all the non-volatile
19330 				 * pages , so report this fake page as
19331 				 * "resident dirty".
19332 				 */
19333 				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19334 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19335 				disposition |= VM_PAGE_QUERY_PAGE_REF;
19336 			}
19337 			switch (flavor) {
19338 			case VM_PAGE_INFO_BASIC:
19339 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19340 				basic_info->disposition = disposition;
19341 				basic_info->ref_count = 1;
19342 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
19343 				basic_info->offset = 0;
19344 				basic_info->depth = 0;
19345 
19346 				info_idx++;
19347 				break;
19348 			}
19349 			curr_s_offset += effective_page_size;
19350 			continue;
19351 		}
19352 
19353 		/*
19354 		 * First, find the map entry covering "curr_s_offset", going down
19355 		 * submaps if necessary.
19356 		 */
19357 		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
19358 			/* no entry -> no object -> no page */
19359 
19360 			if (curr_s_offset < vm_map_min(map)) {
19361 				/*
19362 				 * Illegal address that falls below map min.
19363 				 */
19364 				curr_e_offset = MIN(end, vm_map_min(map));
19365 			} else if (curr_s_offset >= vm_map_max(map)) {
19366 				/*
19367 				 * Illegal address that falls on/after map max.
19368 				 */
19369 				curr_e_offset = end;
19370 			} else if (map_entry == vm_map_to_entry(map)) {
19371 				/*
19372 				 * Hit a hole.
19373 				 */
19374 				if (map_entry->vme_next == vm_map_to_entry(map)) {
19375 					/*
19376 					 * Empty map.
19377 					 */
19378 					curr_e_offset = MIN(map->max_offset, end);
19379 				} else {
19380 					/*
19381 					 * Hole at start of the map.
19382 					 */
19383 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
19384 				}
19385 			} else {
19386 				if (map_entry->vme_next == vm_map_to_entry(map)) {
19387 					/*
19388 					 * Hole at the end of the map.
19389 					 */
19390 					curr_e_offset = MIN(map->max_offset, end);
19391 				} else {
19392 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
19393 				}
19394 			}
19395 
19396 			assert(curr_e_offset >= curr_s_offset);
19397 
19398 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
19399 
19400 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19401 
19402 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
19403 
19404 			curr_s_offset = curr_e_offset;
19405 
19406 			info_idx += num_pages;
19407 
19408 			continue;
19409 		}
19410 
19411 		/* compute offset from this map entry's start */
19412 		offset_in_object = curr_s_offset - map_entry->vme_start;
19413 
19414 		/* compute offset into this map entry's object (or submap) */
19415 		offset_in_object += VME_OFFSET(map_entry);
19416 
19417 		if (map_entry->is_sub_map) {
19418 			vm_map_t sub_map = VM_MAP_NULL;
19419 			vm_page_info_t submap_info = 0;
19420 			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
19421 
19422 			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
19423 
19424 			submap_s_offset = offset_in_object;
19425 			submap_e_offset = submap_s_offset + range_len;
19426 
19427 			sub_map = VME_SUBMAP(map_entry);
19428 
19429 			vm_map_reference(sub_map);
19430 			vm_map_unlock_read(map);
19431 
19432 			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19433 
19434 			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
19435 			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
19436 
19437 			retval = vm_map_page_range_info_internal(sub_map,
19438 			    submap_s_offset,
19439 			    submap_e_offset,
19440 			    effective_page_shift,
19441 			    VM_PAGE_INFO_BASIC,
19442 			    (vm_page_info_t) submap_info,
19443 			    count);
19444 
19445 			assert(retval == KERN_SUCCESS);
19446 
19447 			vm_map_lock_read(map);
19448 			vm_map_deallocate(sub_map);
19449 
19450 			/* Move the "info" index by the number of pages we inspected.*/
19451 			info_idx += range_len >> effective_page_shift;
19452 
19453 			/* Move our current offset by the size of the range we inspected.*/
19454 			curr_s_offset += range_len;
19455 
19456 			continue;
19457 		}
19458 
19459 		object = VME_OBJECT(map_entry);
19460 
19461 		if (object == VM_OBJECT_NULL) {
19462 			/*
19463 			 * We don't have an object here and, hence,
19464 			 * no pages to inspect. We'll fill up the
19465 			 * info structure appropriately.
19466 			 */
19467 
19468 			curr_e_offset = MIN(map_entry->vme_end, end);
19469 
19470 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
19471 
19472 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19473 
19474 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
19475 
19476 			curr_s_offset = curr_e_offset;
19477 
19478 			info_idx += num_pages;
19479 
19480 			continue;
19481 		}
19482 
19483 		if (do_region_footprint) {
19484 			disposition = 0;
19485 			if (map->has_corpse_footprint) {
19486 				/*
19487 				 * Query the page info data we saved
19488 				 * while forking the corpse.
19489 				 */
19490 				vm_map_corpse_footprint_query_page_info(
19491 					map,
19492 					curr_s_offset,
19493 					&disposition);
19494 			} else {
19495 				/*
19496 				 * Query the live pmap for footprint info
19497 				 * about this page.
19498 				 */
19499 				vm_map_footprint_query_page_info(
19500 					map,
19501 					map_entry,
19502 					curr_s_offset,
19503 					&disposition);
19504 			}
19505 			switch (flavor) {
19506 			case VM_PAGE_INFO_BASIC:
19507 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19508 				basic_info->disposition = disposition;
19509 				basic_info->ref_count = 1;
19510 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
19511 				basic_info->offset = 0;
19512 				basic_info->depth = 0;
19513 
19514 				info_idx++;
19515 				break;
19516 			}
19517 			curr_s_offset += effective_page_size;
19518 			continue;
19519 		}
19520 
19521 		vm_object_reference(object);
19522 		/*
19523 		 * Shared mode -- so we can allow other readers
19524 		 * to grab the lock too.
19525 		 */
19526 		vm_object_lock_shared(object);
19527 
19528 		curr_e_offset = MIN(map_entry->vme_end, end);
19529 
19530 		vm_map_unlock_read(map);
19531 
19532 		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
19533 
19534 		curr_object = object;
19535 
19536 		for (; curr_s_offset < curr_e_offset;) {
19537 			if (object == curr_object) {
19538 				ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
19539 			} else {
19540 				ref_count = curr_object->ref_count;
19541 			}
19542 
19543 			curr_offset_in_object = offset_in_object;
19544 
19545 			for (;;) {
19546 				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
19547 
19548 				if (m != VM_PAGE_NULL) {
19549 					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19550 					break;
19551 				} else {
19552 					if (curr_object->internal &&
19553 					    curr_object->alive &&
19554 					    !curr_object->terminating &&
19555 					    curr_object->pager_ready) {
19556 						if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object))
19557 						    == VM_EXTERNAL_STATE_EXISTS) {
19558 							/* the pager has that page */
19559 							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19560 							break;
19561 						}
19562 					}
19563 
19564 					/*
19565 					 * Go down the VM object shadow chain until we find the page
19566 					 * we're looking for.
19567 					 */
19568 
19569 					if (curr_object->shadow != VM_OBJECT_NULL) {
19570 						vm_object_t shadow = VM_OBJECT_NULL;
19571 
19572 						curr_offset_in_object += curr_object->vo_shadow_offset;
19573 						shadow = curr_object->shadow;
19574 
19575 						vm_object_lock_shared(shadow);
19576 						vm_object_unlock(curr_object);
19577 
19578 						curr_object = shadow;
19579 						depth++;
19580 						continue;
19581 					} else {
19582 						break;
19583 					}
19584 				}
19585 			}
19586 
19587 			/* The ref_count is not strictly accurate, it measures the number   */
19588 			/* of entities holding a ref on the object, they may not be mapping */
19589 			/* the object or may not be mapping the section holding the         */
19590 			/* target page but its still a ball park number and though an over- */
19591 			/* count, it picks up the copy-on-write cases                       */
19592 
19593 			/* We could also get a picture of page sharing from pmap_attributes */
19594 			/* but this would under count as only faulted-in mappings would     */
19595 			/* show up.							    */
19596 
19597 			if ((curr_object == object) && curr_object->shadow) {
19598 				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
19599 			}
19600 
19601 			if (!curr_object->internal) {
19602 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
19603 			}
19604 
19605 			if (m != VM_PAGE_NULL) {
19606 				if (m->vmp_fictitious) {
19607 					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
19608 				} else {
19609 					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
19610 						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19611 					}
19612 
19613 					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
19614 						disposition |= VM_PAGE_QUERY_PAGE_REF;
19615 					}
19616 
19617 					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
19618 						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
19619 					}
19620 
19621 					/*
19622 					 * XXX TODO4K:
19623 					 * when this routine deals with 4k
19624 					 * pages, check the appropriate CS bit
19625 					 * here.
19626 					 */
19627 					if (m->vmp_cs_validated) {
19628 						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
19629 					}
19630 					if (m->vmp_cs_tainted) {
19631 						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
19632 					}
19633 					if (m->vmp_cs_nx) {
19634 						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
19635 					}
19636 					if (m->vmp_reusable || curr_object->all_reusable) {
19637 						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
19638 					}
19639 				}
19640 			}
19641 
19642 			switch (flavor) {
19643 			case VM_PAGE_INFO_BASIC:
19644 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19645 				basic_info->disposition = disposition;
19646 				basic_info->ref_count = ref_count;
19647 				basic_info->object_id = (vm_object_id_t) (uintptr_t)
19648 				    VM_KERNEL_ADDRPERM(curr_object);
19649 				basic_info->offset =
19650 				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
19651 				basic_info->depth = depth;
19652 
19653 				info_idx++;
19654 				break;
19655 			}
19656 
19657 			disposition = 0;
19658 			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
19659 
19660 			/*
19661 			 * Move to next offset in the range and in our object.
19662 			 */
19663 			curr_s_offset += effective_page_size;
19664 			offset_in_object += effective_page_size;
19665 			curr_offset_in_object = offset_in_object;
19666 
19667 			if (curr_object != object) {
19668 				vm_object_unlock(curr_object);
19669 
19670 				curr_object = object;
19671 
19672 				vm_object_lock_shared(curr_object);
19673 			} else {
19674 				vm_object_lock_yield_shared(curr_object);
19675 			}
19676 		}
19677 
19678 		vm_object_unlock(curr_object);
19679 		vm_object_deallocate(curr_object);
19680 
19681 		vm_map_lock_read(map);
19682 	}
19683 
19684 	vm_map_unlock_read(map);
19685 	return retval;
19686 }
19687 
19688 /*
19689  *	vm_map_msync
19690  *
19691  *	Synchronises the memory range specified with its backing store
19692  *	image by either flushing or cleaning the contents to the appropriate
19693  *	memory manager engaging in a memory object synchronize dialog with
19694  *	the manager.  The client doesn't return until the manager issues
19695  *	m_o_s_completed message.  MIG Magically converts user task parameter
19696  *	to the task's address map.
19697  *
19698  *	interpretation of sync_flags
19699  *	VM_SYNC_INVALIDATE	- discard pages, only return precious
19700  *				  pages to manager.
19701  *
19702  *	VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
19703  *				- discard pages, write dirty or precious
19704  *				  pages back to memory manager.
19705  *
19706  *	VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
19707  *				- write dirty or precious pages back to
19708  *				  the memory manager.
19709  *
19710  *	VM_SYNC_CONTIGUOUS	- does everything normally, but if there
19711  *				  is a hole in the region, and we would
19712  *				  have returned KERN_SUCCESS, return
19713  *				  KERN_INVALID_ADDRESS instead.
19714  *
19715  *	NOTE
19716  *	The memory object attributes have not yet been implemented, this
19717  *	function will have to deal with the invalidate attribute
19718  *
19719  *	RETURNS
19720  *	KERN_INVALID_TASK		Bad task parameter
19721  *	KERN_INVALID_ARGUMENT		both sync and async were specified.
19722  *	KERN_SUCCESS			The usual.
19723  *	KERN_INVALID_ADDRESS		There was a hole in the region.
19724  */
19725 
19726 kern_return_t
vm_map_msync(vm_map_t map,vm_map_address_t address,vm_map_size_t size,vm_sync_t sync_flags)19727 vm_map_msync(
19728 	vm_map_t                map,
19729 	vm_map_address_t        address,
19730 	vm_map_size_t           size,
19731 	vm_sync_t               sync_flags)
19732 {
19733 	vm_map_entry_t          entry;
19734 	vm_map_size_t           amount_left;
19735 	vm_object_offset_t      offset;
19736 	vm_object_offset_t      start_offset, end_offset;
19737 	boolean_t               do_sync_req;
19738 	boolean_t               had_hole = FALSE;
19739 	vm_map_offset_t         pmap_offset;
19740 
19741 	if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
19742 	    (sync_flags & VM_SYNC_SYNCHRONOUS)) {
19743 		return KERN_INVALID_ARGUMENT;
19744 	}
19745 
19746 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19747 		DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
19748 	}
19749 
19750 	/*
19751 	 * align address and size on page boundaries
19752 	 */
19753 	size = (vm_map_round_page(address + size,
19754 	    VM_MAP_PAGE_MASK(map)) -
19755 	    vm_map_trunc_page(address,
19756 	    VM_MAP_PAGE_MASK(map)));
19757 	address = vm_map_trunc_page(address,
19758 	    VM_MAP_PAGE_MASK(map));
19759 
19760 	if (map == VM_MAP_NULL) {
19761 		return KERN_INVALID_TASK;
19762 	}
19763 
19764 	if (size == 0) {
19765 		return KERN_SUCCESS;
19766 	}
19767 
19768 	amount_left = size;
19769 
19770 	while (amount_left > 0) {
19771 		vm_object_size_t        flush_size;
19772 		vm_object_t             object;
19773 
19774 		vm_map_lock(map);
19775 		if (!vm_map_lookup_entry(map,
19776 		    address,
19777 		    &entry)) {
19778 			vm_map_size_t   skip;
19779 
19780 			/*
19781 			 * hole in the address map.
19782 			 */
19783 			had_hole = TRUE;
19784 
19785 			if (sync_flags & VM_SYNC_KILLPAGES) {
19786 				/*
19787 				 * For VM_SYNC_KILLPAGES, there should be
19788 				 * no holes in the range, since we couldn't
19789 				 * prevent someone else from allocating in
19790 				 * that hole and we wouldn't want to "kill"
19791 				 * their pages.
19792 				 */
19793 				vm_map_unlock(map);
19794 				break;
19795 			}
19796 
19797 			/*
19798 			 * Check for empty map.
19799 			 */
19800 			if (entry == vm_map_to_entry(map) &&
19801 			    entry->vme_next == entry) {
19802 				vm_map_unlock(map);
19803 				break;
19804 			}
19805 			/*
19806 			 * Check that we don't wrap and that
19807 			 * we have at least one real map entry.
19808 			 */
19809 			if ((map->hdr.nentries == 0) ||
19810 			    (entry->vme_next->vme_start < address)) {
19811 				vm_map_unlock(map);
19812 				break;
19813 			}
19814 			/*
19815 			 * Move up to the next entry if needed
19816 			 */
19817 			skip = (entry->vme_next->vme_start - address);
19818 			if (skip >= amount_left) {
19819 				amount_left = 0;
19820 			} else {
19821 				amount_left -= skip;
19822 			}
19823 			address = entry->vme_next->vme_start;
19824 			vm_map_unlock(map);
19825 			continue;
19826 		}
19827 
19828 		offset = address - entry->vme_start;
19829 		pmap_offset = address;
19830 
19831 		/*
19832 		 * do we have more to flush than is contained in this
19833 		 * entry ?
19834 		 */
19835 		if (amount_left + entry->vme_start + offset > entry->vme_end) {
19836 			flush_size = entry->vme_end -
19837 			    (entry->vme_start + offset);
19838 		} else {
19839 			flush_size = amount_left;
19840 		}
19841 		amount_left -= flush_size;
19842 		address += flush_size;
19843 
19844 		if (entry->is_sub_map == TRUE) {
19845 			vm_map_t        local_map;
19846 			vm_map_offset_t local_offset;
19847 
19848 			local_map = VME_SUBMAP(entry);
19849 			local_offset = VME_OFFSET(entry);
19850 			vm_map_reference(local_map);
19851 			vm_map_unlock(map);
19852 			if (vm_map_msync(
19853 				    local_map,
19854 				    local_offset,
19855 				    flush_size,
19856 				    sync_flags) == KERN_INVALID_ADDRESS) {
19857 				had_hole = TRUE;
19858 			}
19859 			vm_map_deallocate(local_map);
19860 			continue;
19861 		}
19862 		object = VME_OBJECT(entry);
19863 
19864 		/*
19865 		 * We can't sync this object if the object has not been
19866 		 * created yet
19867 		 */
19868 		if (object == VM_OBJECT_NULL) {
19869 			vm_map_unlock(map);
19870 			continue;
19871 		}
19872 		offset += VME_OFFSET(entry);
19873 
19874 		vm_object_lock(object);
19875 
19876 		if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
19877 			int kill_pages = 0;
19878 			boolean_t reusable_pages = FALSE;
19879 
19880 			if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19881 				/*
19882 				 * This is a destructive operation and so we
19883 				 * err on the side of limiting the range of
19884 				 * the operation.
19885 				 */
19886 				start_offset = vm_object_round_page(offset);
19887 				end_offset = vm_object_trunc_page(offset + flush_size);
19888 
19889 				if (end_offset <= start_offset) {
19890 					vm_object_unlock(object);
19891 					vm_map_unlock(map);
19892 					continue;
19893 				}
19894 
19895 				pmap_offset += start_offset - offset;
19896 			} else {
19897 				start_offset = offset;
19898 				end_offset = offset + flush_size;
19899 			}
19900 
19901 			if (sync_flags & VM_SYNC_KILLPAGES) {
19902 				if (((object->ref_count == 1) ||
19903 				    ((object->copy_strategy !=
19904 				    MEMORY_OBJECT_COPY_SYMMETRIC) &&
19905 				    (object->copy == VM_OBJECT_NULL))) &&
19906 				    (object->shadow == VM_OBJECT_NULL)) {
19907 					if (object->ref_count != 1) {
19908 						vm_page_stats_reusable.free_shared++;
19909 					}
19910 					kill_pages = 1;
19911 				} else {
19912 					kill_pages = -1;
19913 				}
19914 			}
19915 			if (kill_pages != -1) {
19916 				vm_object_deactivate_pages(
19917 					object,
19918 					start_offset,
19919 					(vm_object_size_t) (end_offset - start_offset),
19920 					kill_pages,
19921 					reusable_pages,
19922 					map->pmap,
19923 					pmap_offset);
19924 			}
19925 			vm_object_unlock(object);
19926 			vm_map_unlock(map);
19927 			continue;
19928 		}
19929 		/*
19930 		 * We can't sync this object if there isn't a pager.
19931 		 * Don't bother to sync internal objects, since there can't
19932 		 * be any "permanent" storage for these objects anyway.
19933 		 */
19934 		if ((object->pager == MEMORY_OBJECT_NULL) ||
19935 		    (object->internal) || (object->private)) {
19936 			vm_object_unlock(object);
19937 			vm_map_unlock(map);
19938 			continue;
19939 		}
19940 		/*
19941 		 * keep reference on the object until syncing is done
19942 		 */
19943 		vm_object_reference_locked(object);
19944 		vm_object_unlock(object);
19945 
19946 		vm_map_unlock(map);
19947 
19948 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19949 			start_offset = vm_object_trunc_page(offset);
19950 			end_offset = vm_object_round_page(offset + flush_size);
19951 		} else {
19952 			start_offset = offset;
19953 			end_offset = offset + flush_size;
19954 		}
19955 
19956 		do_sync_req = vm_object_sync(object,
19957 		    start_offset,
19958 		    (end_offset - start_offset),
19959 		    sync_flags & VM_SYNC_INVALIDATE,
19960 		    ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
19961 		    (sync_flags & VM_SYNC_ASYNCHRONOUS)),
19962 		    sync_flags & VM_SYNC_SYNCHRONOUS);
19963 
19964 		if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
19965 			/*
19966 			 * clear out the clustering and read-ahead hints
19967 			 */
19968 			vm_object_lock(object);
19969 
19970 			object->pages_created = 0;
19971 			object->pages_used = 0;
19972 			object->sequential = 0;
19973 			object->last_alloc = 0;
19974 
19975 			vm_object_unlock(object);
19976 		}
19977 		vm_object_deallocate(object);
19978 	} /* while */
19979 
19980 	/* for proper msync() behaviour */
19981 	if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
19982 		return KERN_INVALID_ADDRESS;
19983 	}
19984 
19985 	return KERN_SUCCESS;
19986 }/* vm_msync */
19987 
/*
 * Back a "blank" named entry with a VM object by wrapping the object
 * in a single-entry vm_map_copy entry list covering [offset, offset+size).
 * The named entry must not already be backed by a submap, copy or object.
 */
void
vm_named_entry_associate_vm_object(
	vm_named_entry_t        named_entry,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_object_size_t        size,
	vm_prot_t               prot)
{
	vm_map_copy_t copy;
	vm_map_entry_t copy_entry;

	/* the named entry must be "blank": not backed by anything yet */
	assert(!named_entry->is_sub_map);
	assert(!named_entry->is_copy);
	assert(!named_entry->is_object);
	assert(!named_entry->internal);
	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);

	/* build an entry-list copy describing the requested range */
	copy = vm_map_copy_allocate();
	copy->type = VM_MAP_COPY_ENTRY_LIST;
	copy->offset = offset;
	copy->size = size;
	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
	vm_map_store_init(&copy->cpy_hdr);

	/* one entry spanning the page-aligned version of [offset, offset+size) */
	copy_entry = vm_map_copy_entry_create(copy);
	copy_entry->protection = prot;
	copy_entry->max_protection = prot;
	copy_entry->use_pmap = TRUE;
	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
	VME_OBJECT_SET(copy_entry, object, false, 0);
	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);

	/* publish the backing and propagate the object's "internal" attribute */
	named_entry->backing.copy = copy;
	named_entry->is_object = TRUE;
	if (object->internal) {
		named_entry->internal = TRUE;
	}

	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
	    named_entry, copy, object, offset, size, prot);
}
20031 
20032 vm_object_t
vm_named_entry_to_vm_object(vm_named_entry_t named_entry)20033 vm_named_entry_to_vm_object(
20034 	vm_named_entry_t named_entry)
20035 {
20036 	vm_map_copy_t   copy;
20037 	vm_map_entry_t  copy_entry;
20038 	vm_object_t     object;
20039 
20040 	assert(!named_entry->is_sub_map);
20041 	assert(!named_entry->is_copy);
20042 	assert(named_entry->is_object);
20043 	copy = named_entry->backing.copy;
20044 	assert(copy != VM_MAP_COPY_NULL);
20045 	/*
20046 	 * Assert that the vm_map_copy is coming from the right
20047 	 * zone and hasn't been forged
20048 	 */
20049 	vm_map_copy_require(copy);
20050 	assert(copy->cpy_hdr.nentries == 1);
20051 	copy_entry = vm_map_copy_first_entry(copy);
20052 	object = VME_OBJECT(copy_entry);
20053 
20054 	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
20055 
20056 	return object;
20057 }
20058 
20059 /*
20060  *	Routine:	convert_port_entry_to_map
20061  *	Purpose:
20062  *		Convert from a port specifying an entry or a task
20063  *		to a map. Doesn't consume the port ref; produces a map ref,
20064  *		which may be null.  Unlike convert_port_to_map, the
20065  *		port may be task or a named entry backed.
20066  *	Conditions:
20067  *		Nothing locked.
20068  */
20069 
20070 vm_map_t
convert_port_entry_to_map(ipc_port_t port)20071 convert_port_entry_to_map(
20072 	ipc_port_t      port)
20073 {
20074 	vm_map_t map = VM_MAP_NULL;
20075 	vm_named_entry_t named_entry;
20076 
20077 	if (!IP_VALID(port)) {
20078 		return VM_MAP_NULL;
20079 	}
20080 
20081 	if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
20082 		return convert_port_to_map(port);
20083 	}
20084 
20085 	named_entry = mach_memory_entry_from_port(port);
20086 
20087 	if ((named_entry->is_sub_map) &&
20088 	    (named_entry->protection & VM_PROT_WRITE)) {
20089 		map = named_entry->backing.map;
20090 		if (map->pmap != PMAP_NULL) {
20091 			if (map->pmap == kernel_pmap) {
20092 				panic("userspace has access "
20093 				    "to a kernel map %p", map);
20094 			}
20095 			pmap_require(map->pmap);
20096 		}
20097 		vm_map_reference(map);
20098 	}
20099 
20100 	return map;
20101 }
20102 
20103 /*
20104  * Export routines to other components for the things we access locally through
20105  * macros.
20106  */
20107 #undef current_map
vm_map_t
current_map(void)
{
	/* out-of-line export of the current_map_fast() macro (see #undef above) */
	return current_map_fast();
}
20113 
20114 /*
20115  *	vm_map_reference:
20116  *
20117  *	Takes a reference on the specified map.
20118  */
void
vm_map_reference(
	vm_map_t        map)
{
	/* a NULL map is tolerated and silently ignored */
	if (__probable(map != VM_MAP_NULL)) {
		/* sanity-check the map before taking a reference on it */
		vm_map_require(map);
		os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
	}
}
20128 
20129 /*
20130  *	vm_map_deallocate:
20131  *
20132  *	Removes a reference from the specified map,
20133  *	destroying it if no references remain.
20134  *	The map should not be locked.
20135  */
void
vm_map_deallocate(
	vm_map_t        map)
{
	/* a NULL map is tolerated and silently ignored */
	if (__probable(map != VM_MAP_NULL)) {
		vm_map_require(map);
		/* destroy the map when the last reference is dropped */
		if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
			vm_map_destroy(map);
		}
	}
}
20147 
/*
 * Release a reference held through an inspect-only map handle.
 * Same reference counting as vm_map_deallocate(); only the type differs.
 */
void
vm_map_inspect_deallocate(
	vm_map_inspect_t      map)
{
	vm_map_deallocate((vm_map_t)map);
}
20154 
/*
 * Release a reference held through a read-only map handle.
 * Same reference counting as vm_map_deallocate(); only the type differs.
 */
void
vm_map_read_deallocate(
	vm_map_read_t      map)
{
	vm_map_deallocate((vm_map_t)map);
}
20161 
20162 
20163 void
vm_map_disable_NX(vm_map_t map)20164 vm_map_disable_NX(vm_map_t map)
20165 {
20166 	if (map == NULL) {
20167 		return;
20168 	}
20169 	if (map->pmap == NULL) {
20170 		return;
20171 	}
20172 
20173 	pmap_disable_NX(map->pmap);
20174 }
20175 
void
vm_map_disallow_data_exec(vm_map_t map)
{
	if (map == NULL) {
		return;
	}

	/* NOTE(review): flag set without the map lock — presumably safe
	 * because it only transitions FALSE->TRUE; confirm with callers. */
	map->map_disallow_data_exec = TRUE;
}
20185 
20186 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
20187  * more descriptive.
20188  */
/*
 * Shrink the map's addressable range to the 32-bit maximum.
 * On arm64 the pmap layer decides the actual device maximum.
 */
void
vm_map_set_32bit(vm_map_t map)
{
#if defined(__arm64__)
	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
20198 
20199 
/*
 * Set the map's addressable range to the 64-bit maximum.
 * On arm64 the pmap layer decides the actual device maximum.
 */
void
vm_map_set_64bit(vm_map_t map)
{
#if defined(__arm64__)
	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
#endif
}
20209 
20210 /*
20211  * Expand the maximum size of an existing map to the maximum supported.
20212  */
/*
 * Expand the maximum size of an existing map to the maximum supported.
 */
void
vm_map_set_jumbo(vm_map_t map)
{
#if defined (__arm64__) && !XNU_TARGET_OS_OSX
	/* ~0 requests the largest offset; vm_map_set_max_addr() clamps it */
	vm_map_set_max_addr(map, ~0);
#else /* arm64 */
	(void) map;
#endif
}
20222 
20223 /*
20224  * This map has a JIT entitlement
20225  */
/*
 * This map has a JIT entitlement
 * (no-op on non-arm64 targets; the pmap layer records the entitlement).
 */
void
vm_map_set_jit_entitled(vm_map_t map)
{
#if defined (__arm64__)
	pmap_set_jit_entitled(map->pmap);
#else /* arm64 */
	(void) map;
#endif
}
20235 
20236 /*
20237  * This map has TPRO enabled
20238  */
/*
 * This map has TPRO enabled
 * (no-op on non-arm64e targets; the pmap layer records the setting).
 */
void
vm_map_set_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	pmap_set_tpro(map->pmap);
#else /* arm64e */
	(void) map;
#endif
}
20248 
20249 /*
20250  * Expand the maximum size of an existing map.
20251  */
/*
 * Grow the map's max_offset (never shrink it), clamped to the pmap's
 * jumbo maximum, and extend the hole list to cover the new space.
 */
void
vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
{
#if defined(__arm64__)
	vm_map_offset_t max_supported_offset;
	vm_map_offset_t old_max_offset;

	vm_map_lock(map);

	old_max_offset = map->max_offset;
	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);

	new_max_offset = trunc_page(new_max_offset);

	/* The address space cannot be shrunk using this routine. */
	if (old_max_offset >= new_max_offset) {
		vm_map_unlock(map);
		return;
	}

	/* clamp the request to what the pmap can support */
	if (max_supported_offset < new_max_offset) {
		new_max_offset = max_supported_offset;
	}

	map->max_offset = new_max_offset;

	if (map->holelistenabled) {
		if (map->holes_list->prev->vme_end == old_max_offset) {
			/*
			 * There is already a hole at the end of the map; simply make it bigger.
			 */
			map->holes_list->prev->vme_end = map->max_offset;
		} else {
			/*
			 * There is no hole at the end, so we need to create a new hole
			 * for the new empty space we're creating.
			 */
			struct vm_map_links *new_hole;

			/* Z_NOFAIL: allocation cannot return NULL */
			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
			new_hole->start = old_max_offset;
			new_hole->end = map->max_offset;
			/* link the new hole at the tail of the circular hole list */
			new_hole->prev = map->holes_list->prev;
			new_hole->next = (struct vm_map_entry *)map->holes_list;
			map->holes_list->prev->links.next = (struct vm_map_entry *)new_hole;
			map->holes_list->prev = (struct vm_map_entry *)new_hole;
		}
	}

	vm_map_unlock(map);
#else
	(void)map;
	(void)new_max_offset;
#endif
}
20307 
/*
 * Compute the default maximum user VA for a new 32- or 64-bit map.
 * On arm64 the pmap layer supplies the device-specific maximum.
 */
vm_map_offset_t
vm_compute_max_offset(boolean_t is64)
{
#if defined(__arm64__)
	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
20317 
/*
 * Report how ASLR slide space is sectioned for this platform:
 * arm64 uses 3 sections of one translation-table twig each;
 * other platforms use a single section (size unused, reported as 0).
 */
void
vm_map_get_max_aslr_slide_section(
	vm_map_t                map __unused,
	int64_t                 *max_sections,
	int64_t                 *section_size)
{
#if defined(__arm64__)
	*max_sections = 3;
	*section_size = ARM_TT_TWIG_SIZE;
#else
	*max_sections = 1;
	*section_size = 0;
#endif
}
20332 
/* Maximum ASLR slide, expressed in map pages. */
uint64_t
vm_map_get_max_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
	 * limited embedded address space; this is also meant to minimize pmap
	 * memory usage on 16KB page systems.
	 */
	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));  /* 2^24 bytes in pages */
#else
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
20346 
/* Maximum dynamic-loader ASLR slide, expressed in map pages. */
uint64_t
vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
	 * of independent entropy on 16KB page systems.
	 */
	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));  /* 2^22 bytes in pages */
#else
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
20359 
/* A map is considered 64-bit when its max offset exceeds the 32-bit VM_MAX_ADDRESS. */
boolean_t
vm_map_is_64bit(
	vm_map_t map)
{
	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
}
20366 
/*
 * Returns TRUE when the map's usable range starts at or above
 * pagezero_size, i.e. the "page zero" region cannot be mapped.
 */
boolean_t
vm_map_has_hard_pagezero(
	vm_map_t        map,
	vm_map_offset_t pagezero_size)
{
	/*
	 * XXX FBDP
	 * We should lock the VM map (for read) here but we can get away
	 * with it for now because there can't really be any race condition:
	 * the VM map's min_offset is changed only when the VM map is created
	 * and when the zero page is established (when the binary gets loaded),
	 * and this routine gets called only when the task terminates and the
	 * VM map is being torn down, and when a new map is created via
	 * load_machfile()/execve().
	 */
	return map->min_offset >= pagezero_size;
}
20384 
20385 /*
20386  * Raise a VM map's maximun offset.
20387  */
20388 kern_return_t
vm_map_raise_max_offset(vm_map_t map,vm_map_offset_t new_max_offset)20389 vm_map_raise_max_offset(
20390 	vm_map_t        map,
20391 	vm_map_offset_t new_max_offset)
20392 {
20393 	kern_return_t   ret;
20394 
20395 	vm_map_lock(map);
20396 	ret = KERN_INVALID_ADDRESS;
20397 
20398 	if (new_max_offset >= map->max_offset) {
20399 		if (!vm_map_is_64bit(map)) {
20400 			if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
20401 				map->max_offset = new_max_offset;
20402 				ret = KERN_SUCCESS;
20403 			}
20404 		} else {
20405 			if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
20406 				map->max_offset = new_max_offset;
20407 				ret = KERN_SUCCESS;
20408 			}
20409 		}
20410 	}
20411 
20412 	vm_map_unlock(map);
20413 	return ret;
20414 }
20415 
20416 
20417 /*
20418  * Raise a VM map's minimum offset.
20419  * To strictly enforce "page zero" reservation.
20420  */
kern_return_t
vm_map_raise_min_offset(
	vm_map_t        map,
	vm_map_offset_t new_min_offset)
{
	vm_map_entry_t  first_entry;

	/* round up to this map's page size before any comparisons */
	new_min_offset = vm_map_round_page(new_min_offset,
	    VM_MAP_PAGE_MASK(map));

	vm_map_lock(map);

	if (new_min_offset < map->min_offset) {
		/*
		 * Can't move min_offset backwards, as that would expose
		 * a part of the address space that was previously, and for
		 * possibly good reasons, inaccessible.
		 */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}
	if (new_min_offset >= map->max_offset) {
		/* can't go beyond the end of the address space */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	first_entry = vm_map_first_entry(map);
	if (first_entry != vm_map_to_entry(map) &&
	    first_entry->vme_start < new_min_offset) {
		/*
		 * Some memory was already allocated below the new
		 * minimun offset.  It's too late to change it now...
		 */
		vm_map_unlock(map);
		return KERN_NO_SPACE;
	}

	map->min_offset = new_min_offset;

	/* keep the first hole in sync with the new lower bound */
	if (map->holelistenabled) {
		assert(map->holes_list);
		map->holes_list->start = new_min_offset;
		assert(new_min_offset < map->holes_list->end);
	}

	vm_map_unlock(map);

	return KERN_SUCCESS;
}
20471 
20472 /*
20473  * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
20474  * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
20475  * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
20476  * have to reach over to the BSD data structures.
20477  */
20478 
20479 uint64_t vm_map_set_size_limit_count = 0;
20480 kern_return_t
vm_map_set_size_limit(vm_map_t map,uint64_t new_size_limit)20481 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
20482 {
20483 	kern_return_t kr;
20484 
20485 	vm_map_lock(map);
20486 	if (new_size_limit < map->size) {
20487 		/* new limit should not be lower than its current size */
20488 		DTRACE_VM2(vm_map_set_size_limit_fail,
20489 		    vm_map_size_t, map->size,
20490 		    uint64_t, new_size_limit);
20491 		kr = KERN_FAILURE;
20492 	} else if (new_size_limit == map->size_limit) {
20493 		/* no change */
20494 		kr = KERN_SUCCESS;
20495 	} else {
20496 		/* set new limit */
20497 		DTRACE_VM2(vm_map_set_size_limit,
20498 		    vm_map_size_t, map->size,
20499 		    uint64_t, new_size_limit);
20500 		if (new_size_limit != RLIM_INFINITY) {
20501 			vm_map_set_size_limit_count++;
20502 		}
20503 		map->size_limit = new_size_limit;
20504 		kr = KERN_SUCCESS;
20505 	}
20506 	vm_map_unlock(map);
20507 	return kr;
20508 }
20509 
20510 uint64_t vm_map_set_data_limit_count = 0;
/*
 * vm_map_set_data_limit:
 *
 * Mirror the BSD RLIMIT_DATA value into "map", same scheme as
 * vm_map_set_size_limit() above.
 *
 * Returns KERN_FAILURE if the requested limit is below the map's current
 * virtual size; KERN_SUCCESS otherwise (including the no-change case).
 * NOTE(review): the floor check compares against map->size (total map
 * size), not a data-only size — presumably intentional; confirm.
 */
kern_return_t
vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
{
	kern_return_t kr;

	vm_map_lock(map);
	if (new_data_limit < map->size) {
		/* new limit should not be lower than its current size */
		DTRACE_VM2(vm_map_set_data_limit_fail,
		    vm_map_size_t, map->size,
		    uint64_t, new_data_limit);
		kr = KERN_FAILURE;
	} else if (new_data_limit == map->data_limit) {
		/* no change */
		kr = KERN_SUCCESS;
	} else {
		/* set new limit */
		DTRACE_VM2(vm_map_set_data_limit,
		    vm_map_size_t, map->size,
		    uint64_t, new_data_limit);
		if (new_data_limit != RLIM_INFINITY) {
			/* global bookkeeping: count maps given a finite limit */
			vm_map_set_data_limit_count++;
		}
		map->data_limit = new_data_limit;
		kr = KERN_SUCCESS;
	}
	vm_map_unlock(map);
	return kr;
}
20540 
20541 void
vm_map_set_user_wire_limit(vm_map_t map,vm_size_t limit)20542 vm_map_set_user_wire_limit(vm_map_t     map,
20543     vm_size_t    limit)
20544 {
20545 	vm_map_lock(map);
20546 	map->user_wire_limit = limit;
20547 	vm_map_unlock(map);
20548 }
20549 
20550 
20551 void
vm_map_switch_protect(vm_map_t map,boolean_t val)20552 vm_map_switch_protect(vm_map_t     map,
20553     boolean_t    val)
20554 {
20555 	vm_map_lock(map);
20556 	map->switch_protect = val;
20557 	vm_map_unlock(map);
20558 }
20559 
20560 extern int cs_process_enforcement_enable;
20561 boolean_t
vm_map_cs_enforcement(vm_map_t map)20562 vm_map_cs_enforcement(
20563 	vm_map_t map)
20564 {
20565 	if (cs_process_enforcement_enable) {
20566 		return TRUE;
20567 	}
20568 	return map->cs_enforcement;
20569 }
20570 
20571 kern_return_t
vm_map_cs_wx_enable(vm_map_t map)20572 vm_map_cs_wx_enable(
20573 	vm_map_t map)
20574 {
20575 	return pmap_cs_allow_invalid(vm_map_pmap(map));
20576 }
20577 
20578 void
vm_map_cs_debugged_set(vm_map_t map,boolean_t val)20579 vm_map_cs_debugged_set(
20580 	vm_map_t map,
20581 	boolean_t val)
20582 {
20583 	vm_map_lock(map);
20584 	map->cs_debugged = val;
20585 	vm_map_unlock(map);
20586 }
20587 
/*
 * vm_map_cs_enforcement_set:
 *
 * Set the map's code-signing-enforcement flag and mirror it into the
 * pmap.  Both updates happen under the exclusive map lock so the two
 * views of the flag change atomically with respect to other map
 * operations.
 */
void
vm_map_cs_enforcement_set(
	vm_map_t map,
	boolean_t val)
{
	vm_map_lock(map);
	map->cs_enforcement = val;
	/* keep the pmap's copy of the flag in sync with the map's */
	pmap_set_vm_map_cs_enforced(map->pmap, val);
	vm_map_unlock(map);
}
20598 
20599 /*
20600  * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
20601  * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
20602  * bump both counters.
20603  */
20604 void
vm_map_iokit_mapped_region(vm_map_t map,vm_size_t bytes)20605 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
20606 {
20607 	pmap_t pmap = vm_map_pmap(map);
20608 
20609 	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
20610 	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
20611 }
20612 
20613 void
vm_map_iokit_unmapped_region(vm_map_t map,vm_size_t bytes)20614 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
20615 {
20616 	pmap_t pmap = vm_map_pmap(map);
20617 
20618 	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
20619 	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
20620 }
20621 
20622 /* Add (generate) code signature for memory range */
20623 #if CONFIG_DYNAMIC_CODE_SIGNING
/*
 * vm_map_sign:
 *
 * Mark every resident page in [start, end) as code-signing "validated"
 * (dynamic code signing).  The range must be fully covered by a single
 * non-submap entry whose object already exists.  Each page is
 * disconnected from all pmaps afterwards so any later modification
 * faults and is noticed.
 *
 * Returns KERN_INVALID_ARGUMENT / KERN_INVALID_ADDRESS for bad input,
 * KERN_FAILURE if a page is absent or in a special state, KERN_SUCCESS
 * otherwise.
 */
kern_return_t
vm_map_sign(vm_map_t map,
    vm_map_offset_t start,
    vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_page_t m;
	vm_object_t object;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlaying object.  Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->vme_start > start) || (entry->vme_end < end)) {
		/*
		 * Map entry doesn't cover the requested range. Not handling
		 * this situation currently.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		/*
		 * Object must already be present or we can't sign.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Take the object lock before dropping the map lock.
	 * NOTE(review): "entry" is still dereferenced in the loop below
	 * after the map lock is released — presumably safe because the
	 * object lock blocks the operations that would free it; confirm.
	 */
	vm_object_lock(object);
	vm_map_unlock_read(map);

	while (start < end) {
		uint32_t refmod;

		m = vm_page_lookup(object,
		    start - entry->vme_start + VME_OFFSET(entry));
		if (m == VM_PAGE_NULL) {
			/* should we try to fault a page here? we can probably
			 * demand it exists and is locked for this request */
			vm_object_unlock(object);
			return KERN_FAILURE;
		}
		/* deal with special page status */
		if (m->vmp_busy ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
			vm_object_unlock(object);
			return KERN_FAILURE;
		}

		/* Page is OK... now "validate" it */
		/* This is the place where we'll call out to create a code
		 * directory, later */
		/* XXX TODO4K: deal with 4k subpages individually? */
		m->vmp_cs_validated = VMP_CS_ALL_TRUE;

		/* The page is now "clean" for codesigning purposes. That means
		 * we don't consider it as modified (wpmapped) anymore. But
		 * we'll disconnect the page so we note any future modification
		 * attempts. */
		m->vmp_wpmapped = FALSE;
		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

		/* Pull the dirty status from the pmap, since we cleared the
		 * wpmapped bit */
		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
			SET_PAGE_DIRTY(m, FALSE);
		}

		/* On to the next page */
		start += PAGE_SIZE;
	}
	vm_object_unlock(object);

	return KERN_SUCCESS;
}
20716 #endif
20717 
/*
 * vm_map_partial_reap:
 *
 * Delete every entry in "map" that is the sole mapping of an internal
 * VM object (ref_count == 1), accumulating the number of resident and
 * compressed pages reclaimed into the caller-supplied counters.
 * NOTE(review): the counters are only incremented (+=) here — the
 * caller is assumed to have zeroed them; confirm.
 *
 * Deleted entries go onto a local zap list and are disposed of after
 * the map lock is dropped, so the teardown work happens unlocked.
 */
kern_return_t
vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
{
	vm_map_entry_t  entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  next_entry;
	kern_return_t   kr = KERN_SUCCESS;
	VM_MAP_ZAP_DECLARE(zap_list);

	vm_map_lock(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = next_entry) {
		/* grab the next link first: vm_map_delete() removes "entry" */
		next_entry = entry->vme_next;

		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    (VME_OBJECT(entry)->internal == TRUE) &&
		    (VME_OBJECT(entry)->ref_count == 1)) {
			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);

			(void)vm_map_delete(map, entry->vme_start,
			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
			    KMEM_GUARD_NONE, &zap_list);
		}
	}

	vm_map_unlock(map);

	/* free the zapped entries now that the map lock is dropped */
	vm_map_zap_dispose(&zap_list);

	return kr;
}
20752 
20753 
20754 #if DEVELOPMENT || DEBUG
20755 
/*
 * vm_map_disconnect_page_mappings:
 *
 * DEVELOPMENT/DEBUG only.  Remove all pmap mappings for the map's
 * entries (skipping physically-contiguous and object-less ranges).
 * When "do_unnest" is TRUE, nested submap entries are first unnested
 * (exclusive lock pass) so the subsequent pmap_remove only affects
 * this task's pmap.
 *
 * Returns the map's phys_mem ledger balance, in map pages, sampled
 * BEFORE the mappings were removed.
 */
int
vm_map_disconnect_page_mappings(
	vm_map_t map,
	boolean_t do_unnest)
{
	vm_map_entry_t entry;
	ledger_amount_t byte_count = 0;

	if (do_unnest == TRUE) {
#ifndef NO_NESTED_PMAP
		vm_map_lock(map);

		for (entry = vm_map_first_entry(map);
		    entry != vm_map_to_entry(map);
		    entry = entry->vme_next) {
			if (entry->is_sub_map && entry->use_pmap) {
				/*
				 * Make sure the range between the start of this entry and
				 * the end of this entry is no longer nested, so that
				 * we will only remove mappings from the pmap in use by this
				 * this task
				 */
				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
			}
		}
		vm_map_unlock(map);
#endif
	}
	/* read lock suffices: we only walk entries and mutate the pmap */
	vm_map_lock_read(map);

	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
		    (VME_OBJECT(entry)->phys_contiguous))) {
			continue;
		}
		if (entry->is_sub_map) {
			/* any nested submaps should have been unnested above */
			assert(!entry->use_pmap);
		}

		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
	}
	vm_map_unlock_read(map);

	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
}
20805 
20806 kern_return_t
vm_map_inject_error(vm_map_t map,vm_map_offset_t vaddr)20807 vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
20808 {
20809 	vm_object_t object = NULL;
20810 	vm_object_offset_t offset;
20811 	vm_prot_t prot;
20812 	boolean_t wired;
20813 	vm_map_version_t version;
20814 	vm_map_t real_map;
20815 	int result = KERN_FAILURE;
20816 
20817 	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
20818 	vm_map_lock(map);
20819 
20820 	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
20821 	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
20822 	    NULL, &real_map, NULL);
20823 	if (object == NULL) {
20824 		result = KERN_MEMORY_ERROR;
20825 	} else if (object->pager) {
20826 		result = vm_compressor_pager_inject_error(object->pager,
20827 		    offset);
20828 	} else {
20829 		result = KERN_MEMORY_PRESENT;
20830 	}
20831 
20832 	if (object != NULL) {
20833 		vm_object_unlock(object);
20834 	}
20835 
20836 	if (real_map != map) {
20837 		vm_map_unlock(real_map);
20838 	}
20839 	vm_map_unlock(map);
20840 
20841 	return result;
20842 }
20843 
20844 #endif
20845 
20846 
20847 #if CONFIG_FREEZE
20848 
20849 
20850 extern struct freezer_context freezer_context_global;
20851 AbsoluteTime c_freezer_last_yield_ts = 0;
20852 
20853 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
20854 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
20855 
/*
 * vm_map_freeze:
 *
 * Compress ("freeze") the eligible internal, non-contiguous anonymous
 * memory of "task" into the compressor, up to "dirty_budget" pages.
 *
 * When the freezer is swap-backed (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE),
 * a first "evaluation" pass sizes the dirty private vs. shared
 * footprint and can abort with FREEZER_ERROR_EXCESS_SHARED_MEMORY or
 * FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO before any compression work;
 * if the evaluation passes (and !eval_only), the task's purgeable
 * memory is purged and the walk restarts ("goto again") for real.
 *
 * Outputs: *wired_count and *shared_count are computed below;
 * NOTE(review): *purgeable_count, *clean_count and *dirty_count are
 * zeroed here but never updated in this function — presumably kept for
 * interface compatibility; confirm.
 * *freezer_error_code is set on any failure path.
 */
kern_return_t
vm_map_freeze(
	task_t       task,
	unsigned int *purgeable_count,
	unsigned int *wired_count,
	unsigned int *clean_count,
	unsigned int *dirty_count,
	unsigned int dirty_budget,
	unsigned int *shared_count,
	int          *freezer_error_code,
	boolean_t    eval_only)
{
	vm_map_entry_t  entry2 = VM_MAP_ENTRY_NULL;
	kern_return_t   kr = KERN_SUCCESS;
	boolean_t       evaluation_phase = TRUE;
	vm_object_t     cur_shared_object = NULL;
	int             cur_shared_obj_ref_cnt = 0;
	unsigned int    dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;

	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;

	/*
	 * We need the exclusive lock here so that we can
	 * block any page faults or lookups while we are
	 * in the middle of freezing this vm map.
	 */
	vm_map_t map = task->map;

	vm_map_lock(map);

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	/* bail out early if there is nowhere to put compressed data */
	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
		if (vm_compressor_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
		}

		if (vm_swap_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
		}

		kr = KERN_NO_SPACE;
		goto done;
	}

	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
		/*
		 * In-memory compressor backing the freezer. No disk.
		 * So no need to do the evaluation phase.
		 */
		evaluation_phase = FALSE;

		if (eval_only == TRUE) {
			/*
			 * We don't support 'eval_only' mode
			 * in this non-swap config.
			 */
			*freezer_error_code = FREEZER_ERROR_GENERIC;
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);
	}
again:

	for (entry2 = vm_map_first_entry(map);
	    entry2 != vm_map_to_entry(map);
	    entry2 = entry2->vme_next) {
		vm_object_t src_object;

		if (entry2->is_sub_map) {
			continue;
		}

		/* only internal, non-physically-contiguous objects are freezable */
		src_object = VME_OBJECT(entry2);
		if (!src_object ||
		    src_object->phys_contiguous ||
		    !src_object->internal) {
			continue;
		}

		/* If eligible, scan the entry, moving eligible pages over to our parent object */

		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
			/*
			 * We skip purgeable objects during evaluation phase only.
			 * If we decide to freeze this process, we'll explicitly
			 * purge these objects before we go around again with
			 * 'evaluation_phase' set to FALSE.
			 */

			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
				/*
				 * We want to purge objects that may not belong to this task but are mapped
				 * in this task alone. Since we already purged this task's purgeable memory
				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
				 * on this task's purgeable objects. Hence the check for only volatile objects.
				 */
				if (evaluation_phase == FALSE &&
				    (src_object->purgable == VM_PURGABLE_VOLATILE) &&
				    (src_object->ref_count == 1)) {
					vm_object_lock(src_object);
					vm_object_purge(src_object, 0);
					vm_object_unlock(src_object);
				}
				continue;
			}

			/*
			 * Pages belonging to this object could be swapped to disk.
			 * Make sure it's not a shared object because we could end
			 * up just bringing it back in again.
			 *
			 * We try to optimize somewhat by checking for objects that are mapped
			 * more than once within our own map. But we don't do full searches,
			 * we just look at the entries following our current entry.
			 */

			if (src_object->ref_count > 1) {
				if (src_object != cur_shared_object) {
					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
					dirty_shared_count += obj_pages_snapshot;

					cur_shared_object = src_object;
					cur_shared_obj_ref_cnt = 1;
					continue;
				} else {
					cur_shared_obj_ref_cnt++;
					if (src_object->ref_count == cur_shared_obj_ref_cnt) {
						/*
						 * Fall through to below and treat this object as private.
						 * So deduct its pages from our shared total and add it to the
						 * private total.
						 */

						dirty_shared_count -= obj_pages_snapshot;
						dirty_private_count += obj_pages_snapshot;
					} else {
						continue;
					}
				}
			}


			if (src_object->ref_count == 1) {
				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
			}

			if (evaluation_phase == TRUE) {
				/* evaluation pass only tallies counts; no compression */
				continue;
			}
		}

		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
		*wired_count += src_object->wired_page_count;

		/* re-check space after each object's worth of compression */
		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
			if (vm_compressor_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
			}

			if (vm_swap_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
			}

			kr = KERN_NO_SPACE;
			break;
		}
		if (paged_out_count >= dirty_budget) {
			break;
		}
		dirty_budget -= paged_out_count;
	}

	/* report shared footprint in MB */
	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
	if (evaluation_phase) {
		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;

		if (dirty_shared_count > shared_pages_threshold) {
			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
			kr = KERN_FAILURE;
			goto done;
		}

		if (dirty_shared_count &&
		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
			kr = KERN_FAILURE;
			goto done;
		}

		/* evaluation passed: reset counters and do the real freeze pass */
		evaluation_phase = FALSE;
		dirty_shared_count = dirty_private_count = 0;

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);

		if (eval_only) {
			kr = KERN_SUCCESS;
			goto done;
		}

		vm_purgeable_purge_task_owned(task);

		goto again;
	} else {
		kr = KERN_SUCCESS;
	}

done:
	vm_map_unlock(map);

	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
		vm_object_compressed_freezer_done();
	}
	return kr;
}
21075 
21076 #endif
21077 
21078 /*
21079  * vm_map_entry_should_cow_for_true_share:
21080  *
21081  * Determines if the map entry should be clipped and setup for copy-on-write
21082  * to avoid applying "true_share" to a large VM object when only a subset is
21083  * targeted.
21084  *
21085  * For now, we target only the map entries created for the Objective C
21086  * Garbage Collector, which initially have the following properties:
21087  *	- alias == VM_MEMORY_MALLOC
21088  *      - wired_count == 0
21089  *      - !needs_copy
21090  * and a VM object with:
21091  *      - internal
21092  *      - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
21093  *      - !true_share
21094  *      - vo_size == ANON_CHUNK_SIZE
21095  *
21096  * Only non-kernel map entries.
21097  */
/*
 * Returns TRUE only when every criterion described in the block comment
 * above holds; the guards below are ordered cheapest-first, and the
 * wired-count guard also bumps a diagnostic counter, so the order is
 * load-bearing.
 */
boolean_t
vm_map_entry_should_cow_for_true_share(
	vm_map_entry_t  entry)
{
	vm_object_t     object;

	if (entry->is_sub_map) {
		/* entry does not point at a VM object */
		return FALSE;
	}

	if (entry->needs_copy) {
		/* already set for copy_on_write: done! */
		return FALSE;
	}

	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
		/* not a malloc heap or Obj-C Garbage Collector heap */
		return FALSE;
	}

	if (entry->wired_count) {
		/* wired: can't change the map entry... */
		vm_counters.should_cow_but_wired++;
		return FALSE;
	}

	object = VME_OBJECT(entry);

	if (object == VM_OBJECT_NULL) {
		/* no object yet... */
		return FALSE;
	}

	if (!object->internal) {
		/* not an internal object */
		return FALSE;
	}

	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
		/* not the default copy strategy */
		return FALSE;
	}

	if (object->true_share) {
		/* already true_share: too late to avoid it */
		return FALSE;
	}

	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
	    object->vo_size != ANON_CHUNK_SIZE) {
		/* ... not an object created for the ObjC Garbage Collector */
		return FALSE;
	}

	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
	    object->vo_size != 2048 * 4096) {
		/* ... not a "MALLOC_SMALL" heap */
		return FALSE;
	}

	/*
	 * All the criteria match: we have a large object being targeted for "true_share".
	 * To limit the adverse side-effects linked with "true_share", tell the caller to
	 * try and avoid setting up the entire object for "true_share" by clipping the
	 * targeted range and setting it up for copy-on-write.
	 */
	return TRUE;
}
21168 
21169 vm_map_offset_t
vm_map_round_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)21170 vm_map_round_page_mask(
21171 	vm_map_offset_t offset,
21172 	vm_map_offset_t mask)
21173 {
21174 	return VM_MAP_ROUND_PAGE(offset, mask);
21175 }
21176 
21177 vm_map_offset_t
vm_map_trunc_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)21178 vm_map_trunc_page_mask(
21179 	vm_map_offset_t offset,
21180 	vm_map_offset_t mask)
21181 {
21182 	return VM_MAP_TRUNC_PAGE(offset, mask);
21183 }
21184 
21185 boolean_t
vm_map_page_aligned(vm_map_offset_t offset,vm_map_offset_t mask)21186 vm_map_page_aligned(
21187 	vm_map_offset_t offset,
21188 	vm_map_offset_t mask)
21189 {
21190 	return ((offset) & mask) == 0;
21191 }
21192 
21193 int
vm_map_page_shift(vm_map_t map)21194 vm_map_page_shift(
21195 	vm_map_t map)
21196 {
21197 	return VM_MAP_PAGE_SHIFT(map);
21198 }
21199 
21200 int
vm_map_page_size(vm_map_t map)21201 vm_map_page_size(
21202 	vm_map_t map)
21203 {
21204 	return VM_MAP_PAGE_SIZE(map);
21205 }
21206 
21207 vm_map_offset_t
vm_map_page_mask(vm_map_t map)21208 vm_map_page_mask(
21209 	vm_map_t map)
21210 {
21211 	return VM_MAP_PAGE_MASK(map);
21212 }
21213 
/*
 * vm_map_set_page_shift:
 *
 * Change the page shift of an EMPTY map.  Fails with KERN_FAILURE once
 * the map has any entries, since existing mappings assume the old page
 * size.  NOTE(review): no map lock is taken — presumably the map is not
 * yet visible to other threads at this point; confirm at call sites.
 */
kern_return_t
vm_map_set_page_shift(
	vm_map_t        map,
	int             pageshift)
{
	if (map->hdr.nentries != 0) {
		/* too late to change page size */
		return KERN_FAILURE;
	}

	map->hdr.page_shift = (uint16_t)pageshift;

	return KERN_SUCCESS;
}
21228 
/*
 * vm_map_query_volatile:
 *
 * Tally the virtual, resident, compressed, and pmap-level footprint of
 * the map's writable volatile/empty purgeable objects.  Each object is
 * counted once, via its offset-0 mapping only.
 *
 * The map must be locked by the caller and remains locked on return.
 */
kern_return_t
vm_map_query_volatile(
	vm_map_t        map,
	mach_vm_size_t  *volatile_virtual_size_p,
	mach_vm_size_t  *volatile_resident_size_p,
	mach_vm_size_t  *volatile_compressed_size_p,
	mach_vm_size_t  *volatile_pmap_size_p,
	mach_vm_size_t  *volatile_compressed_pmap_size_p)
{
	mach_vm_size_t  volatile_virtual_size;
	mach_vm_size_t  volatile_resident_count;
	mach_vm_size_t  volatile_compressed_count;
	mach_vm_size_t  volatile_pmap_count;
	mach_vm_size_t  volatile_compressed_pmap_count;
	mach_vm_size_t  resident_count;
	vm_map_entry_t  entry;
	vm_object_t     object;

	/* map should be locked by caller */

	volatile_virtual_size = 0;
	volatile_resident_count = 0;
	volatile_compressed_count = 0;
	volatile_pmap_count = 0;
	volatile_compressed_pmap_count = 0;

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;

		if (entry->is_sub_map) {
			continue;
		}
		if (!(entry->protection & VM_PROT_WRITE)) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}
		if (object->purgable != VM_PURGABLE_VOLATILE &&
		    object->purgable != VM_PURGABLE_EMPTY) {
			continue;
		}
		if (VME_OFFSET(entry)) {
			/*
			 * If the map entry has been split and the object now
			 * appears several times in the VM map, we don't want
			 * to count the object's resident_page_count more than
			 * once.  We count it only for the first one, starting
			 * at offset 0 and ignore the other VM map entries.
			 */
			continue;
		}
		resident_count = object->resident_page_count;
		/*
		 * NOTE(review): VME_OFFSET(entry) is always 0 at this point
		 * (non-zero offsets were skipped just above), so this
		 * adjustment can never trigger — effectively dead code.
		 */
		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
			resident_count = 0;
		} else {
			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
		}

		volatile_virtual_size += entry->vme_end - entry->vme_start;
		volatile_resident_count += resident_count;
		if (object->pager) {
			volatile_compressed_count +=
			    vm_compressor_pager_get_count(object->pager);
		}
		pmap_compressed_bytes = 0;
		pmap_resident_bytes =
		    pmap_query_resident(map->pmap,
		    entry->vme_start,
		    entry->vme_end,
		    &pmap_compressed_bytes);
		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
		volatile_compressed_pmap_count += (pmap_compressed_bytes
		    / PAGE_SIZE);
	}

	/* map is still locked on return */

	*volatile_virtual_size_p = volatile_virtual_size;
	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;

	return KERN_SUCCESS;
}
21318 
/*
 * vm_map_sizes:
 *
 * Report the map's total virtual span (*psize), the total unmapped
 * space (*pfree), and the largest contiguous unmapped gap
 * (*plargest_free).  Any of the output pointers may be NULL to skip
 * that value — except when "map" itself is NULL, in which case all
 * three are written unconditionally.
 */
void
vm_map_sizes(vm_map_t map,
    vm_map_size_t * psize,
    vm_map_size_t * pfree,
    vm_map_size_t * plargest_free)
{
	vm_map_entry_t  entry;
	vm_map_offset_t prev;
	vm_map_size_t   free, total_free, largest_free;
	boolean_t       end;

	if (!map) {
		*psize = *pfree = *plargest_free = 0;
		return;
	}
	total_free = largest_free = 0;

	vm_map_lock_read(map);
	if (psize) {
		*psize = map->max_offset - map->min_offset;
	}

	/* walk entries, measuring the gap before each one */
	prev = map->min_offset;
	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
		end = (entry == vm_map_to_entry(map));

		if (end) {
			/* sentinel reached: measure the gap up to the map's end */
			free = entry->vme_end   - prev;
		} else {
			free = entry->vme_start - prev;
		}

		total_free += free;
		if (free > largest_free) {
			largest_free = free;
		}

		if (end) {
			break;
		}
		prev = entry->vme_end;
	}
	vm_map_unlock_read(map);
	if (pfree) {
		*pfree = total_free;
	}
	if (plargest_free) {
		*plargest_free = largest_free;
	}
}
21369 
21370 #if VM_SCAN_FOR_SHADOW_CHAIN
21371 int vm_map_shadow_max(vm_map_t map);
/*
 * vm_map_shadow_max:
 *
 * Debug/statistics helper: walk every object mapped by "map" and
 * return the length of the longest shadow chain found (0 for a NULL
 * map or a map with no shadowed objects).  Shadow chains are traversed
 * with hand-over-hand shared object locks.
 */
int
vm_map_shadow_max(
	vm_map_t map)
{
	int             shadows, shadows_max;
	vm_map_entry_t  entry;
	vm_object_t     object, next_object;

	if (map == NULL) {
		return 0;
	}

	shadows_max = 0;

	vm_map_lock_read(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == NULL) {
			continue;
		}
		vm_object_lock_shared(object);
		/* lock the next object before releasing the current one */
		for (shadows = 0;
		    object->shadow != NULL;
		    shadows++, object = next_object) {
			next_object = object->shadow;
			vm_object_lock_shared(next_object);
			vm_object_unlock(object);
		}
		vm_object_unlock(object);
		if (shadows > shadows_max) {
			shadows_max = shadows;
		}
	}

	vm_map_unlock_read(map);

	return shadows_max;
}
21416 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
21417 
21418 void
vm_commit_pagezero_status(vm_map_t lmap)21419 vm_commit_pagezero_status(vm_map_t lmap)
21420 {
21421 	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
21422 }
21423 
21424 #if XNU_TARGET_OS_OSX
21425 void
vm_map_set_high_start(vm_map_t map,vm_map_offset_t high_start)21426 vm_map_set_high_start(
21427 	vm_map_t        map,
21428 	vm_map_offset_t high_start)
21429 {
21430 	map->vmmap_high_start = high_start;
21431 }
21432 #endif /* XNU_TARGET_OS_OSX */
21433 
21434 
21435 /*
21436  * FORKED CORPSE FOOTPRINT
21437  *
21438  * A forked corpse gets a copy of the original VM map but its pmap is mostly
21439  * empty since it never ran and never got to fault in any pages.
21440  * Collecting footprint info (via "sysctl vm.self_region_footprint") for
21441  * a forked corpse would therefore return very little information.
21442  *
21443  * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
21444  * to vm_map_fork() to collect footprint information from the original VM map
21445  * and its pmap, and store it in the forked corpse's VM map.  That information
21446  * is stored in place of the VM map's "hole list" since we'll never need to
21447  * lookup for holes in the corpse's map.
21448  *
21449  * The corpse's footprint info looks like this:
21450  *
21451  * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
21452  * as follows:
21453  *                     +---------------------------------------+
21454  *            header-> | cf_size                               |
21455  *                     +-------------------+-------------------+
21456  *                     | cf_last_region    | cf_last_zeroes    |
21457  *                     +-------------------+-------------------+
21458  *           region1-> | cfr_vaddr                             |
21459  *                     +-------------------+-------------------+
21460  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
21461  *                     +---------------------------------------+
21462  *                     | d4 | d5 | ...                         |
21463  *                     +---------------------------------------+
21464  *                     | ...                                   |
21465  *                     +-------------------+-------------------+
21466  *                     | dy | dz | na | na | cfr_vaddr...      | <-region2
21467  *                     +-------------------+-------------------+
21468  *                     | cfr_vaddr (ctd)   | cfr_num_pages     |
21469  *                     +---------------------------------------+
21470  *                     | d0 | d1 ...                           |
21471  *                     +---------------------------------------+
21472  *                       ...
21473  *                     +---------------------------------------+
21474  *       last region-> | cfr_vaddr                             |
21475  *                     +---------------------------------------+
 *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
21477  *                     +---------------------------------------+
21478  *                       ...
21479  *                     +---------------------------------------+
21480  *                     | dx | dy | dz | na | na | na | na | na |
21481  *                     +---------------------------------------+
21482  *
21483  * where:
21484  *      cf_size:	total size of the buffer (rounded to page size)
21485  *      cf_last_region:	offset in the buffer of the last "region" sub-header
21486  *	cf_last_zeroes: number of trailing "zero" dispositions at the end
21487  *			of last region
21488  *	cfr_vaddr:	virtual address of the start of the covered "region"
21489  *	cfr_num_pages:	number of pages in the covered "region"
21490  *	d*:		disposition of the page at that virtual address
21491  * Regions in the buffer are word-aligned.
21492  *
21493  * We estimate the size of the buffer based on the number of memory regions
21494  * and the virtual size of the address space.  While copying each memory region
21495  * during vm_map_fork(), we also collect the footprint info for that region
21496  * and store it in the buffer, packing it as much as possible (coalescing
21497  * contiguous memory regions to avoid having too many region headers and
21498  * avoiding long streaks of "zero" page dispositions by splitting footprint
21499  * "regions", so the number of regions in the footprint buffer might not match
21500  * the number of memory regions in the address space.
21501  *
21502  * We also have to copy the original task's "nonvolatile" ledgers since that's
21503  * part of the footprint and will need to be reported to any tool asking for
21504  * the footprint information of the forked corpse.
21505  */
21506 
/* statistics about corpse footprint buffers (debug / tuning counters) */
uint64_t vm_map_corpse_footprint_count = 0;     /* footprints collected so far */
uint64_t vm_map_corpse_footprint_size_avg = 0;  /* running average of used buffer size */
uint64_t vm_map_corpse_footprint_size_max = 0;  /* largest used buffer size seen */
uint64_t vm_map_corpse_footprint_full = 0;      /* collections aborted because buffer was full */
uint64_t vm_map_corpse_footprint_no_buf = 0;    /* buffer allocation failures */
21512 
/*
 * Header at the very start of the corpse footprint buffer
 * (vm_map->vmmap_corpse_footprint) -- see layout diagram above.
 */
struct vm_map_corpse_footprint_header {
	vm_size_t       cf_size;        /* allocated buffer size */
	uint32_t        cf_last_region; /* offset of last region in buffer */
	union {
		uint32_t cfu_last_zeroes; /* during creation:
		                           * number of "zero" dispositions at
		                           * end of last region */
		uint32_t cfu_hint_region; /* during lookup:
		                           * offset of last looked up region */
#define cf_last_zeroes cfu.cfu_last_zeroes
#define cf_hint_region cfu.cfu_hint_region
	} cfu;
};
/* one packed (8-bit) page disposition; see vm_page_disposition_to_cf_disp() */
typedef uint8_t cf_disp_t;
/*
 * Sub-header for each "region" in the footprint buffer, immediately followed
 * by one cf_disp_t per page.  Packed so regions can be laid out back-to-back
 * (word-aligned) in the buffer.
 */
struct vm_map_corpse_footprint_region {
	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
	cf_disp_t   cfr_disposition[0]; /* disposition of each page */
} __attribute__((packed));
21532 
21533 static cf_disp_t
vm_page_disposition_to_cf_disp(int disposition)21534 vm_page_disposition_to_cf_disp(
21535 	int disposition)
21536 {
21537 	assert(sizeof(cf_disp_t) == 1);
21538 	/* relocate bits that don't fit in a "uint8_t" */
21539 	if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
21540 		disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
21541 	}
21542 	/* cast gets rid of extra bits */
21543 	return (cf_disp_t) disposition;
21544 }
21545 
21546 static int
vm_page_cf_disp_to_disposition(cf_disp_t cf_disp)21547 vm_page_cf_disp_to_disposition(
21548 	cf_disp_t cf_disp)
21549 {
21550 	int disposition;
21551 
21552 	assert(sizeof(cf_disp_t) == 1);
21553 	disposition = (int) cf_disp;
21554 	/* move relocated bits back in place */
21555 	if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
21556 		disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
21557 		disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
21558 	}
21559 	return disposition;
21560 }
21561 
21562 /*
21563  * vm_map_corpse_footprint_new_region:
21564  *      closes the current footprint "region" and creates a new one
21565  *
21566  * Returns NULL if there's not enough space in the buffer for a new region.
21567  */
static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(
	struct vm_map_corpse_footprint_header *footprint_header)
{
	uintptr_t       footprint_edge;
	uint32_t        new_region_offset;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;

	/* first byte past the end of the footprint buffer */
	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);
	/* current last region, located via the header's offset */
	footprint_region = ((struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region));
	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
	    footprint_edge);

	/* get rid of trailing zeroes in the last region */
	assert(footprint_region->cfr_num_pages >=
	    footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -=
	    footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* reuse this region if it's now empty */
	if (footprint_region->cfr_num_pages == 0) {
		return footprint_region;
	}

	/* compute offset of new region: right after the last region's data */
	new_region_offset = footprint_header->cf_last_region;
	new_region_offset += sizeof(*footprint_region);
	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
	/* regions are kept word-aligned in the buffer */
	new_region_offset = roundup(new_region_offset, sizeof(int));

	/* check if we're going over the edge */
	if (((uintptr_t)footprint_header +
	    new_region_offset +
	    sizeof(*footprint_region)) >=
	    footprint_edge) {
		/* over the edge: no new region */
		return NULL;
	}

	/* adjust offset of last region in header */
	footprint_header->cf_last_region = new_region_offset;

	new_footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);
	new_footprint_region->cfr_vaddr = 0;
	new_footprint_region->cfr_num_pages = 0;
	/* caller needs to initialize new region */

	return new_footprint_region;
}
21624 
21625 /*
21626  * vm_map_corpse_footprint_collect:
21627  *	collect footprint information for "old_entry" in "old_map" and
21628  *	stores it in "new_map"'s vmmap_footprint_info.
21629  */
kern_return_t
vm_map_corpse_footprint_collect(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_map_offset_t va;
	kern_return_t   kr;
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;
	cf_disp_t       *next_disp_p;
	uintptr_t       footprint_edge;
	uint32_t        num_pages_tmp;
	int             effective_page_size;

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));

	va = old_entry->vme_start;

	/* both maps must be held exclusively during the collection */
	vm_map_lock_assert_exclusive(old_map);
	vm_map_lock_assert_exclusive(new_map);

	assert(new_map->has_corpse_footprint);
	assert(!old_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    old_map->has_corpse_footprint) {
		/*
		 * This can only transfer footprint info from a
		 * map with a live pmap to a map with a corpse footprint.
		 */
		return KERN_NOT_SUPPORTED;
	}

	if (new_map->vmmap_corpse_footprint == NULL) {
		/* first entry collected: allocate the footprint buffer */
		vm_offset_t     buf;
		vm_size_t       buf_size;

		buf = 0;
		/*
		 * Estimate: one region header (plus worst-case word-alignment
		 * padding) per map entry, plus one disposition byte per page
		 * of the address space.
		 */
		buf_size = (sizeof(*footprint_header) +
		    (old_map->hdr.nentries
		    *
		    (sizeof(*footprint_region) +
		    +3))            /* potential alignment for each region */
		    +
		    ((old_map->size / effective_page_size)
		    *
		    sizeof(cf_disp_t)));      /* disposition for each page */
//		printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
		buf_size = round_page(buf_size);

		/* limit buffer to 1 page to validate overflow detection */
//		buf_size = PAGE_SIZE;

		/* limit size to a somewhat sane amount */
#if XNU_TARGET_OS_OSX
#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
#else /* XNU_TARGET_OS_OSX */
#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
#endif /* XNU_TARGET_OS_OSX */
		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
		}

		/*
		 * Allocate the pageable buffer (with a trailing guard page).
		 * It will be zero-filled on demand.
		 */
		kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
		    KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
		    VM_KERN_MEMORY_DIAG);
		if (kr != KERN_SUCCESS) {
			vm_map_corpse_footprint_no_buf++;
			return kr;
		}

		/* initialize header and 1st region */
		footprint_header = (struct vm_map_corpse_footprint_header *)buf;
		new_map->vmmap_corpse_footprint = footprint_header;

		footprint_header->cf_size = buf_size;
		footprint_header->cf_last_region =
		    sizeof(*footprint_header);
		footprint_header->cf_last_zeroes = 0;

		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header +
		    footprint_header->cf_last_region);
		footprint_region->cfr_vaddr = 0;
		footprint_region->cfr_num_pages = 0;
	} else {
		/* retrieve header and last region */
		footprint_header = (struct vm_map_corpse_footprint_header *)
		    new_map->vmmap_corpse_footprint;
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header +
		    footprint_header->cf_last_region);
	}
	/* first byte past the end of the buffer: never write at or past it */
	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);

	if ((footprint_region->cfr_vaddr +
	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
	    effective_page_size))
	    != old_entry->vme_start) {
		uint64_t num_pages_delta, num_pages_delta_size;
		uint32_t region_offset_delta_size;

		/*
		 * Not the next contiguous virtual address:
		 * start a new region or store "zero" dispositions for
		 * the missing pages?
		 */
		/* size of gap in actual page dispositions */
		num_pages_delta = ((old_entry->vme_start -
		    footprint_region->cfr_vaddr) / effective_page_size)
		    - footprint_region->cfr_num_pages;
		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
		/* size of gap as a new footprint region header */
		region_offset_delta_size =
		    (sizeof(*footprint_region) +
		    roundup(((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
		    sizeof(int)) -
		    ((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
//		printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
		if (region_offset_delta_size < num_pages_delta_size ||
		    os_add3_overflow(footprint_region->cfr_num_pages,
		    (uint32_t) num_pages_delta,
		    1,
		    &num_pages_tmp)) {
			/*
			 * Storing data for this gap would take more space
			 * than inserting a new footprint region header:
			 * let's start a new region and save space. If it's a
			 * tie, let's avoid using a new region, since that
			 * would require more region hops to find the right
			 * range during lookups.
			 *
			 * If the current region's cfr_num_pages would overflow
			 * if we added "zero" page dispositions for the gap,
			 * no choice but to start a new region.
			 */
//			printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
			new_footprint_region =
			    vm_map_corpse_footprint_new_region(footprint_header);
			/* check that we're not going over the edge */
			if (new_footprint_region == NULL) {
				goto over_the_edge;
			}
			footprint_region = new_footprint_region;
			/* initialize new region as empty */
			footprint_region->cfr_vaddr = old_entry->vme_start;
			footprint_region->cfr_num_pages = 0;
		} else {
			/*
			 * Store "zero" page dispositions for the missing
			 * pages.
			 */
//			printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
			for (; num_pages_delta > 0; num_pages_delta--) {
				next_disp_p = (cf_disp_t *)
				    ((uintptr_t) footprint_region +
				    sizeof(*footprint_region));
				next_disp_p += footprint_region->cfr_num_pages;
				/* check that we're not going over the edge */
				if ((uintptr_t)next_disp_p >= footprint_edge) {
					goto over_the_edge;
				}
				/* store "zero" disposition for this gap page */
				footprint_region->cfr_num_pages++;
				*next_disp_p = (cf_disp_t) 0;
				footprint_header->cf_last_zeroes++;
			}
		}
	}

	/* collect a disposition for each page covered by this map entry */
	for (va = old_entry->vme_start;
	    va < old_entry->vme_end;
	    va += effective_page_size) {
		int             disposition;
		cf_disp_t       cf_disp;

		vm_map_footprint_query_page_info(old_map,
		    old_entry,
		    va,
		    &disposition);
		cf_disp = vm_page_disposition_to_cf_disp(disposition);

//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);

		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
			/*
			 * Ignore "zero" dispositions at start of
			 * region: just move start of region.
			 */
			footprint_region->cfr_vaddr += effective_page_size;
			continue;
		}

		/* would region's cfr_num_pages overflow? */
		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
		    &num_pages_tmp)) {
			/* overflow: create a new region */
			new_footprint_region =
			    vm_map_corpse_footprint_new_region(
				footprint_header);
			if (new_footprint_region == NULL) {
				goto over_the_edge;
			}
			footprint_region = new_footprint_region;
			footprint_region->cfr_vaddr = va;
			footprint_region->cfr_num_pages = 0;
		}

		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
		    sizeof(*footprint_region));
		next_disp_p += footprint_region->cfr_num_pages;
		/* check that we're not going over the edge */
		if ((uintptr_t)next_disp_p >= footprint_edge) {
			goto over_the_edge;
		}
		/* store this disposition */
		*next_disp_p = cf_disp;
		footprint_region->cfr_num_pages++;

		if (cf_disp != 0) {
			/* non-zero disp: break the current zero streak */
			footprint_header->cf_last_zeroes = 0;
			/* done */
			continue;
		}

		/* zero disp: add to the current streak of zeroes */
		footprint_header->cf_last_zeroes++;
		if ((footprint_header->cf_last_zeroes +
		    roundup(((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
		    (sizeof(int) - 1),
		    sizeof(int))) <
		    (sizeof(*footprint_header))) {
			/*
			 * There are not enough trailing "zero" dispositions
			 * (+ the extra padding we would need for the previous
			 * region); creating a new region would not save space
			 * at this point, so let's keep this "zero" disposition
			 * in this region and reconsider later.
			 */
			continue;
		}
		/*
		 * Create a new region to avoid having too many consecutive
		 * "zero" dispositions.
		 */
		new_footprint_region =
		    vm_map_corpse_footprint_new_region(footprint_header);
		if (new_footprint_region == NULL) {
			goto over_the_edge;
		}
		footprint_region = new_footprint_region;
		/* initialize the new region as empty ... */
		footprint_region->cfr_num_pages = 0;
		/* ... and skip this "zero" disp */
		footprint_region->cfr_vaddr = va + effective_page_size;
	}

	return KERN_SUCCESS;

over_the_edge:
//	printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
	vm_map_corpse_footprint_full++;
	return KERN_RESOURCE_SHORTAGE;
}
21904 
21905 /*
21906  * vm_map_corpse_footprint_collect_done:
21907  *	completes the footprint collection by getting rid of any remaining
21908  *	trailing "zero" dispositions and trimming the unused part of the
21909  *	kernel buffer
21910  */
void
vm_map_corpse_footprint_collect_done(
	vm_map_t        new_map)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	vm_size_t       buf_size, actual_size;
	kern_return_t   kr;

	assert(new_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    new_map->vmmap_corpse_footprint == NULL) {
		/* nothing was collected: nothing to finalize */
		return;
	}

	footprint_header = (struct vm_map_corpse_footprint_header *)
	    new_map->vmmap_corpse_footprint;
	buf_size = footprint_header->cf_size;

	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);

	/* get rid of trailing zeroes in last region */
	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* size of the portion of the buffer that was actually used */
	actual_size = (vm_size_t)(footprint_header->cf_last_region +
	    sizeof(*footprint_region) +
	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));

//	printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
	/* update global footprint-size statistics */
	vm_map_corpse_footprint_size_avg =
	    (((vm_map_corpse_footprint_size_avg *
	    vm_map_corpse_footprint_count) +
	    actual_size) /
	    (vm_map_corpse_footprint_count + 1));
	vm_map_corpse_footprint_count++;
	if (actual_size > vm_map_corpse_footprint_size_max) {
		vm_map_corpse_footprint_size_max = actual_size;
	}

	actual_size = round_page(actual_size);
	if (buf_size > actual_size) {
		/*
		 * Trim the unused tail of the buffer (which includes the
		 * original trailing guard page), then turn the page right
		 * after the part we keep into a new guard page by making
		 * it inaccessible.
		 */
		kr = vm_deallocate(kernel_map,
		    ((vm_address_t)footprint_header +
		    actual_size +
		    PAGE_SIZE),                 /* trailing guard page */
		    (buf_size - actual_size));
		assertf(kr == KERN_SUCCESS,
		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
		kr = vm_protect(kernel_map,
		    ((vm_address_t)footprint_header +
		    actual_size),
		    PAGE_SIZE,
		    FALSE,             /* set_maximum */
		    VM_PROT_NONE);
		assertf(kr == KERN_SUCCESS,
		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
	}

	footprint_header->cf_size = actual_size;
}
21983 
21984 /*
21985  * vm_map_corpse_footprint_query_page_info:
21986  *	retrieves the disposition of the page at virtual address "vaddr"
21987  *	in the forked corpse's VM map
21988  *
21989  * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
21990  */
kern_return_t
vm_map_corpse_footprint_query_page_info(
	vm_map_t        map,
	vm_map_offset_t va,
	int             *disposition_p)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	uint32_t        footprint_region_offset;
	vm_map_offset_t region_start, region_end;
	int             disp_idx;
	kern_return_t   kr;
	int             effective_page_size;
	cf_disp_t       cf_disp;

	if (!map->has_corpse_footprint) {
		/* not a corpse map with footprint info */
		*disposition_p = 0;
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	footprint_header = map->vmmap_corpse_footprint;
	if (footprint_header == NULL) {
		/* no footprint buffer was collected for this map */
		*disposition_p = 0;
//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	/* start looking at the hint ("cf_hint_region") */
	footprint_region_offset = footprint_header->cf_hint_region;

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

lookup_again:
	if (footprint_region_offset < sizeof(*footprint_header)) {
		/* hint too low: start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	if (footprint_region_offset >= footprint_header->cf_last_region) {
		/* hint too high: re-start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header + footprint_region_offset);
	region_start = footprint_region->cfr_vaddr;
	region_end = (region_start +
	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
	    effective_page_size));
	if (va < region_start &&
	    footprint_region_offset != sizeof(*footprint_header)) {
		/* our range starts before the hint region */

		/* reset the hint (in a racy way...) */
		footprint_header->cf_hint_region = sizeof(*footprint_header);
		/* lookup "va" again from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
		goto lookup_again;
	}

	/* linear scan forward through the regions until one covers "va" */
	while (va >= region_end) {
		if (footprint_region_offset >= footprint_header->cf_last_region) {
			break;
		}
		/* skip the region's header */
		footprint_region_offset += sizeof(*footprint_region);
		/* skip the region's page dispositions */
		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
		/* align to next word boundary */
		footprint_region_offset =
		    roundup(footprint_region_offset,
		    sizeof(int));
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header + footprint_region_offset);
		region_start = footprint_region->cfr_vaddr;
		region_end = (region_start +
		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
		    effective_page_size));
	}
	if (va < region_start || va >= region_end) {
		/* page not found */
		*disposition_p = 0;
//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_SUCCESS;
		goto done;
	}

	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
	footprint_header->cf_hint_region = footprint_region_offset;

	/* get page disposition for "va" in this region */
	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
	cf_disp = footprint_region->cfr_disposition[disp_idx];
	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
	kr = KERN_SUCCESS;
done:
//	if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
	DTRACE_VM4(footprint_query_page_info,
	    vm_map_t, map,
	    vm_map_offset_t, va,
	    int, *disposition_p,
	    kern_return_t, kr);

	return kr;
}
22097 
22098 void
vm_map_corpse_footprint_destroy(vm_map_t map)22099 vm_map_corpse_footprint_destroy(
22100 	vm_map_t        map)
22101 {
22102 	if (map->has_corpse_footprint &&
22103 	    map->vmmap_corpse_footprint != 0) {
22104 		struct vm_map_corpse_footprint_header *footprint_header;
22105 		vm_size_t buf_size;
22106 		kern_return_t kr;
22107 
22108 		footprint_header = map->vmmap_corpse_footprint;
22109 		buf_size = footprint_header->cf_size;
22110 		kr = vm_deallocate(kernel_map,
22111 		    (vm_offset_t) map->vmmap_corpse_footprint,
22112 		    ((vm_size_t) buf_size
22113 		    + PAGE_SIZE));                 /* trailing guard page */
22114 		assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
22115 		map->vmmap_corpse_footprint = 0;
22116 		map->has_corpse_footprint = FALSE;
22117 	}
22118 }
22119 
22120 /*
22121  * vm_map_copy_footprint_ledgers:
22122  *	copies any ledger that's relevant to the memory footprint of "old_task"
22123  *	into the forked corpse's task ("new_task")
22124  */
22125 void
vm_map_copy_footprint_ledgers(task_t old_task,task_t new_task)22126 vm_map_copy_footprint_ledgers(
22127 	task_t  old_task,
22128 	task_t  new_task)
22129 {
22130 	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
22131 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
22132 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
22133 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
22134 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
22135 	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
22136 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
22137 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
22138 	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
22139 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
22140 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
22141 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
22142 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
22143 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
22144 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
22145 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
22146 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
22147 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
22148 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
22149 	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
22150 }
22151 
22152 /*
22153  * vm_map_copy_ledger:
22154  *	copy a single ledger from "old_task" to "new_task"
22155  */
22156 void
vm_map_copy_ledger(task_t old_task,task_t new_task,int ledger_entry)22157 vm_map_copy_ledger(
22158 	task_t  old_task,
22159 	task_t  new_task,
22160 	int     ledger_entry)
22161 {
22162 	ledger_amount_t old_balance, new_balance, delta;
22163 
22164 	assert(new_task->map->has_corpse_footprint);
22165 	if (!new_task->map->has_corpse_footprint) {
22166 		return;
22167 	}
22168 
22169 	/* turn off sanity checks for the ledger we're about to mess with */
22170 	ledger_disable_panic_on_negative(new_task->ledger,
22171 	    ledger_entry);
22172 
22173 	/* adjust "new_task" to match "old_task" */
22174 	ledger_get_balance(old_task->ledger,
22175 	    ledger_entry,
22176 	    &old_balance);
22177 	ledger_get_balance(new_task->ledger,
22178 	    ledger_entry,
22179 	    &new_balance);
22180 	if (new_balance == old_balance) {
22181 		/* new == old: done */
22182 	} else if (new_balance > old_balance) {
22183 		/* new > old ==> new -= new - old */
22184 		delta = new_balance - old_balance;
22185 		ledger_debit(new_task->ledger,
22186 		    ledger_entry,
22187 		    delta);
22188 	} else {
22189 		/* new < old ==> new += old - new */
22190 		delta = old_balance - new_balance;
22191 		ledger_credit(new_task->ledger,
22192 		    ledger_entry,
22193 		    delta);
22194 	}
22195 }
22196 
22197 /*
22198  * vm_map_get_pmap:
22199  * returns the pmap associated with the vm_map
22200  */
22201 pmap_t
vm_map_get_pmap(vm_map_t map)22202 vm_map_get_pmap(vm_map_t map)
22203 {
22204 	return vm_map_pmap(map);
22205 }
22206 
22207 #if CONFIG_MAP_RANGES
22208 /*
22209  * vm_map_range_map_init:
22210  *  initializes the VM range ID map to enable index lookup
22211  *  of user VM ranges based on VM tag from userspace.
22212  */
22213 static void
vm_map_range_map_init(void)22214 vm_map_range_map_init(void)
22215 {
22216 	/* maintain status quo by default */
22217 	for (int i = 0; i < VM_MEMORY_COUNT; i++) {
22218 		vm_map_range_id_map[i] = UMEM_RANGE_ID_DEFAULT;
22219 	}
22220 
22221 	/* move all MALLOC allocations to heap range  */
22222 	vm_map_range_id_map[VM_MEMORY_MALLOC] = UMEM_RANGE_ID_HEAP;
22223 	vm_map_range_id_map[VM_MEMORY_MALLOC_HUGE] = UMEM_RANGE_ID_HEAP;
22224 	vm_map_range_id_map[VM_MEMORY_MALLOC_LARGE] = UMEM_RANGE_ID_HEAP;
22225 	vm_map_range_id_map[VM_MEMORY_MALLOC_LARGE_REUSABLE] = UMEM_RANGE_ID_HEAP;
22226 	vm_map_range_id_map[VM_MEMORY_MALLOC_LARGE_REUSED] = UMEM_RANGE_ID_HEAP;
22227 	vm_map_range_id_map[VM_MEMORY_MALLOC_MEDIUM] = UMEM_RANGE_ID_HEAP;
22228 	vm_map_range_id_map[VM_MEMORY_MALLOC_NANO] = UMEM_RANGE_ID_HEAP;
22229 	vm_map_range_id_map[VM_MEMORY_MALLOC_PGUARD] = UMEM_RANGE_ID_HEAP;
22230 	vm_map_range_id_map[VM_MEMORY_MALLOC_PROB_GUARD] = UMEM_RANGE_ID_HEAP;
22231 	vm_map_range_id_map[VM_MEMORY_MALLOC_SMALL] = UMEM_RANGE_ID_HEAP;
22232 	vm_map_range_id_map[VM_MEMORY_MALLOC_TINY] = UMEM_RANGE_ID_HEAP;
22233 }
22234 
22235 /*
22236  * vm_map_range_configure:
22237  *	configures the user vm_map ranges by increasing the maximum VA range of
22238  *  the map and carving out a range at the end of VA space (searching backwards
22239  *  in the newly expanded map).
22240  */
kern_return_t
vm_map_range_configure(vm_map_t map)
{
	vm_map_size_t           addr_space_size;
	vm_map_offset_t         start, end, saved_max, random_addr;

	if (!vm_map_user_ranges) {
		/* user VM ranges are disabled: nothing to set up */
		return KERN_SUCCESS;
	}

	/* Should not be applying ranges to kernel map or kernel map submaps */
	assert(map != kernel_map);
	assert(vm_map_pmap(map) != kernel_pmap);

	/* save the existing max offset */
	vm_map_lock_read(map);
	saved_max = map->max_offset;
	vm_map_unlock_read(map);

	/*
	 * Check that we're not already jumbo'd. If so we cannot guarantee that
	 * we can set up the ranges safely without interfering with the existing
	 * map.
	 */
	if (saved_max > vm_compute_max_offset(vm_map_is_64bit(map))) {
		return KERN_NO_SPACE;
	}

	/* expand the default VM space to the largest possible address */
	vm_map_set_jumbo(map);

	vm_map_lock(map);
	/* VA space gained by the jumbo expansion */
	addr_space_size = map->max_offset - saved_max;

	if (addr_space_size <= VM_MAP_USER_RANGE_MAX) {
		/* not enough new VA space to carve out a range */
		vm_map_unlock(map);
		return KERN_NO_SPACE;
	}

	/* pick a page-aligned random offset within the extra VA space */
	addr_space_size -= VM_MAP_USER_RANGE_MAX;
	random_addr = (vm_map_offset_t)random();
	random_addr <<= VM_MAP_PAGE_SHIFT(map);
	random_addr %= addr_space_size;

	/*
	 * round off the start so we begin on a L2 TT boundary and ensure we have
	 * at least a ARM_TT_L2_SIZE sized hole between existing map range and
	 * new range(s).
	 */
	start = vm_map_round_page(saved_max + random_addr + 1, ARM_TT_L2_OFFMASK);
	end = MIN(map->max_offset, start + VM_MAP_USER_RANGE_MAX);
	assert(start > saved_max);
	assert(end <= map->max_offset);

	/* default range covers the "normal" heap range */
	map->user_range[UMEM_RANGE_ID_DEFAULT].min_address = map->min_offset;
	map->user_range[UMEM_RANGE_ID_DEFAULT].max_address = saved_max;

	/* heap range covers the new extended range */
	map->user_range[UMEM_RANGE_ID_HEAP].min_address = start;
	map->user_range[UMEM_RANGE_ID_HEAP].max_address = end;
	map->uses_user_ranges = true;
	vm_map_unlock(map);

	return KERN_SUCCESS;
}
22307 
22308 /*
22309  * vm_map_range_fork:
22310  *	clones the array of ranges from old_map to new_map in support
22311  *  of a VM map fork.
22312  */
22313 void
vm_map_range_fork(vm_map_t new_map,vm_map_t old_map)22314 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
22315 {
22316 	int i = 0;
22317 
22318 	if (!old_map->uses_user_ranges) {
22319 		/* nothing to do */
22320 		return;
22321 	}
22322 
22323 	for (i = 0; i < UMEM_RANGE_COUNT; i++) {
22324 		new_map->user_range[i].min_address = old_map->user_range[i].min_address;
22325 		new_map->user_range[i].max_address = old_map->user_range[i].max_address;
22326 	}
22327 
22328 	new_map->uses_user_ranges = true;
22329 }
22330 
22331 /*
22332  * vm_map_get_user_range_id:
22333  *	looks up the vm_map_range_id_map lookup table to determine which range ID to
22334  *  utilize for any given user memory tag. If no ranges are present return the
22335  *  default range.
22336  */
22337 __attribute__((overloadable))
22338 vm_map_range_id_t
vm_map_get_user_range_id(vm_map_t map,uint16_t tag)22339 vm_map_get_user_range_id(vm_map_t map, uint16_t tag)
22340 {
22341 	vm_map_range_id_t range_id = UMEM_RANGE_ID_DEFAULT;
22342 
22343 	if (map != NULL && map->uses_user_ranges && tag < VM_MEMORY_COUNT) {
22344 		range_id = vm_map_range_id_map[tag];
22345 	}
22346 
22347 	return range_id;
22348 }
22349 
22350 /*
22351  * vm_map_get_user_range_id:
22352  *	determines which range ID the given addr/size combination maps to. If
22353  *  range ID cannot be determined return the default range.
22354  */
22355 __attribute__((overloadable))
22356 vm_map_range_id_t
vm_map_get_user_range_id(vm_map_t map,mach_vm_offset_t addr,mach_vm_size_t size)22357 vm_map_get_user_range_id(
22358 	vm_map_t                map,
22359 	mach_vm_offset_t        addr,
22360 	mach_vm_size_t          size)
22361 {
22362 	vm_map_range_id_t range_id = UMEM_RANGE_ID_MAX;
22363 
22364 	if (map == NULL || !map->uses_user_ranges) {
22365 		return UMEM_RANGE_ID_DEFAULT;
22366 	}
22367 
22368 	for (; range_id > UMEM_RANGE_ID_DEFAULT; --range_id) {
22369 		if (mach_vm_range_contains(&map->user_range[range_id], addr, size)) {
22370 			break;
22371 		}
22372 	}
22373 
22374 	assert(range_id < UMEM_RANGE_COUNT);
22375 	return range_id;
22376 }
22377 
22378 /*
22379  * vm_map_get_user_range:
22380  *	copy the VM user range for the given VM map and range ID.
22381  */
22382 kern_return_t
vm_map_get_user_range(vm_map_t map,vm_map_range_id_t range_id,mach_vm_range_t range)22383 vm_map_get_user_range(
22384 	vm_map_t                map,
22385 	vm_map_range_id_t       range_id,
22386 	mach_vm_range_t         range)
22387 {
22388 	if (map == NULL ||
22389 	    !map->uses_user_ranges ||
22390 	    range_id > UMEM_RANGE_ID_MAX ||
22391 	    range == NULL) {
22392 		return KERN_INVALID_ARGUMENT;
22393 	}
22394 
22395 	*range = map->user_range[range_id];
22396 	return KERN_SUCCESS;
22397 }
22398 #endif /* CONFIG_MAP_RANGES */
22399 
/*
 * vm_map_entry_has_device_pager:
 * Check if the vm map entry specified by the virtual address has a device pager.
 * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
 */
boolean_t
vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
{
	vm_map_entry_t entry;
	vm_object_t object;
	boolean_t result;

	if (map == NULL) {
		return FALSE;
	}

	vm_map_lock(map);
	/*
	 * Descend through any submap entries until we reach the terminal
	 * entry for vaddr; "map" always names the map we currently hold
	 * locked, so the single unlock below is always balanced.
	 */
	while (TRUE) {
		if (!vm_map_lookup_entry(map, vaddr, &entry)) {
			/* no entry covers vaddr in the current map */
			result = FALSE;
			break;
		}
		if (entry->is_sub_map) {
			// Check the submap
			vm_map_t submap = VME_SUBMAP(entry);
			assert(submap != NULL);
			/*
			 * Lock the submap before releasing the parent so the
			 * submap cannot be torn down in between.
			 *
			 * NOTE(review): vaddr is reused unchanged in the submap,
			 * i.e. no translation via VME_OFFSET/vme_start is applied —
			 * assumes submaps here are mapped at matching addresses;
			 * confirm against callers.
			 */
			vm_map_lock(submap);
			vm_map_unlock(map);
			map = submap;
			continue;
		}
		object = VME_OBJECT(entry);
		/* device pager is identified by its memory-object pager ops */
		if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
			result = TRUE;
			break;
		}
		result = FALSE;
		break;
	}

	/* unlocks whichever map (original or deepest submap) is still held */
	vm_map_unlock(map);
	return result;
}
22443 
22444 
22445 #if MACH_ASSERT
22446 
22447 extern int pmap_ledgers_panic;
22448 extern int pmap_ledgers_panic_leeway;
22449 
/*
 * LEDGER_DRIFT(name) declares the per-ledger drift statistics tracked
 * globally across all pmaps checked: how many times ledger "name" was
 * found positive ("over") or negative ("under") at check time, the
 * accumulated totals of each, and the worst single value observed.
 */
#define LEDGER_DRIFT(__LEDGER)                    \
	int             __LEDGER##_over;          \
	ledger_amount_t __LEDGER##_over_total;    \
	ledger_amount_t __LEDGER##_over_max;      \
	int             __LEDGER##_under;         \
	ledger_amount_t __LEDGER##_under_total;   \
	ledger_amount_t __LEDGER##_under_max

/*
 * Global accumulator updated by vm_map_pmap_check_ledgers(); one
 * LEDGER_DRIFT field group per task ledger that gets balance-checked.
 */
struct {
	uint64_t        num_pmaps_checked;

	LEDGER_DRIFT(phys_footprint);
	LEDGER_DRIFT(internal);
	LEDGER_DRIFT(internal_compressed);
	LEDGER_DRIFT(external);
	LEDGER_DRIFT(reusable);
	LEDGER_DRIFT(iokit_mapped);
	LEDGER_DRIFT(alternate_accounting);
	LEDGER_DRIFT(alternate_accounting_compressed);
	LEDGER_DRIFT(page_table);
	LEDGER_DRIFT(purgeable_volatile);
	LEDGER_DRIFT(purgeable_nonvolatile);
	LEDGER_DRIFT(purgeable_volatile_compressed);
	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
	LEDGER_DRIFT(tagged_nofootprint);
	LEDGER_DRIFT(tagged_footprint);
	LEDGER_DRIFT(tagged_nofootprint_compressed);
	LEDGER_DRIFT(tagged_footprint_compressed);
	LEDGER_DRIFT(network_volatile);
	LEDGER_DRIFT(network_nonvolatile);
	LEDGER_DRIFT(network_volatile_compressed);
	LEDGER_DRIFT(network_nonvolatile_compressed);
	LEDGER_DRIFT(media_nofootprint);
	LEDGER_DRIFT(media_footprint);
	LEDGER_DRIFT(media_nofootprint_compressed);
	LEDGER_DRIFT(media_footprint_compressed);
	LEDGER_DRIFT(graphics_nofootprint);
	LEDGER_DRIFT(graphics_footprint);
	LEDGER_DRIFT(graphics_nofootprint_compressed);
	LEDGER_DRIFT(graphics_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint);
	LEDGER_DRIFT(neural_footprint);
	LEDGER_DRIFT(neural_nofootprint_compressed);
	LEDGER_DRIFT(neural_footprint_compressed);
} pmap_ledgers_drift;
22495 
/*
 * vm_map_pmap_check_ledgers:
 *	Audit every task ledger for a pmap being torn down. Any nonzero
 *	balance is logged and folded into the global pmap_ledgers_drift
 *	statistics; depending on the ledger's panic-on-negative setting and
 *	the pmap_ledgers_panic / pmap_ledgers_panic_leeway boot-args, an
 *	imbalance either panics or just prints a message.
 */
void
vm_map_pmap_check_ledgers(
	pmap_t          pmap,
	ledger_t        ledger,
	int             pid,
	char            *procname)
{
	ledger_amount_t bal;
	boolean_t       do_panic;

	do_panic = FALSE;

	pmap_ledgers_drift.num_pmaps_checked++;

/*
 * LEDGER_CHECK_BALANCE(name): read the balance of task ledger "name";
 * if it is nonzero, decide whether this merits a panic (per-ledger
 * panic_on_negative flag, or global leeway exceeded), log the balance,
 * and accumulate over/under drift stats for that ledger.
 */
#define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
MACRO_BEGIN                                                             \
	int panic_on_negative = TRUE;                                   \
	ledger_get_balance(ledger,                                      \
	                   task_ledgers.__LEDGER,                       \
	                   &bal);                                       \
	ledger_get_panic_on_negative(ledger,                            \
	                             task_ledgers.__LEDGER,             \
	                             &panic_on_negative);               \
	if (bal != 0) {                                                 \
	        if (panic_on_negative ||                                \
	            (pmap_ledgers_panic &&                              \
	             pmap_ledgers_panic_leeway > 0 &&                   \
	             (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
	              bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
	                do_panic = TRUE;                                \
	        }                                                       \
	        printf("LEDGER BALANCE proc %d (%s) "                   \
	               "\"%s\" = %lld\n",                               \
	               pid, procname, #__LEDGER, bal);                  \
	        if (bal > 0) {                                          \
	                pmap_ledgers_drift.__LEDGER##_over++;           \
	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
	                }                                               \
	        } else if (bal < 0) {                                   \
	                pmap_ledgers_drift.__LEDGER##_under++;          \
	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
	                }                                               \
	        }                                                       \
	}                                                               \
MACRO_END

	/* one check per ledger mirrored in the pmap_ledgers_drift struct */
	LEDGER_CHECK_BALANCE(phys_footprint);
	LEDGER_CHECK_BALANCE(internal);
	LEDGER_CHECK_BALANCE(internal_compressed);
	LEDGER_CHECK_BALANCE(external);
	LEDGER_CHECK_BALANCE(reusable);
	LEDGER_CHECK_BALANCE(iokit_mapped);
	LEDGER_CHECK_BALANCE(alternate_accounting);
	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
	LEDGER_CHECK_BALANCE(page_table);
	LEDGER_CHECK_BALANCE(purgeable_volatile);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(tagged_nofootprint);
	LEDGER_CHECK_BALANCE(tagged_footprint);
	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
	LEDGER_CHECK_BALANCE(network_volatile);
	LEDGER_CHECK_BALANCE(network_nonvolatile);
	LEDGER_CHECK_BALANCE(network_volatile_compressed);
	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(media_nofootprint);
	LEDGER_CHECK_BALANCE(media_footprint);
	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(media_footprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_nofootprint);
	LEDGER_CHECK_BALANCE(graphics_footprint);
	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint);
	LEDGER_CHECK_BALANCE(neural_footprint);
	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(neural_footprint_compressed);

	/* panic only when pmap_ledgers_panic is set; otherwise just log */
	if (do_panic) {
		if (pmap_ledgers_panic) {
			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
			    pmap, pid, procname);
		} else {
			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
			    pmap, pid, procname);
		}
	}
}
22590 
/*
 * vm_map_pmap_set_process:
 *	thin wrapper that forwards the owning process' pid and name to the
 *	map's pmap via pmap_set_process().
 */
void
vm_map_pmap_set_process(
	vm_map_t map,
	int pid,
	char *procname)
{
	pmap_set_process(vm_map_pmap(map), pid, procname);
}
22599 
22600 #endif /* MACH_ASSERT */
22601