xref: /xnu-8020.140.41/osfmk/vm/vm_map.c (revision 27b03b360a988dfd3dfdf34262bb0042026747cc)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_map.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Virtual memory mapping module.
64  */
65 
66 #include <mach_assert.h>
67 
68 #include <vm/vm_options.h>
69 
70 #include <libkern/OSAtomic.h>
71 
72 #include <mach/kern_return.h>
73 #include <mach/port.h>
74 #include <mach/vm_attributes.h>
75 #include <mach/vm_param.h>
76 #include <mach/vm_behavior.h>
77 #include <mach/vm_statistics.h>
78 #include <mach/memory_object.h>
79 #include <mach/mach_vm.h>
80 #include <machine/cpu_capabilities.h>
81 #include <mach/sdt.h>
82 
83 #include <kern/assert.h>
84 #include <kern/backtrace.h>
85 #include <kern/counter.h>
86 #include <kern/exc_guard.h>
87 #include <kern/kalloc.h>
88 #include <kern/zalloc_internal.h>
89 
90 #include <vm/cpm.h>
91 #include <vm/vm_compressor.h>
92 #include <vm/vm_compressor_pager.h>
93 #include <vm/vm_init.h>
94 #include <vm/vm_fault.h>
95 #include <vm/vm_map_internal.h>
96 #include <vm/vm_object.h>
97 #include <vm/vm_page.h>
98 #include <vm/vm_pageout.h>
99 #include <vm/pmap.h>
100 #include <vm/vm_kern.h>
101 #include <ipc/ipc_port.h>
102 #include <kern/sched_prim.h>
103 #include <kern/misc_protos.h>
104 
105 #include <mach/vm_map_server.h>
106 #include <mach/mach_host_server.h>
107 #include <vm/vm_protos.h>
108 #include <vm/vm_purgeable_internal.h>
109 
110 #include <vm/vm_protos.h>
111 #include <vm/vm_shared_region.h>
112 #include <vm/vm_map_store.h>
113 
114 #include <san/kasan.h>
115 
116 #include <sys/resource.h>
117 #include <sys/codesign.h>
118 #include <sys/mman.h>
119 #include <sys/reboot.h>
120 #include <sys/kdebug_triage.h>
121 
122 #if __LP64__
123 #define HAVE_VM_MAP_RESERVED_ENTRY_ZONE 0
124 #else
125 #define HAVE_VM_MAP_RESERVED_ENTRY_ZONE 1
126 #endif
127 
128 #include <libkern/section_keywords.h>
129 #if DEVELOPMENT || DEBUG
130 extern int proc_selfcsflags(void);
131 int panic_on_unsigned_execute = 0;
132 int panic_on_mlock_failure = 0;
133 #endif /* DEVELOPMENT || DEBUG */
134 
135 #if MACH_ASSERT
136 int debug4k_filter = 0;
137 char debug4k_proc_name[1024] = "";
138 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
139 int debug4k_panic_on_misaligned_sharing = 0;
140 const char *debug4k_category_name[] = {
141 	"error",        /* 0 */
142 	"life",         /* 1 */
143 	"load",         /* 2 */
144 	"fault",        /* 3 */
145 	"copy",         /* 4 */
146 	"share",        /* 5 */
147 	"adjust",       /* 6 */
148 	"pmap",         /* 7 */
149 	"mementry",     /* 8 */
150 	"iokit",        /* 9 */
151 	"upl",          /* 10 */
152 	"exc",          /* 11 */
153 	"vfs"           /* 12 */
154 };
155 #endif /* MACH_ASSERT */
156 int debug4k_no_cow_copyin = 0;
157 
158 
159 #if __arm64__
160 extern const int fourk_binary_compatibility_unsafe;
161 extern const int fourk_binary_compatibility_allow_wx;
162 #endif /* __arm64__ */
163 extern int proc_selfpid(void);
164 extern char *proc_name_address(void *p);
165 
166 #if VM_MAP_DEBUG_APPLE_PROTECT
167 int vm_map_debug_apple_protect = 0;
168 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
169 #if VM_MAP_DEBUG_FOURK
170 int vm_map_debug_fourk = 0;
171 #endif /* VM_MAP_DEBUG_FOURK */
172 
173 #if DEBUG || DEVELOPMENT
174 static TUNABLE(bool, vm_map_executable_immutable,
175     "vm_map_executable_immutable", true);
176 #else
177 #define vm_map_executable_immutable true
178 #endif
179 
180 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
181 
182 extern u_int32_t random(void);  /* from <libkern/libkern.h> */
183 /* Internal prototypes
184  */
185 
186 typedef struct vm_map_zap {
187 	vm_map_entry_t          vmz_head;
188 	vm_map_entry_t         *vmz_tail;
189 } *vm_map_zap_t;
190 
191 #define VM_MAP_ZAP_DECLARE(zap) \
192 	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
193 
194 static vm_map_entry_t   vm_map_entry_insert(
195 	vm_map_t                map,
196 	vm_map_entry_t          insp_entry,
197 	vm_map_offset_t         start,
198 	vm_map_offset_t         end,
199 	vm_object_t             object,
200 	vm_object_offset_t      offset,
201 	vm_map_kernel_flags_t   vmk_flags,
202 	boolean_t               needs_copy,
203 	vm_prot_t               cur_protection,
204 	vm_prot_t               max_protection,
205 	vm_inherit_t            inheritance,
206 	boolean_t               no_cache,
207 	boolean_t               permanent,
208 	unsigned int            superpage_size,
209 	boolean_t               clear_map_aligned,
210 	int                     alias);
211 
212 static void vm_map_simplify_range(
213 	vm_map_t        map,
214 	vm_map_offset_t start,
215 	vm_map_offset_t end);   /* forward */
216 
217 static boolean_t        vm_map_range_check(
218 	vm_map_t        map,
219 	vm_map_offset_t start,
220 	vm_map_offset_t end,
221 	vm_map_entry_t  *entry);
222 
223 static void vm_map_submap_pmap_clean(
224 	vm_map_t        map,
225 	vm_map_offset_t start,
226 	vm_map_offset_t end,
227 	vm_map_t        sub_map,
228 	vm_map_offset_t offset);
229 
230 static void             vm_map_pmap_enter(
231 	vm_map_t                map,
232 	vm_map_offset_t         addr,
233 	vm_map_offset_t         end_addr,
234 	vm_object_t             object,
235 	vm_object_offset_t      offset,
236 	vm_prot_t               protection);
237 
238 static void             _vm_map_clip_end(
239 	struct vm_map_header    *map_header,
240 	vm_map_entry_t          entry,
241 	vm_map_offset_t         end);
242 
243 static void             _vm_map_clip_start(
244 	struct vm_map_header    *map_header,
245 	vm_map_entry_t          entry,
246 	vm_map_offset_t         start);
247 
248 static kmem_return_t vm_map_delete(
249 	vm_map_t        map,
250 	vm_map_offset_t start,
251 	vm_map_offset_t end,
252 	vmr_flags_t     flags,
253 	kmem_guard_t    guard,
254 	vm_map_zap_t    zap);
255 
256 static void             vm_map_copy_insert(
257 	vm_map_t        map,
258 	vm_map_entry_t  after_where,
259 	vm_map_copy_t   copy);
260 
261 static kern_return_t    vm_map_copy_overwrite_unaligned(
262 	vm_map_t        dst_map,
263 	vm_map_entry_t  entry,
264 	vm_map_copy_t   copy,
265 	vm_map_address_t start,
266 	boolean_t       discard_on_success);
267 
268 static kern_return_t    vm_map_copy_overwrite_aligned(
269 	vm_map_t        dst_map,
270 	vm_map_entry_t  tmp_entry,
271 	vm_map_copy_t   copy,
272 	vm_map_offset_t start,
273 	pmap_t          pmap);
274 
275 static kern_return_t    vm_map_copyin_kernel_buffer(
276 	vm_map_t        src_map,
277 	vm_map_address_t src_addr,
278 	vm_map_size_t   len,
279 	boolean_t       src_destroy,
280 	vm_map_copy_t   *copy_result);  /* OUT */
281 
282 static kern_return_t    vm_map_copyout_kernel_buffer(
283 	vm_map_t        map,
284 	vm_map_address_t *addr, /* IN/OUT */
285 	vm_map_copy_t   copy,
286 	vm_map_size_t   copy_size,
287 	boolean_t       overwrite,
288 	boolean_t       consume_on_success);
289 
290 static void             vm_map_fork_share(
291 	vm_map_t        old_map,
292 	vm_map_entry_t  old_entry,
293 	vm_map_t        new_map);
294 
295 static boolean_t        vm_map_fork_copy(
296 	vm_map_t        old_map,
297 	vm_map_entry_t  *old_entry_p,
298 	vm_map_t        new_map,
299 	int             vm_map_copyin_flags);
300 
301 static kern_return_t    vm_map_wire_nested(
302 	vm_map_t                   map,
303 	vm_map_offset_t            start,
304 	vm_map_offset_t            end,
305 	vm_prot_t                  caller_prot,
306 	vm_tag_t                   tag,
307 	boolean_t                  user_wire,
308 	pmap_t                     map_pmap,
309 	vm_map_offset_t            pmap_addr,
310 	ppnum_t                    *physpage_p);
311 
312 static kern_return_t    vm_map_unwire_nested(
313 	vm_map_t                   map,
314 	vm_map_offset_t            start,
315 	vm_map_offset_t            end,
316 	boolean_t                  user_wire,
317 	pmap_t                     map_pmap,
318 	vm_map_offset_t            pmap_addr);
319 
320 static kern_return_t    vm_map_overwrite_submap_recurse(
321 	vm_map_t                   dst_map,
322 	vm_map_offset_t            dst_addr,
323 	vm_map_size_t              dst_size);
324 
325 static kern_return_t    vm_map_copy_overwrite_nested(
326 	vm_map_t                   dst_map,
327 	vm_map_offset_t            dst_addr,
328 	vm_map_copy_t              copy,
329 	boolean_t                  interruptible,
330 	pmap_t                     pmap,
331 	boolean_t                  discard_on_success);
332 
333 static kern_return_t    vm_map_remap_extract(
334 	vm_map_t                map,
335 	vm_map_offset_t         addr,
336 	vm_map_size_t           size,
337 	boolean_t               copy,
338 	struct vm_map_header    *map_header,
339 	vm_prot_t               *cur_protection,
340 	vm_prot_t               *max_protection,
341 	vm_inherit_t            inheritance,
342 	vm_map_kernel_flags_t   vmk_flags);
343 
344 static kern_return_t    vm_map_remap_range_allocate(
345 	vm_map_t                map,
346 	vm_map_address_t        *address,
347 	vm_map_size_t           size,
348 	vm_map_offset_t         mask,
349 	int                     flags,
350 	vm_map_kernel_flags_t   vmk_flags,
351 	vm_tag_t                tag,
352 	vm_map_entry_t          *map_entry,
353 	vm_map_zap_t            zap_list);
354 
355 static void             vm_map_region_look_for_page(
356 	vm_map_t                   map,
357 	vm_map_offset_t            va,
358 	vm_object_t                object,
359 	vm_object_offset_t         offset,
360 	int                        max_refcnt,
361 	unsigned short             depth,
362 	vm_region_extended_info_t  extended,
363 	mach_msg_type_number_t count);
364 
365 static int              vm_map_region_count_obj_refs(
366 	vm_map_entry_t             entry,
367 	vm_object_t                object);
368 
369 
370 static kern_return_t    vm_map_willneed(
371 	vm_map_t        map,
372 	vm_map_offset_t start,
373 	vm_map_offset_t end);
374 
375 static kern_return_t    vm_map_reuse_pages(
376 	vm_map_t        map,
377 	vm_map_offset_t start,
378 	vm_map_offset_t end);
379 
380 static kern_return_t    vm_map_reusable_pages(
381 	vm_map_t        map,
382 	vm_map_offset_t start,
383 	vm_map_offset_t end);
384 
385 static kern_return_t    vm_map_can_reuse(
386 	vm_map_t        map,
387 	vm_map_offset_t start,
388 	vm_map_offset_t end);
389 
390 #if MACH_ASSERT
391 static kern_return_t    vm_map_pageout(
392 	vm_map_t        map,
393 	vm_map_offset_t start,
394 	vm_map_offset_t end);
395 #endif /* MACH_ASSERT */
396 
397 kern_return_t vm_map_corpse_footprint_collect(
398 	vm_map_t        old_map,
399 	vm_map_entry_t  old_entry,
400 	vm_map_t        new_map);
401 void vm_map_corpse_footprint_collect_done(
402 	vm_map_t        new_map);
403 void vm_map_corpse_footprint_destroy(
404 	vm_map_t        map);
405 kern_return_t vm_map_corpse_footprint_query_page_info(
406 	vm_map_t        map,
407 	vm_map_offset_t va,
408 	int             *disposition_p);
409 void vm_map_footprint_query_page_info(
410 	vm_map_t        map,
411 	vm_map_entry_t  map_entry,
412 	vm_map_offset_t curr_s_offset,
413 	int             *disposition_p);
414 
415 pid_t find_largest_process_vm_map_entries(void);
416 
417 extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code,
418     mach_exception_data_type_t subcode);
419 
420 /*
421  * Macros to copy a vm_map_entry. We must be careful to correctly
422  * manage the wired page count. vm_map_entry_copy() creates a new
423  * map entry to the same memory - the wired count in the new entry
424  * must be set to zero. vm_map_entry_copy_full() creates a new
425  * entry that is identical to the old entry.  This preserves the
426  * wire count; it's used for map splitting and zone changing in
427  * vm_map_copyout.
428  */
429 
430 static inline void
431 vm_map_entry_copy_pmap_cs_assoc(
432 	vm_map_t map __unused,
433 	vm_map_entry_t new __unused,
434 	vm_map_entry_t old __unused)
435 {
436 	/* when pmap_cs is not enabled, assert as a sanity check */
437 	assert(new->pmap_cs_associated == FALSE);
438 }
439 
440 /*
441  * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
442  * But for security reasons on some platforms, we don't want the
443  * new mapping to be "used for jit", so we reset the flag here.
444  */
445 static inline void
446 vm_map_entry_copy_code_signing(
447 	vm_map_t map,
448 	vm_map_entry_t new,
449 	vm_map_entry_t old __unused)
450 {
451 	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
452 		assert(new->used_for_jit == old->used_for_jit);
453 	} else {
454 		new->used_for_jit = FALSE;
455 	}
456 }
457 
458 static inline void
459 vm_map_entry_copy_full(
460 	vm_map_entry_t new,
461 	vm_map_entry_t old)
462 {
463 #if MAP_ENTRY_CREATION_DEBUG
464 	btref_put(new->vme_creation_bt);
465 	btref_retain(old->vme_creation_bt);
466 #endif
467 #if MAP_ENTRY_INSERTION_DEBUG
468 	btref_put(new->vme_insertion_bt);
469 	btref_retain(old->vme_insertion_bt);
470 #endif
471 	*new = *old;
472 }
473 
474 static inline void
475 vm_map_entry_copy(
476 	vm_map_t map,
477 	vm_map_entry_t new,
478 	vm_map_entry_t old)
479 {
480 	vm_map_entry_copy_full(new, old);
481 
482 	new->is_shared = FALSE;
483 	new->needs_wakeup = FALSE;
484 	new->in_transition = FALSE;
485 	new->wired_count = 0;
486 	new->user_wired_count = 0;
487 	new->permanent = FALSE;
488 	vm_map_entry_copy_code_signing(map, new, old);
489 	vm_map_entry_copy_pmap_cs_assoc(map, new, old);
490 	if (new->iokit_acct) {
491 		assertf(!new->use_pmap, "old %p new %p\n", old, new);
492 		new->iokit_acct = FALSE;
493 		new->use_pmap = TRUE;
494 	}
495 	new->vme_resilient_codesign = FALSE;
496 	new->vme_resilient_media = FALSE;
497 	new->vme_atomic = FALSE;
498 	new->vme_no_copy_on_read = FALSE;
499 }
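
/*
 * Illustrative sketch (editorial, not part of the build): how the two copy
 * helpers above differ.  Assuming "old" points at an entry with a non-zero
 * wired count:
 *
 *	struct vm_map_entry dup;
 *
 *	vm_map_entry_copy_full(&dup, old);
 *	//   dup.wired_count == old->wired_count; all flags preserved verbatim.
 *
 *	vm_map_entry_copy(map, &dup, old);
 *	//   dup.wired_count == 0 and dup.user_wired_count == 0;
 *	//   is_shared / in_transition / permanent are cleared, and
 *	//   "used_for_jit" is reset unless VM_MAP_POLICY_ALLOW_JIT_COPY(map).
 */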
500 
501 /*
502  * Normal lock_read_to_write() returns FALSE/0 on failure.
503  * These functions evaluate to zero on success and non-zero value on failure.
504  */
505 __attribute__((always_inline))
506 int
507 vm_map_lock_read_to_write(vm_map_t map)
508 {
509 	if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
510 		DTRACE_VM(vm_map_lock_upgrade);
511 		return 0;
512 	}
513 	return 1;
514 }
515 
516 __attribute__((always_inline))
517 boolean_t
518 vm_map_try_lock(vm_map_t map)
519 {
520 	if (lck_rw_try_lock_exclusive(&(map)->lock)) {
521 		DTRACE_VM(vm_map_lock_w);
522 		return TRUE;
523 	}
524 	return FALSE;
525 }
526 
527 __attribute__((always_inline))
528 boolean_t
529 vm_map_try_lock_read(vm_map_t map)
530 {
531 	if (lck_rw_try_lock_shared(&(map)->lock)) {
532 		DTRACE_VM(vm_map_lock_r);
533 		return TRUE;
534 	}
535 	return FALSE;
536 }
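
/*
 * Illustrative sketch (editorial, not part of the build): the usual upgrade
 * pattern.  Note the inverted convention: vm_map_lock_read_to_write()
 * returns 0 on success.  When the upgrade fails, the shared lock has
 * already been dropped by lck_rw_lock_shared_to_exclusive(), so the caller
 * must reacquire and re-validate.
 *
 *	vm_map_lock_read(map);
 *	// ... inspect entries ...
 *	if (vm_map_lock_read_to_write(map)) {
 *		// upgrade failed: the read lock is gone, start over
 *		vm_map_lock(map);
 *		// ... re-validate the lookup ...
 *	}
 *	// ... modify entries under the exclusive lock ...
 *	vm_map_unlock(map);
 */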
537 
538 /*
539  * Routines to get the page size the caller should
540  * use while inspecting the target address space.
541  * Use the "_safely" variant if the caller is dealing with a user-provided
542  * array whose size depends on the page size, to avoid any overflow or
543  * underflow of a user-allocated buffer.
544  */
545 int
546 vm_self_region_page_shift_safely(
547 	vm_map_t target_map)
548 {
549 	int effective_page_shift = 0;
550 
551 	if (PAGE_SIZE == (4096)) {
552 		/* x86_64 and 4k watches: always use 4k */
553 		return PAGE_SHIFT;
554 	}
555 	/* did caller provide an explicit page size for this thread to use? */
556 	effective_page_shift = thread_self_region_page_shift();
557 	if (effective_page_shift) {
558 		/* use the explicitly-provided page size */
559 		return effective_page_shift;
560 	}
561 	/* no explicit page size: use the caller's page size... */
562 	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
563 	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
564 		/* page size match: safe to use */
565 		return effective_page_shift;
566 	}
567 	/* page size mismatch */
568 	return -1;
569 }
570 int
571 vm_self_region_page_shift(
572 	vm_map_t target_map)
573 {
574 	int effective_page_shift;
575 
576 	effective_page_shift = vm_self_region_page_shift_safely(target_map);
577 	if (effective_page_shift == -1) {
578 		/* no safe value but OK to guess for caller */
579 		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
580 		    VM_MAP_PAGE_SHIFT(target_map));
581 	}
582 	return effective_page_shift;
583 }
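
/*
 * Illustrative sketch (editorial, not part of the build): using the
 * "_safely" variant to size a page-indexed, user-visible buffer, so that a
 * 4K caller inspecting a 16K target map cannot over- or under-run it.
 * "region_size" is a hypothetical, page-aligned region length.
 *
 *	int shift = vm_self_region_page_shift_safely(target_map);
 *	if (shift == -1) {
 *		return KERN_INVALID_ARGUMENT;   // page size mismatch
 *	}
 *	unsigned int npages = (unsigned int)(region_size >> shift);
 */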
584 
585 
586 /*
587  *	Decide if we want to allow processes to execute from their data or stack areas.
588  *	override_nx() returns true if we do.  Data/stack execution can be enabled independently
589  *	for 32 and 64 bit processes.  Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
590  *	or allow_stack_exec to enable data execution for that type of data area for that particular
591  *	ABI (or both by or'ing the flags together).  These are initialized in the architecture
592  *	specific pmap files since the default behavior varies according to architecture.  The
593  *	main reason it varies is because of the need to provide binary compatibility with old
594  *	applications that were written before these restrictions came into being.  In the old
595  *	days, an app could execute anything it could read, but this has slowly been tightened
596  *	up over time.  The default behavior is:
597  *
598  *	32-bit PPC apps		may execute from both stack and data areas
599  *	32-bit Intel apps	may execute from data areas but not stack
600  *	64-bit PPC/Intel apps	may not execute from either data or stack
601  *
602  *	An application on any architecture may override these defaults by explicitly
603  *	adding PROT_EXEC permission to the page in question with the mprotect(2)
604  *	system call.  This code here just determines what happens when an app tries to
605  *      execute from a page that lacks execute permission.
606  *
607  *	Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
608  *	default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
609  *	a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
610  *	execution from data areas for a particular binary even if the arch normally permits it. As
611  *	a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
612  *	to support some complicated use cases, notably browsers with out-of-process plugins that
613  *	are not all NX-safe.
614  */
615 
616 extern int allow_data_exec, allow_stack_exec;
617 
618 int
619 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
620 {
621 	int current_abi;
622 
623 	if (map->pmap == kernel_pmap) {
624 		return FALSE;
625 	}
626 
627 	/*
628 	 * Determine if the app is running in 32 or 64 bit mode.
629 	 */
630 
631 	if (vm_map_is_64bit(map)) {
632 		current_abi = VM_ABI_64;
633 	} else {
634 		current_abi = VM_ABI_32;
635 	}
636 
637 	/*
638 	 * Determine if we should allow the execution based on whether it's a
639 	 * stack or data area and the current architecture.
640 	 */
641 
642 	if (user_tag == VM_MEMORY_STACK) {
643 		return allow_stack_exec & current_abi;
644 	}
645 
646 	return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
647 }
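
/*
 * Illustrative sketch (editorial, not part of the original source): how the
 * bits combine.  With, for example:
 *
 *	allow_data_exec  = VM_ABI_32;   // data execution for 32-bit tasks only
 *	allow_stack_exec = 0;           // never from the stack
 *
 * a 32-bit task faulting on a non-stack mapping gets a non-zero result from
 * override_nx() (the fault is tolerated), while a 64-bit task, any stack
 * mapping, or a map with map_disallow_data_exec set gets 0.
 */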
648 
649 
650 /*
651  *	Virtual memory maps provide for the mapping, protection,
652  *	and sharing of virtual memory objects.  In addition,
653  *	this module provides for an efficient virtual copy of
654  *	memory from one map to another.
655  *
656  *	Synchronization is required prior to most operations.
657  *
658  *	Maps consist of an ordered doubly-linked list of simple
659  *	entries; a single hint is used to speed up lookups.
660  *
661  *	Sharing maps have been deleted from this version of Mach.
662  *	All shared objects are now mapped directly into the respective
663  *	maps.  This requires a change in the copy on write strategy;
664  *	the asymmetric (delayed) strategy is used for shared temporary
665  *	objects instead of the symmetric (shadow) strategy.  All maps
666  *	are now "top level" maps (either task map, kernel map or submap
667  *	of the kernel map).
668  *
669  *	Since portions of maps are specified by start/end addresses,
670  *	which may not align with existing map entries, all
671  *	routines merely "clip" entries to these start/end values.
672  *	[That is, an entry is split into two, bordering at a
673  *	start or end value.]  Note that these clippings may not
674  *	always be necessary (as the two resulting entries are then
675  *	not changed); however, the clipping is done for convenience.
676  *	No attempt is currently made to "glue back together" two
677  *	abutting entries.
678  *
679  *	The symmetric (shadow) copy strategy implements virtual copy
680  *	by copying VM object references from one map to
681  *	another, and then marking both regions as copy-on-write.
682  *	It is important to note that only one writeable reference
683  *	to a VM object region exists in any map when this strategy
684  *	is used -- this means that shadow object creation can be
685  *	delayed until a write operation occurs.  The asymmetric (delayed)
686  *	strategy allows multiple maps to have writeable references to
687  *	the same region of a vm object, and hence cannot delay creating
688  *	its copy objects.  See vm_object_copy_quickly() in vm_object.c.
689  *	Copying of permanent objects is completely different; see
690  *	vm_object_copy_strategically() in vm_object.c.
691  */
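
/*
 * Editorial sketch (not from the original source): what clipping looks like
 * concretely.  Given one entry covering [0x1000, 0x5000) and an operation
 * targeting [0x3000, 0x5000), clipping the start at 0x3000 splits it into
 * two abutting entries:
 *
 *	before:  [0x1000 .............................. 0x5000)
 *	after:   [0x1000 ...... 0x3000)[0x3000 ....... 0x5000)
 *
 * Both halves initially reference the same object, offset and protections;
 * only the second is then modified by the caller.
 */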
692 
693 static SECURITY_READ_ONLY_LATE(zone_t) vm_map_zone;       /* zone for vm_map structures */
694 static SECURITY_READ_ONLY_LATE(zone_t) vm_map_copy_zone;  /* zone for vm_map_copy structures */
695 
696 SECURITY_READ_ONLY_LATE(zone_t)        vm_map_entry_zone; /* zone for vm_map_entry structures */
697 SECURITY_READ_ONLY_LATE(zone_t)        vm_map_holes_zone; /* zone for vm map holes (vm_map_links) structures */
698 #if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
699 SECURITY_READ_ONLY_LATE(zone_t)        vm_map_entry_reserved_zone;
700 #endif /* HAVE_VM_MAP_RESERVED_ENTRY_ZONE */
701 
702 #define VM_MAP_ZONE_NAME "maps"
703 #define VM_MAP_ZFLAGS ( \
704 	ZC_NOENCRYPT | \
705 	ZC_NOGZALLOC | \
706 	ZC_VM_LP64)
707 
708 #define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
709 #define VM_MAP_ENTRY_ZFLAGS ( \
710 	ZC_NOENCRYPT | \
711 	ZC_CACHING | \
712 	ZC_NOGZALLOC | \
713 	ZC_KASAN_NOQUARANTINE | \
714 	ZC_VM_LP64)
715 
716 #if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
717 #define VM_MAP_ENTRY_RESERVED_ZONE_NAME "Reserved VM map entries"
718 #define VM_MAP_ENTRY_RESERVED_ZFLAGS ( \
719 	ZC_NOENCRYPT | \
720 	ZC_NOCACHING | \
721 	ZC_NOGZALLOC | \
722 	ZC_KASAN_NOQUARANTINE | \
723 	ZC_VM)
724 #endif /* HAVE_VM_MAP_RESERVED_ENTRY_ZONE */
725 
726 #define VM_MAP_HOLES_ZONE_NAME "VM map holes"
727 #define VM_MAP_HOLES_ZFLAGS ( \
728 	ZC_NOENCRYPT | \
729 	ZC_CACHING | \
730 	ZC_NOGZALLOC | \
731 	ZC_KASAN_NOQUARANTINE | \
732 	ZC_VM_LP64)
733 
734 /*
735  * Asserts that a vm_map_copy object is coming from the
736  * vm_map_copy_zone to ensure that it isn't a fake constructed
737  * anywhere else.
738  */
739 void
740 vm_map_copy_require(struct vm_map_copy *copy)
741 {
742 	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
743 }
744 
745 /*
746  *	vm_map_require:
747  *
748  *	Ensures that the argument is memory allocated from the genuine
749  *	vm map zone. (See zone_id_require_allow_foreign).
750  */
751 void
752 vm_map_require(vm_map_t map)
753 {
754 	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
755 }
756 
757 #define VM_MAP_EARLY_COUNT_MAX         16
758 static __startup_data vm_offset_t      map_data;
759 static __startup_data vm_size_t        map_data_size;
760 static __startup_data vm_offset_t      kentry_data;
761 static __startup_data vm_size_t        kentry_data_size;
762 static __startup_data vm_offset_t      map_holes_data;
763 static __startup_data vm_size_t        map_holes_data_size;
764 static __startup_data vm_map_t        *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
765 static __startup_data uint32_t         early_map_count;
766 
767 #if XNU_TARGET_OS_OSX
768 #define         NO_COALESCE_LIMIT  ((1024 * 128) - 1)
769 #else /* XNU_TARGET_OS_OSX */
770 #define         NO_COALESCE_LIMIT  0
771 #endif /* XNU_TARGET_OS_OSX */
772 
773 /* Skip acquiring locks if we're in the midst of a kernel core dump */
774 unsigned int not_in_kdp = 1;
775 
776 unsigned int vm_map_set_cache_attr_count = 0;
777 
778 kern_return_t
779 vm_map_set_cache_attr(
780 	vm_map_t        map,
781 	vm_map_offset_t va)
782 {
783 	vm_map_entry_t  map_entry;
784 	vm_object_t     object;
785 	kern_return_t   kr = KERN_SUCCESS;
786 
787 	vm_map_lock_read(map);
788 
789 	if (!vm_map_lookup_entry(map, va, &map_entry) ||
790 	    map_entry->is_sub_map) {
791 		/*
792 		 * that memory is not properly mapped
793 		 */
794 		kr = KERN_INVALID_ARGUMENT;
795 		goto done;
796 	}
797 	object = VME_OBJECT(map_entry);
798 
799 	if (object == VM_OBJECT_NULL) {
800 		/*
801 		 * there should be a VM object here at this point
802 		 */
803 		kr = KERN_INVALID_ARGUMENT;
804 		goto done;
805 	}
806 	vm_object_lock(object);
807 	object->set_cache_attr = TRUE;
808 	vm_object_unlock(object);
809 
810 	vm_map_set_cache_attr_count++;
811 done:
812 	vm_map_unlock_read(map);
813 
814 	return kr;
815 }
816 
817 
818 #if CONFIG_CODE_DECRYPTION
819 /*
820  * vm_map_apple_protected:
821  * This remaps the requested part of the object with an object backed by
822  * the decrypting pager.
823  * crypt_info contains entry points and session data for the crypt module.
824  * The crypt_info block will be copied by vm_map_apple_protected. The data structures
825  * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
826  */
827 kern_return_t
828 vm_map_apple_protected(
829 	vm_map_t                map,
830 	vm_map_offset_t         start,
831 	vm_map_offset_t         end,
832 	vm_object_offset_t      crypto_backing_offset,
833 	struct pager_crypt_info *crypt_info,
834 	uint32_t                cryptid)
835 {
836 	boolean_t       map_locked;
837 	kern_return_t   kr;
838 	vm_map_entry_t  map_entry;
839 	struct vm_map_entry tmp_entry;
840 	memory_object_t unprotected_mem_obj;
841 	vm_object_t     protected_object;
842 	vm_map_offset_t map_addr;
843 	vm_map_offset_t start_aligned, end_aligned;
844 	vm_object_offset_t      crypto_start, crypto_end;
845 	int             vm_flags;
846 	vm_map_kernel_flags_t vmk_flags;
847 	boolean_t       cache_pager;
848 
849 	vm_flags = 0;
850 	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
851 
852 	map_locked = FALSE;
853 	unprotected_mem_obj = MEMORY_OBJECT_NULL;
854 
855 	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
856 	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
857 	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
858 	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
859 
860 #if __arm64__
861 	/*
862 	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
863 	 * so we might have to loop and establish up to 3 mappings:
864 	 *
865 	 * + the first 16K-page, which might overlap with the previous
866 	 *   4K-aligned mapping,
867 	 * + the center,
868 	 * + the last 16K-page, which might overlap with the next
869 	 *   4K-aligned mapping.
870 	 * Each of these mappings might be backed by a vnode pager (if
871 	 * properly page-aligned) or a "fourk_pager", itself backed by a
872 	 * vnode pager (if 4K-aligned but not page-aligned).
873 	 */
874 #endif /* __arm64__ */
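
	/*
	 * Editorial worked example (not from the original source): if the
	 * map uses 16K pages and the caller passes a 4K-aligned range
	 * [0x6000, 0x13000), the adjustments above widen it to the
	 * enclosing 16K-aligned range:
	 *
	 *	start_aligned = trunc_16K(0x6000)  = 0x4000
	 *	end_aligned   = round_16K(0x13000) = 0x14000
	 *
	 * so the loop below may need several map entries to cover
	 * [0x4000, 0x14000).
	 */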
875 
876 	map_addr = start_aligned;
877 	for (map_addr = start_aligned;
878 	    map_addr < end;
879 	    map_addr = tmp_entry.vme_end) {
880 		vm_map_lock(map);
881 		map_locked = TRUE;
882 
883 		/* lookup the protected VM object */
884 		if (!vm_map_lookup_entry(map,
885 		    map_addr,
886 		    &map_entry) ||
887 		    map_entry->is_sub_map ||
888 		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
889 			/* that memory is not properly mapped */
890 			kr = KERN_INVALID_ARGUMENT;
891 			goto done;
892 		}
893 
894 		/* ensure mapped memory is mapped as executable,
895 		 * except for the model decryption flow */
896 		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
897 		    !(map_entry->protection & VM_PROT_EXECUTE)) {
898 			kr = KERN_INVALID_ARGUMENT;
899 			goto done;
900 		}
901 
902 		/* get the protected object to be decrypted */
903 		protected_object = VME_OBJECT(map_entry);
904 		if (protected_object == VM_OBJECT_NULL) {
905 			/* there should be a VM object here at this point */
906 			kr = KERN_INVALID_ARGUMENT;
907 			goto done;
908 		}
909 		/* ensure protected object stays alive while map is unlocked */
910 		vm_object_reference(protected_object);
911 
912 		/* limit the map entry to the area we want to cover */
913 		vm_map_clip_start(map, map_entry, start_aligned);
914 		vm_map_clip_end(map, map_entry, end_aligned);
915 
916 		tmp_entry = *map_entry;
917 		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
918 		vm_map_unlock(map);
919 		map_locked = FALSE;
920 
921 		/*
922 		 * This map entry might be only partially encrypted
923 		 * (if not fully "page-aligned").
924 		 */
925 		crypto_start = 0;
926 		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
927 		if (tmp_entry.vme_start < start) {
928 			if (tmp_entry.vme_start != start_aligned) {
929 				kr = KERN_INVALID_ADDRESS;
930 			}
931 			crypto_start += (start - tmp_entry.vme_start);
932 		}
933 		if (tmp_entry.vme_end > end) {
934 			if (tmp_entry.vme_end != end_aligned) {
935 				kr = KERN_INVALID_ADDRESS;
936 			}
937 			crypto_end -= (tmp_entry.vme_end - end);
938 		}
939 
940 		/*
941 		 * This "extra backing offset" is needed to get the decryption
942 		 * routine to use the right key.  It adjusts for the possibly
943 		 * relative offset of an interposed "4K" pager...
944 		 */
945 		if (crypto_backing_offset == (vm_object_offset_t) -1) {
946 			crypto_backing_offset = VME_OFFSET(&tmp_entry);
947 		}
948 
949 		cache_pager = TRUE;
950 #if XNU_TARGET_OS_OSX
951 		if (vm_map_is_alien(map)) {
952 			cache_pager = FALSE;
953 		}
954 #endif /* XNU_TARGET_OS_OSX */
955 
956 		/*
957 		 * Lookup (and create if necessary) the protected memory object
958 		 * matching that VM object.
959 		 * If successful, this also grabs a reference on the memory object,
960 		 * to guarantee that it doesn't go away before we get a chance to map
961 		 * it.
962 		 */
963 		unprotected_mem_obj = apple_protect_pager_setup(
964 			protected_object,
965 			VME_OFFSET(&tmp_entry),
966 			crypto_backing_offset,
967 			crypt_info,
968 			crypto_start,
969 			crypto_end,
970 			cache_pager);
971 
972 		/* release extra ref on protected object */
973 		vm_object_deallocate(protected_object);
974 
975 		if (unprotected_mem_obj == NULL) {
976 			kr = KERN_FAILURE;
977 			goto done;
978 		}
979 
980 		vm_flags = VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE;
981 		/* can overwrite an immutable mapping */
982 		vmk_flags.vmkf_overwrite_immutable = TRUE;
983 #if __arm64__
984 		if (tmp_entry.used_for_jit &&
985 		    (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT ||
986 		    PAGE_SHIFT != FOURK_PAGE_SHIFT) &&
987 		    fourk_binary_compatibility_unsafe &&
988 		    fourk_binary_compatibility_allow_wx) {
989 			printf("** FOURK_COMPAT [%d]: "
990 			    "allowing write+execute at 0x%llx\n",
991 			    proc_selfpid(), tmp_entry.vme_start);
992 			vmk_flags.vmkf_map_jit = TRUE;
993 		}
994 #endif /* __arm64__ */
995 
996 		/* map this memory object in place of the current one */
997 		map_addr = tmp_entry.vme_start;
998 		kr = vm_map_enter_mem_object(map,
999 		    &map_addr,
1000 		    (tmp_entry.vme_end -
1001 		    tmp_entry.vme_start),
1002 		    (mach_vm_offset_t) 0,
1003 		    vm_flags,
1004 		    vmk_flags,
1005 		    VM_KERN_MEMORY_NONE,
1006 		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1007 		    0,
1008 		    TRUE,
1009 		    tmp_entry.protection,
1010 		    tmp_entry.max_protection,
1011 		    tmp_entry.inheritance);
1012 		assertf(kr == KERN_SUCCESS,
1013 		    "kr = 0x%x\n", kr);
1014 		assertf(map_addr == tmp_entry.vme_start,
1015 		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1016 		    (uint64_t)map_addr,
1017 		    (uint64_t) tmp_entry.vme_start,
1018 		    &tmp_entry);
1019 
1020 #if VM_MAP_DEBUG_APPLE_PROTECT
1021 		if (vm_map_debug_apple_protect) {
1022 			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1023 			    " backing:[object:%p,offset:0x%llx,"
1024 			    "crypto_backing_offset:0x%llx,"
1025 			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1026 			    map,
1027 			    (uint64_t) map_addr,
1028 			    (uint64_t) (map_addr + (tmp_entry.vme_end -
1029 			    tmp_entry.vme_start)),
1030 			    unprotected_mem_obj,
1031 			    protected_object,
1032 			    VME_OFFSET(&tmp_entry),
1033 			    crypto_backing_offset,
1034 			    crypto_start,
1035 			    crypto_end);
1036 		}
1037 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1038 
1039 		/*
1040 		 * Release the reference obtained by
1041 		 * apple_protect_pager_setup().
1042 		 * The mapping (if it succeeded) is now holding a reference on
1043 		 * the memory object.
1044 		 */
1045 		memory_object_deallocate(unprotected_mem_obj);
1046 		unprotected_mem_obj = MEMORY_OBJECT_NULL;
1047 
1048 		/* continue with next map entry */
1049 		crypto_backing_offset += (tmp_entry.vme_end -
1050 		    tmp_entry.vme_start);
1051 		crypto_backing_offset -= crypto_start;
1052 	}
1053 	kr = KERN_SUCCESS;
1054 
1055 done:
1056 	if (map_locked) {
1057 		vm_map_unlock(map);
1058 	}
1059 	return kr;
1060 }
1061 #endif  /* CONFIG_CODE_DECRYPTION */
1062 
1063 
1064 LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
1065 LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
1066 LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);
1067 
1068 #if XNU_TARGET_OS_OSX
1069 int malloc_no_cow = 0;
1070 #else /* XNU_TARGET_OS_OSX */
1071 int malloc_no_cow = 1;
1072 #endif /* XNU_TARGET_OS_OSX */
1073 uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
1074 #if DEBUG
1075 int vm_check_map_sanity = 0;
1076 #endif
1077 
1078 /*
1079  *	vm_map_init:
1080  *
1081  *	Initialize the vm_map module.  Must be called before
1082  *	any other vm_map routines.
1083  *
1084  *	Map and entry structures are allocated from zones -- we must
1085  *	initialize those zones.
1086  *
1087  *	There are three zones of interest:
1088  *
1089  *	vm_map_zone:		used to allocate maps.
1090  *	vm_map_entry_zone:	used to allocate map entries.
1091  *
1092  *	LP32:
1093  *	vm_map_entry_reserved_zone:     fallback zone for kernel map entries
1094  *
1095  *	The kernel allocates map entries from a special zone that is initially
1096  *	"crammed" with memory.  It would be difficult (perhaps impossible) for
1097  *	the kernel to allocate more memory to an entry zone when it became
1098  *	empty since the very act of allocating memory implies the creation
1099  *	of a new entry.
1100  */
1101 __startup_func
1102 void
1103 vm_map_init(void)
1104 {
1105 
1106 #if MACH_ASSERT
1107 	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1108 	    sizeof(debug4k_filter));
1109 #endif /* MACH_ASSERT */
1110 
1111 	vm_map_zone = zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1112 	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1113 
1114 	/*
1115 	 * Don't quarantine because we always need elements available
1116 	 * Disallow GC on this zone... to aid the GC.
1117 	 */
1118 	vm_map_entry_zone = zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1119 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1120 	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1121 		z->z_elems_rsv = (uint16_t)(32 *
1122 		(ml_early_cpu_max_number() + 1));
1123 	});
1124 #if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
1125 	vm_map_entry_reserved_zone = zone_create(VM_MAP_ENTRY_RESERVED_ZONE_NAME,
1126 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_RESERVED_ZFLAGS);
1127 #endif /* HAVE_VM_MAP_RESERVED_ENTRY_ZONE */
1128 
1129 	vm_map_holes_zone = zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1130 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1131 	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1132 		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_size(z));
1133 	});
1134 
1135 	vm_map_copy_zone = zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1136 	    ZC_NOENCRYPT | ZC_CACHING, ZONE_ID_VM_MAP_COPY, NULL);
1137 
1138 	/*
1139 	 * Add the stolen memory to zones, adjust zone size and stolen counts.
1140 	 */
1141 	zone_cram_early(vm_map_zone, map_data, map_data_size);
1142 	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1143 	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1144 	printf("VM bootstrap: %d maps, %d entries and %d holes available\n",
1145 	    vm_map_zone->z_elems_free,
1146 	    vm_map_entry_zone->z_elems_free,
1147 	    vm_map_holes_zone->z_elems_free);
1148 
1149 	/*
1150 	 * Since these are covered by zones, remove them from stolen page accounting.
1151 	 */
1152 	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1153 
1154 #if VM_MAP_DEBUG_APPLE_PROTECT
1155 	PE_parse_boot_argn("vm_map_debug_apple_protect",
1156 	    &vm_map_debug_apple_protect,
1157 	    sizeof(vm_map_debug_apple_protect));
1158 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1159 #if VM_MAP_DEBUG_FOURK
1160 	PE_parse_boot_argn("vm_map_debug_fourk",
1161 	    &vm_map_debug_fourk,
1162 	    sizeof(vm_map_debug_fourk));
1163 #endif /* VM_MAP_DEBUG_FOURK */
1164 
1165 	PE_parse_boot_argn("malloc_no_cow",
1166 	    &malloc_no_cow,
1167 	    sizeof(malloc_no_cow));
1168 	if (malloc_no_cow) {
1169 		vm_memory_malloc_no_cow_mask = 0ULL;
1170 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1171 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1172 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1173 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1174 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1175 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1176 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1177 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1178 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1179 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1180 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1181 		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1182 		    &vm_memory_malloc_no_cow_mask,
1183 		    sizeof(vm_memory_malloc_no_cow_mask));
1184 	}
1185 
1186 #if DEBUG
1187 	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1188 	if (vm_check_map_sanity) {
1189 		kprintf("VM sanity checking enabled\n");
1190 	} else {
1191 		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1192 	}
1193 #endif /* DEBUG */
1194 
1195 #if DEVELOPMENT || DEBUG
1196 	PE_parse_boot_argn("panic_on_unsigned_execute",
1197 	    &panic_on_unsigned_execute,
1198 	    sizeof(panic_on_unsigned_execute));
1199 	PE_parse_boot_argn("panic_on_mlock_failure",
1200 	    &panic_on_mlock_failure,
1201 	    sizeof(panic_on_mlock_failure));
1202 #endif /* DEVELOPMENT || DEBUG */
1203 }
1204 
1205 __startup_func
1206 static void
1207 vm_map_steal_memory(void)
1208 {
1209 	/*
1210 	 * We need to reserve enough memory to support bootstrapping VM maps
1211 	 * and the zone subsystem.
1212 	 *
1213 	 * The VM Maps that need to function before zones can support them
1214 	 * are the ones registered with vm_map_will_allocate_early_map(),
1215 	 * which are:
1216 	 * - the kernel map
1217 	 * - the various submaps used by zones (pgz, meta, ...)
1218 	 *
1219 	 * We also need enough entries and holes to support them
1220 	 * until zone_metadata_init() is called, which is when
1221 	 * the zone allocator becomes capable of expanding dynamically.
1222 	 *
1223 	 * We need:
1224 	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
1225 	 * - To allow for 3-4 entries per map, but the kernel map
1226 	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
1227 	 *   to describe the submaps, so double it (and make it 8x too)
1228 	 * - To allow for holes between entries,
1229 	 *   hence needs the same budget as entries
1230 	 */
1231 	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
1232 	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
1233 	    VM_MAP_EARLY_COUNT_MAX);
1234 
1235 	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
1236 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1237 	    8 * VM_MAP_EARLY_COUNT_MAX);
1238 
1239 	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
1240 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1241 	    8 * VM_MAP_EARLY_COUNT_MAX);
1242 
1243 	/*
1244 	 * Steal a contiguous range of memory so that a simple range check
1245 	 * can validate early addresses being freed/crammed to these
1246 	 * zones
1247 	 */
1248 	map_data       = zone_early_mem_init(map_data_size + kentry_data_size +
1249 	    map_holes_data_size);
1250 	kentry_data    = map_data + map_data_size;
1251 	map_holes_data = kentry_data + kentry_data_size;
1252 }
1253 STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
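
/*
 * Editorial worked example (not from the original source): with
 * VM_MAP_EARLY_COUNT_MAX == 16, the requests above amount to, before
 * rounding by zone_get_early_alloc_size():
 *
 *	maps:    16            (one per early map owner)
 *	entries: 8 * 16 = 128
 *	holes:   8 * 16 = 128
 *
 * which is roughly the budget reported by the bootstrap printfs in
 * vm_map_init() and vm_kernel_boostraped().
 */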
1254 
1255 __startup_func
1256 static void
1257 vm_kernel_boostraped(void)
1258 {
1259 	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1260 	    vm_map_zone->z_elems_free,
1261 	    vm_map_entry_zone->z_elems_free,
1262 	    vm_map_holes_zone->z_elems_free);
1263 }
1264 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1265 
1266 void
1267 vm_map_disable_hole_optimization(vm_map_t map)
1268 {
1269 	vm_map_entry_t  head_entry, hole_entry, next_hole_entry;
1270 
1271 	if (map->holelistenabled) {
1272 		head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1273 
1274 		while (hole_entry != NULL) {
1275 			next_hole_entry = hole_entry->vme_next;
1276 
1277 			hole_entry->vme_next = NULL;
1278 			hole_entry->vme_prev = NULL;
1279 			zfree(vm_map_holes_zone, hole_entry);
1280 
1281 			if (next_hole_entry == head_entry) {
1282 				hole_entry = NULL;
1283 			} else {
1284 				hole_entry = next_hole_entry;
1285 			}
1286 		}
1287 
1288 		map->holes_list = NULL;
1289 		map->holelistenabled = FALSE;
1290 
1291 		map->first_free = vm_map_first_entry(map);
1292 		SAVE_HINT_HOLE_WRITE(map, NULL);
1293 	}
1294 }
1295 
1296 boolean_t
1297 vm_kernel_map_is_kernel(vm_map_t map)
1298 {
1299 	return map->pmap == kernel_pmap;
1300 }
1301 
1302 /*
1303  *	vm_map_create:
1304  *
1305  *	Creates and returns a new empty VM map with
1306  *	the given physical map structure, and having
1307  *	the given lower and upper address bounds.
1308  */
1309 
1310 extern vm_map_t vm_map_create_external(
1311 	pmap_t                  pmap,
1312 	vm_map_offset_t         min_off,
1313 	vm_map_offset_t         max_off,
1314 	boolean_t               pageable);
1315 
1316 vm_map_t
1317 vm_map_create_external(
1318 	pmap_t                  pmap,
1319 	vm_map_offset_t         min,
1320 	vm_map_offset_t         max,
1321 	boolean_t               pageable)
1322 {
1323 	vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1324 
1325 	if (pageable) {
1326 		options |= VM_MAP_CREATE_PAGEABLE;
1327 	}
1328 	return vm_map_create_options(pmap, min, max, options);
1329 }
1330 
1331 __startup_func
1332 void
1333 vm_map_will_allocate_early_map(vm_map_t *owner)
1334 {
1335 	if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1336 		panic("VM_MAP_EARLY_COUNT_MAX is too low");
1337 	}
1338 
1339 	early_map_owners[early_map_count++] = owner;
1340 }
1341 
1342 __startup_func
1343 void
1344 vm_map_relocate_early_maps(vm_offset_t delta)
1345 {
1346 	for (uint32_t i = 0; i < early_map_count; i++) {
1347 		vm_address_t addr = (vm_address_t)*early_map_owners[i];
1348 
1349 		*early_map_owners[i] = (vm_map_t)(addr + delta);
1350 	}
1351 
1352 	early_map_count = ~0u;
1353 }
1354 
1355 /*
1356  *	Routine:	vm_map_relocate_early_elem
1357  *
1358  *	Purpose:
1359  *		Early zone elements are allocated in a temporary part
1360  *		of the address space.
1361  *
1362  *		Once the zones live in their final place, the early
1363  *		VM maps, map entries and map holes need to be relocated.
1364  *
1365  *		It involves rewriting any vm_map_t, vm_map_entry_t or
1366  *		pointers to vm_map_links. Other pointers to other types
1367  *		are fine.
1368  *
1369  *		Fortunately, pointers to those types are self-contained
1370  *		in those zones, _except_ for pointers to VM maps,
1371  *		which are tracked during early boot and fixed with
1372  *		vm_map_relocate_early_maps().
1373  */
1374 __startup_func
1375 void
1376 vm_map_relocate_early_elem(
1377 	uint32_t                zone_id,
1378 	vm_offset_t             new_addr,
1379 	vm_offset_t             delta)
1380 {
1381 #define relocate(type_t, field)  ({ \
1382 	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field;   \
1383 	if (*__field) {                                                        \
1384 	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta);  \
1385 	}                                                                      \
1386 })
1387 
1388 	switch (zone_id) {
1389 	case ZONE_ID_VM_MAP:
1390 	case ZONE_ID_VM_MAP_ENTRY:
1391 	case ZONE_ID_VM_MAP_HOLES:
1392 		break;
1393 
1394 	default:
1395 		panic("Unexpected zone ID %d", zone_id);
1396 	}
1397 
1398 	if (zone_id == ZONE_ID_VM_MAP) {
1399 		relocate(vm_map_t, hdr.links.prev);
1400 		relocate(vm_map_t, hdr.links.next);
1401 		((vm_map_t)new_addr)->pmap = kernel_pmap;
1402 #ifdef VM_MAP_STORE_USE_RB
1403 		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
1404 #endif /* VM_MAP_STORE_USE_RB */
1405 		relocate(vm_map_t, hint);
1406 		relocate(vm_map_t, hole_hint);
1407 		relocate(vm_map_t, first_free);
1408 		return;
1409 	}
1410 
1411 	relocate(struct vm_map_links *, prev);
1412 	relocate(struct vm_map_links *, next);
1413 
1414 	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
1415 #ifdef VM_MAP_STORE_USE_RB
1416 		relocate(vm_map_entry_t, store.entry.rbe_left);
1417 		relocate(vm_map_entry_t, store.entry.rbe_right);
1418 		relocate(vm_map_entry_t, store.entry.rbe_parent);
1419 #endif /* VM_MAP_STORE_USE_RB */
1420 		if (((vm_map_entry_t)new_addr)->is_sub_map) {
1421 			/* no object to relocate because we haven't made any */
1422 			((vm_map_entry_t)new_addr)->vme_submap +=
1423 			    delta >> VME_SUBMAP_SHIFT;
1424 		}
1425 #if MAP_ENTRY_CREATION_DEBUG
1426 		relocate(vm_map_entry_t, vme_creation_maphdr);
1427 #endif /* MAP_ENTRY_CREATION_DEBUG */
1428 	}
1429 
1430 #undef relocate
1431 }
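
/*
 * Editorial note (not from the original source): for reference, a use such
 * as relocate(vm_map_t, hint) in the function above expands to roughly:
 *
 *	vm_map_entry_t *__field = &((vm_map_t)new_addr)->hint;
 *	if (*__field) {
 *		*__field = (vm_map_entry_t)((vm_offset_t)*__field + delta);
 *	}
 *
 * i.e. every non-NULL intra-zone pointer is shifted by the relocation
 * delta, while NULL sentinels are left untouched.
 */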
1432 
1433 vm_map_t
1434 vm_map_create_options(
1435 	pmap_t                  pmap,
1436 	vm_map_offset_t         min,
1437 	vm_map_offset_t         max,
1438 	vm_map_create_options_t options)
1439 {
1440 	vm_map_t result;
1441 
1442 #if DEBUG || DEVELOPMENT
1443 	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
1444 		if (early_map_count != ~0u && early_map_count !=
1445 		    zone_count_allocated(vm_map_zone) + 1) {
1446 			panic("allocating %dth early map, owner not known",
1447 			    zone_count_allocated(vm_map_zone) + 1);
1448 		}
1449 		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
1450 			panic("allocating %dth early map for non kernel pmap",
1451 			    early_map_count);
1452 		}
1453 	}
1454 #endif /* DEBUG || DEVELOPMENT */
1455 
1456 	result = zalloc_flags(vm_map_zone, Z_WAITOK | Z_NOFAIL | Z_ZERO);
1457 
1458 	vm_map_first_entry(result) = vm_map_to_entry(result);
1459 	vm_map_last_entry(result)  = vm_map_to_entry(result);
1460 
1461 	vm_map_store_init(&result->hdr);
1462 	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
1463 	vm_map_set_page_shift(result, PAGE_SHIFT);
1464 
1465 	result->size_limit = RLIM_INFINITY;             /* default unlimited */
1466 	result->data_limit = RLIM_INFINITY;             /* default unlimited */
1467 	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
1468 	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
1469 	result->pmap = pmap;
1470 	result->min_offset = min;
1471 	result->max_offset = max;
1472 	result->first_free = vm_map_to_entry(result);
1473 	result->hint = vm_map_to_entry(result);
1474 
1475 	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
1476 		assert(pmap == kernel_pmap);
1477 		result->never_faults = true;
1478 	}
1479 
1480 	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
1481 	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
1482 		result->has_corpse_footprint = true;
1483 	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
1484 		struct vm_map_links *hole_entry = zalloc(vm_map_holes_zone);
1485 
1486 		hole_entry->start = min;
1487 #if defined(__arm__) || defined(__arm64__)
1488 		hole_entry->end = result->max_offset;
1489 #else
1490 		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1491 #endif
1492 		result->holes_list = result->hole_hint = hole_entry;
1493 		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
1494 		result->holelistenabled = true;
1495 	}
1496 
1497 	vm_map_lock_init(result);
1498 
1499 	return result;
1500 }
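
/*
 * Illustrative sketch (editorial, not part of the build): creating a
 * pageable map over a hypothetical pmap "new_pmap".
 *
 *	vm_map_t map;
 *
 *	map = vm_map_create_options(new_pmap,
 *	    VM_MIN_ADDRESS, MACH_VM_MAX_ADDRESS,
 *	    VM_MAP_CREATE_PAGEABLE);
 *	// The new map starts with a single hole spanning its range,
 *	// a reference count of 1, and RLIM_INFINITY size/data limits.
 */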
1501 
1502 /*
1503  * Adjusts a submap that was made by kmem_suballoc()
1504  * before it knew where it would be mapped,
1505  * so that it has the right min/max offsets.
1506  *
1507  * We do not need to hold any locks:
1508  * only the caller knows about this map,
1509  * and it is not published on any entry yet.
1510  */
1511 static void
1512 vm_map_adjust_offsets(
1513 	vm_map_t                map,
1514 	vm_map_offset_t         min_off,
1515 	vm_map_offset_t         max_off)
1516 {
1517 	assert(map->min_offset == 0);
1518 	assert(map->max_offset == max_off - min_off);
1519 	assert(map->hdr.nentries == 0);
1520 	assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1521 
1522 	map->min_offset = min_off;
1523 	map->max_offset = max_off;
1524 
1525 	if (map->holelistenabled) {
1526 		struct vm_map_links *hole = map->holes_list;
1527 
1528 		hole->start = min_off;
1529 #if defined(__arm__) || defined(__arm64__)
1530 		hole->end = max_off;
1531 #else
1532 		hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1533 #endif
1534 	}
1535 }
1536 
1537 
1538 vm_map_size_t
1539 vm_map_adjusted_size(vm_map_t map)
1540 {
1541 	struct vm_reserved_region *regions = NULL;
1542 	size_t num_regions = 0;
1543 	mach_vm_size_t  reserved_size = 0, map_size = 0;
1544 
1545 	if (map == NULL || (map->size == 0)) {
1546 		return 0;
1547 	}
1548 
1549 	map_size = map->size;
1550 
1551 	if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1552 		/*
1553 		 * No special reserved regions or not an exotic map or the task
1554 		 * is terminating and these special regions might have already
1555 		 * been deallocated.
1556 		 */
1557 		return map_size;
1558 	}
1559 
1560 	num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1561 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1562 
1563 	while (num_regions) {
1564 		reserved_size += regions[--num_regions].vmrr_size;
1565 	}
1566 
1567 	/*
1568 	 * There are a few places where the map is being switched out due to
1569 	 * 'termination' without that bit being set (e.g. exec and corpse purging).
1570 	 * In those cases, we could have the map's regions being deallocated on
1571 	 * a core while some accounting process is trying to get the map's size.
1572 	 * So this assert can't be enabled till all those places are uniform in
1573 	 * their use of the 'map->terminated' bit.
1574 	 *
1575 	 * assert(map_size >= reserved_size);
1576 	 */
1577 
1578 	return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1579 }
1580 
1581 /*
1582  *	vm_map_entry_create:	[ internal use only ]
1583  *
1584  *	Allocates a VM map entry for insertion in the
1585  *	given map (or map copy).  No fields are filled.
1586  *
1587  *	The VM entry will be zero initialized, except for:
1588  *	- behavior set to VM_BEHAVIOR_DEFAULT
1589  *	- inheritance set to VM_INHERIT_DEFAULT
1590  */
1591 #define vm_map_entry_create(map)    _vm_map_entry_create(&(map)->hdr)
1592 
1593 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1594 
1595 static vm_map_entry_t
1596 _vm_map_entry_create(
1597 	struct vm_map_header    *map_header __unused)
1598 {
1599 	vm_map_entry_t  entry = NULL;
1600 	zone_t zone = vm_map_entry_zone;
1601 
1602 #if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
1603 	zone_security_flags_t zsflags = zone_security_array[ZONE_ID_VM_MAP_ENTRY];
1604 	if (map_header == &zone_submap(zsflags)->hdr) {
1605 		/*
1606 		 * If we are trying to allocate an entry for the submap
1607 		 * of the vm_map_entry_zone, then this can cause recursive
1608 		 * locking of this map.
1609 		 *
1610 		 * Try to allocate _without blocking_ from this zone,
1611 		 * but if it is depleted, we need to go to the
1612 		 * vm_map_entry_reserved_zone which is in the zalloc
1613 		 * "VM" submap, which can grow without taking any map lock.
1614 		 *
1615 		 * Note: the vm_map_entry_zone has a rather high "reserve"
1616 		 * setup in order to minimize usage of the reserved one.
1617 		 */
1618 		entry = zalloc_flags(vm_map_entry_zone, Z_NOWAIT | Z_ZERO);
1619 		zone = vm_map_entry_reserved_zone;
1620 	}
1621 #endif
1622 	if (entry == NULL) {
1623 		entry = zalloc_flags(zone, Z_WAITOK | Z_ZERO);
1624 	}
1625 
1626 	/*
1627 	 * Help the compiler with what we know to be true,
1628 	 * so that the further bitfields inits have good codegen.
1629 	 *
1630 	 * See rdar://87041299
1631 	 */
1632 	__builtin_assume(entry->vme_object_value == 0);
1633 #if __LP64__
1634 	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
1635 	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);
1636 #else
1637 	__builtin_assume(*(uint32_t *)(&entry->vme_object_value + 1) == 0);
1638 	__builtin_assume(*(uint32_t *)(&entry->vme_object_value + 2) == 0);
1639 	__builtin_assume(*(uint32_t *)(&entry->vme_object_value + 3) == 0);
1640 	__builtin_assume(*(uint32_t *)(&entry->vme_object_value + 4) == 0);
1641 #endif
1642 
1643 	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
1644 	    "VME_ALIAS_MASK covers tags");
1645 
1646 	static_assert(VM_BEHAVIOR_DEFAULT == 0,
1647 	    "can skip zeroing of the behavior field");
1648 	entry->inheritance = VM_INHERIT_DEFAULT;
1649 
1650 	vm_map_store_update((vm_map_t) NULL, entry, VM_MAP_ENTRY_CREATE);
1651 
1652 #if MAP_ENTRY_CREATION_DEBUG
1653 	entry->vme_creation_maphdr = map_header;
1654 	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
1655 	    BTREF_GET_NOWAIT);
1656 #endif
1657 	return entry;
1658 }
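/*
 * Illustrative sketch of the typical entry lifecycle ("start", "size"
 * and "where" are hypothetical; see vm_map_find_space() below for a
 * real caller):
 *
 *	vm_map_entry_t e = vm_map_entry_create(map);
 *	e->vme_start = start;
 *	e->vme_end = start + size;
 *	vm_map_store_entry_link(map, where, e, VM_MAP_KERNEL_FLAGS_NONE);
 *
 * If the entry is never linked into a map (e.g. on an error path), it
 * must be given back with vm_map_entry_dispose().
 */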
1659 
1660 /*
1661  *	vm_map_entry_dispose:	[ internal use only ]
1662  *
1663  *	Inverse of vm_map_entry_create.
1664  *
1665  *	The write map lock is held, so there is no need to
1666  *	do anything special to ensure the correctness
1667  *	of the stores.
1668  */
1669 static void
1670 vm_map_entry_dispose(
1671 	vm_map_entry_t          entry)
1672 {
1673 #if MAP_ENTRY_CREATION_DEBUG
1674 	btref_put(entry->vme_creation_bt);
1675 #endif
1676 #if MAP_ENTRY_INSERTION_DEBUG
1677 	btref_put(entry->vme_insertion_bt);
1678 #endif
1679 #if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
1680 	if (zone_id_for_element(entry, sizeof(*entry)) != ZONE_ID_VM_MAP_ENTRY) {
1681 		zfree(vm_map_entry_reserved_zone, entry);
1682 		return;
1683 	}
1684 #endif /* HAVE_VM_MAP_RESERVED_ENTRY_ZONE */
1685 	zfree(vm_map_entry_zone, entry);
1686 }
1687 
1688 #define vm_map_copy_entry_dispose(copy_entry) \
1689 	vm_map_entry_dispose(copy_entry)
1690 
1691 static vm_map_entry_t
1692 vm_map_zap_first_entry(
1693 	vm_map_zap_t            list)
1694 {
1695 	return list->vmz_head;
1696 }
1697 
1698 static vm_map_entry_t
1699 vm_map_zap_last_entry(
1700 	vm_map_zap_t            list)
1701 {
1702 	assert(vm_map_zap_first_entry(list));
1703 	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1704 }
1705 
1706 static void
1707 vm_map_zap_append(
1708 	vm_map_zap_t            list,
1709 	vm_map_entry_t          entry)
1710 {
1711 	entry->vme_next = VM_MAP_ENTRY_NULL;
1712 	*list->vmz_tail = entry;
1713 	list->vmz_tail = &entry->vme_next;
1714 }
1715 
1716 static vm_map_entry_t
1717 vm_map_zap_pop(
1718 	vm_map_zap_t            list)
1719 {
1720 	vm_map_entry_t head = list->vmz_head;
1721 
1722 	if (head != VM_MAP_ENTRY_NULL &&
1723 	    (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1724 		list->vmz_tail = &list->vmz_head;
1725 	}
1726 
1727 	return head;
1728 }
1729 
1730 static void
1731 vm_map_zap_dispose(
1732 	vm_map_zap_t            list)
1733 {
1734 	vm_map_entry_t          entry;
1735 
1736 	while ((entry = vm_map_zap_pop(list))) {
1737 		if (entry->is_sub_map) {
1738 			vm_map_deallocate(VME_SUBMAP(entry));
1739 		} else {
1740 			vm_object_deallocate(VME_OBJECT(entry));
1741 		}
1742 
1743 		vm_map_entry_dispose(entry);
1744 	}
1745 }
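/*
 * Illustrative sketch of the zap-list pattern ("start" and "end" are
 * hypothetical; see vm_map_destroy() below for a real caller):
 *
 *	VM_MAP_ZAP_DECLARE(zap);
 *
 *	vm_map_lock(map);
 *	(void)vm_map_delete(map, start, end, VM_MAP_REMOVE_NO_FLAGS,
 *	    KMEM_GUARD_NONE, &zap);
 *	vm_map_unlock(map);
 *
 *	vm_map_zap_dispose(&zap);
 *
 * Deferring the object/submap deallocations to vm_map_zap_dispose()
 * keeps them out of the section where the map lock is held.
 */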
1746 
1747 #if MACH_ASSERT
1748 static boolean_t first_free_check = FALSE;
1749 boolean_t
1750 first_free_is_valid(
1751 	vm_map_t        map)
1752 {
1753 	if (!first_free_check) {
1754 		return TRUE;
1755 	}
1756 
1757 	return first_free_is_valid_store( map );
1758 }
1759 #endif /* MACH_ASSERT */
1760 
1761 
1762 #define vm_map_copy_entry_link(copy, after_where, entry)                \
1763 	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))
1764 
1765 #define vm_map_copy_entry_unlink(copy, entry)                           \
1766 	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry))
1767 
1768 /*
1769  *	vm_map_destroy:
1770  *
1771  *	Actually destroy a map.
1772  */
1773 void
1774 vm_map_destroy(
1775 	vm_map_t        map)
1776 {
1777 	/* final cleanup: this is not allowed to fail */
1778 	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
1779 
1780 	VM_MAP_ZAP_DECLARE(zap);
1781 
1782 	vm_map_lock(map);
1783 
1784 	map->terminated = true;
1785 	/* clean up regular map entries */
1786 	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
1787 	    KMEM_GUARD_NONE, &zap);
1788 	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
1789 #if     !defined(__arm__)
1790 	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
1791 	    KMEM_GUARD_NONE, &zap);
1792 #endif /* !__arm__ */
1793 
1794 	vm_map_disable_hole_optimization(map);
1795 	vm_map_corpse_footprint_destroy(map);
1796 
1797 	vm_map_unlock(map);
1798 
1799 	vm_map_zap_dispose(&zap);
1800 
1801 	assert(map->hdr.nentries == 0);
1802 
1803 	if (map->pmap) {
1804 		pmap_destroy(map->pmap);
1805 	}
1806 
1807 #if LOCKS_INDIRECT_ALLOW
1808 	if (vm_map_lck_attr.lck_attr_val & LCK_ATTR_DEBUG) {
1809 		/*
1810 		 * If lock debugging is enabled the mutexes get tagged as LCK_MTX_TAG_INDIRECT.
1811 		 * And this is regardless of whether the lck_mtx_ext_t is embedded in the
1812 		 * structure or kalloc'ed via lck_mtx_init.
1813 		 * An example is s_lock_ext within struct _vm_map.
1814 		 *
1815 		 * A lck_mtx_destroy on such a mutex will attempt a kfree and panic. We
1816 		 * can add another tag to detect embedded vs alloc'ed indirect external
1817 		 * mutexes but that'll be additional checks in the lock path and require
1818 		 * updating dependencies for the old vs new tag.
1819 		 *
1820 		 * Since the kfree() is for LCK_MTX_TAG_INDIRECT mutexes and that tag is applied
1821 		 * just when lock debugging is ON, we choose to forego explicitly destroying
1822 		 * the vm_map mutex and rw lock. Because the vm_map_lck_grp is
1823 		 * permanent, this has no serious side-effect.
1824 		 */
1825 	} else
1826 #endif /* LOCKS_INDIRECT_ALLOW */
1827 	{
1828 		lck_rw_destroy(&(map)->lock, &vm_map_lck_grp);
1829 	}
1830 
1831 	zfree(vm_map_zone, map);
1832 }
1833 
1834 /*
1835  * Returns pid of the task with the largest number of VM map entries.
1836  * Used in the zone-map-exhaustion jetsam path.
1837  */
1838 pid_t
1839 find_largest_process_vm_map_entries(void)
1840 {
1841 	pid_t victim_pid = -1;
1842 	int max_vm_map_entries = 0;
1843 	task_t task = TASK_NULL;
1844 	queue_head_t *task_list = &tasks;
1845 
1846 	lck_mtx_lock(&tasks_threads_lock);
1847 	queue_iterate(task_list, task, task_t, tasks) {
1848 		if (task == kernel_task || !task->active) {
1849 			continue;
1850 		}
1851 
1852 		vm_map_t task_map = task->map;
1853 		if (task_map != VM_MAP_NULL) {
1854 			int task_vm_map_entries = task_map->hdr.nentries;
1855 			if (task_vm_map_entries > max_vm_map_entries) {
1856 				max_vm_map_entries = task_vm_map_entries;
1857 				victim_pid = pid_from_task(task);
1858 			}
1859 		}
1860 	}
1861 	lck_mtx_unlock(&tasks_threads_lock);
1862 
1863 	printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1864 	return victim_pid;
1865 }
1866 
1867 
1868 /*
1869  *	vm_map_lookup_entry:	[ internal use only ]
1870  *
1871  *	Calls into the vm map store layer to find the map
1872  *	entry containing (or immediately preceding) the
1873  *	specified address in the given map; the entry is returned
1874  *	in the "entry" parameter.  The boolean
1875  *	result indicates whether the address is
1876  *	actually contained in the map.
1877  */
1878 boolean_t
1879 vm_map_lookup_entry(
1880 	vm_map_t        map,
1881 	vm_map_offset_t address,
1882 	vm_map_entry_t  *entry)         /* OUT */
1883 {
1884 #if CONFIG_KERNEL_TBI
1885 	if (VM_KERNEL_ADDRESS(address)) {
1886 		address = VM_KERNEL_STRIP_UPTR(address);
1887 	}
1888 #endif /* CONFIG_KERNEL_TBI */
1889 #if CONFIG_PROB_GZALLOC
1890 	if (map->pmap == kernel_pmap) {
1891 		assertf(!pgz_owned(address),
1892 		    "it is the responsibility of callers to unguard PGZ addresses");
1893 	}
1894 #endif /* CONFIG_PROB_GZALLOC */
1895 	return vm_map_store_lookup_entry( map, address, entry );
1896 }
1897 
1898 boolean_t
1899 vm_map_lookup_entry_or_next(
1900 	vm_map_t        map,
1901 	vm_map_offset_t address,
1902 	vm_map_entry_t  *entry)         /* OUT */
1903 {
1904 	if (vm_map_lookup_entry(map, address, entry)) {
1905 		return true;
1906 	}
1907 
1908 	*entry = (*entry)->vme_next;
1909 	return false;
1910 }
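/*
 * Illustrative sketch of the lookup semantics ("addr" is hypothetical):
 *
 *	vm_map_entry_t entry;
 *
 *	if (vm_map_lookup_entry(map, addr, &entry)) {
 *		assert(entry->vme_start <= addr && addr < entry->vme_end);
 *	} else {
 *		assert(entry == vm_map_to_entry(map) ||
 *		    entry->vme_end <= addr);
 *	}
 *
 * vm_map_lookup_entry_or_next() folds the "preceding entry" case into
 * returning entry->vme_next instead.
 */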
1911 
1912 #if CONFIG_PROB_GZALLOC
1913 boolean_t
1914 vm_map_lookup_entry_allow_pgz(
1915 	vm_map_t        map,
1916 	vm_map_offset_t address,
1917 	vm_map_entry_t  *entry)         /* OUT */
1918 {
1919 #if CONFIG_KERNEL_TBI
1920 	if (VM_KERNEL_ADDRESS(address)) {
1921 		address = VM_KERNEL_STRIP_UPTR(address);
1922 	}
1923 #endif /* CONFIG_KERNEL_TBI */
1924 	return vm_map_store_lookup_entry( map, address, entry );
1925 }
1926 #endif /* CONFIG_PROB_GZALLOC */
1927 
1928 #if !ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
1929 /*
1930  *	Routine:	vm_map_adjust_direction
1931  *	Purpose:
1932  *			Overrides the search direction to reduce fragmentation: small
1933  *			allocations go at the end (last free hole), large ones at the front.
1934  */
1935 static void
1936 vm_map_adjust_direction(
1937 	vm_map_kernel_flags_t *vmk_flags,
1938 	vm_map_size_t          size)
1939 {
1940 	if (size < KMEM_SMALLMAP_THRESHOLD) {
1941 		vmk_flags->vmkf_last_free = true;
1942 	} else {
1943 		vmk_flags->vmkf_last_free = false;
1944 	}
1945 }
1946 #endif /* !ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
1947 
1948 /*
1949  *	Routine:	vm_map_get_range
1950  *	Purpose:
1951  *			Adjust bounds based on security policy.
1952  */
1953 static struct kmem_range
1954 vm_map_get_range(
1955 	vm_map_t                map,
1956 	vm_map_offset_t        *address,
1957 	vm_map_kernel_flags_t  *vmk_flags,
1958 	vm_map_size_t           size)
1959 {
1960 	struct kmem_range effective_range = {};
1961 	if (map == kernel_map) {
1962 		kmem_range_id_t range_id = vmk_flags->vmkf_range_id;
1963 		effective_range = kmem_ranges[range_id];
1964 
1965 		if (startup_phase >= STARTUP_SUB_KMEM) {
1966 			/*
1967 			 * Hint provided by caller is zeroed as the range is restricted to a
1968 			 * subset of the entire kernel_map VA, which could put the hint outside
1969 			 * the range, causing vm_map_store_find_space to fail.
1970 			 */
1971 			*address = 0ull;
1972 			assert(range_id != 0);
1973 #if ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
1974 			/*
1975 			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
1976 			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
1977 			 * use the entire range. Two small allocations from different fronts
1978 			 * (left and right) can only meet when memory in that range is
1979 			 * entirely exhausted.
1980 			 */
1981 			if (size >= KMEM_SMALLMAP_THRESHOLD) {
1982 				effective_range = kmem_large_ranges[range_id];
1983 			}
1984 #else /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
1985 			vm_map_adjust_direction(vmk_flags, size);
1986 #endif /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
1987 		}
1988 	} else {
1989 		/*
1990 		 * If the minimum is 0, bump it up by PAGE_SIZE.  We want to limit
1991 		 * allocations at PAGEZERO to explicit requests, since its normal
1992 		 * use is to catch dereferences of NULL.  Many applications also
1993 		 * treat pointers with a value of 0 as special, and suddenly
1994 		 * having address 0 contain usable memory would tend to confuse
1995 		 * those applications.
1996 		 */
1997 		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
1998 		effective_range.max_address = map->max_offset;
1999 	}
2000 
2001 	return effective_range;
2002 }
2003 
2004 /*
2005  *	Routine:	vm_map_locate_space
2006  *	Purpose:
2007  *		Finds a range in the specified virtual address map,
2008  *		returning the start of that range,
2009  *		as well as the entry right before it.
2010  */
2011 kern_return_t
2012 vm_map_locate_space(
2013 	vm_map_t                map,
2014 	vm_map_size_t           size,
2015 	vm_map_offset_t         mask,
2016 	vm_map_kernel_flags_t   vmk_flags,
2017 	vm_map_offset_t        *start_inout,
2018 	vm_map_entry_t         *entry_out)
2019 {
2020 	struct kmem_range effective_range = {};
2021 	vm_map_size_t   guard_offset;
2022 	vm_map_offset_t hint, limit;
2023 	vm_map_entry_t  entry;
2024 
2025 	/*
2026 	 * Only supported by vm_map_enter() with a fixed address.
2027 	 */
2028 	assert(!vmk_flags.vmkf_beyond_max);
2029 
2030 	if (__improbable(map->wait_for_space)) {
2031 		/*
2032 	 * support for "wait_for_space" is minimal;
2033 	 * its only consumer is the ipc_kernel_copy_map.
2034 		 */
2035 		assert(!map->holelistenabled &&
2036 		    !vmk_flags.vmkf_last_free &&
2037 		    !vmk_flags.vmkf_keep_map_locked &&
2038 		    !vmk_flags.vmkf_map_jit &&
2039 		    !vmk_flags.vmkf_random_address &&
2040 		    *start_inout <= map->min_offset);
2041 	} else if (vmk_flags.vmkf_last_free) {
2042 		assert(!vmk_flags.vmkf_map_jit &&
2043 		    !vmk_flags.vmkf_random_address);
2044 	}
2045 
2046 	if (vmk_flags.vmkf_guard_before) {
2047 		guard_offset = VM_MAP_PAGE_SIZE(map);
2048 		assert(size > guard_offset);
2049 		size -= guard_offset;
2050 	} else {
2051 		assert(size != 0);
2052 		guard_offset = 0;
2053 	}
2054 
2055 	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size);
2056 #if XNU_TARGET_OS_OSX
2057 	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2058 		assert(map != kernel_map);
2059 		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2060 	}
2061 #endif /* XNU_TARGET_OS_OSX */
2062 
2063 again:
2064 	if (vmk_flags.vmkf_last_free) {
2065 		hint = *start_inout;
2066 
2067 		if (hint == 0 || hint > effective_range.max_address) {
2068 			hint = effective_range.max_address;
2069 		}
2070 		if (hint <= effective_range.min_address) {
2071 			return KERN_NO_SPACE;
2072 		}
2073 		limit = effective_range.min_address;
2074 	} else {
2075 		hint = *start_inout;
2076 
2077 		if (vmk_flags.vmkf_map_jit) {
2078 			if (map->jit_entry_exists &&
2079 			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2080 				return KERN_INVALID_ARGUMENT;
2081 			}
2082 			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2083 				vmk_flags.vmkf_random_address = true;
2084 			}
2085 		}
2086 
2087 		if (vmk_flags.vmkf_random_address) {
2088 			kern_return_t kr;
2089 
2090 			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2091 			if (kr != KERN_SUCCESS) {
2092 				return kr;
2093 			}
2094 		}
2095 #if XNU_TARGET_OS_OSX
2096 		else if ((hint == 0 || hint == vm_map_min(map)) &&
2097 		    !map->disable_vmentry_reuse &&
2098 		    map->vmmap_high_start != 0) {
2099 			hint = map->vmmap_high_start;
2100 		}
2101 #endif /* XNU_TARGET_OS_OSX */
2102 
2103 		if (hint < effective_range.min_address) {
2104 			hint = effective_range.min_address;
2105 		}
2106 		if (effective_range.max_address <= hint) {
2107 			return KERN_NO_SPACE;
2108 		}
2109 
2110 		limit = effective_range.max_address;
2111 	}
2112 	entry = vm_map_store_find_space(map,
2113 	    hint, limit, vmk_flags.vmkf_last_free,
2114 	    guard_offset, size, mask,
2115 	    start_inout);
2116 
2117 	if (__improbable(entry == NULL)) {
2118 		if (map->wait_for_space &&
2119 		    guard_offset + size <=
2120 		    effective_range.max_address - effective_range.min_address) {
2121 			assert_wait((event_t)map, THREAD_ABORTSAFE);
2122 			vm_map_unlock(map);
2123 			thread_block(THREAD_CONTINUE_NULL);
2124 			vm_map_lock(map);
2125 			goto again;
2126 		}
2127 		return KERN_NO_SPACE;
2128 	}
2129 
2130 	if (entry_out) {
2131 		*entry_out = entry;
2132 	}
2133 	return KERN_SUCCESS;
2134 }
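/*
 * Illustrative sketch of a caller (vm_map_find_space() below is the
 * real one); "size" and "mask" are hypothetical, and the map lock must
 * already be held:
 *
 *	vm_map_offset_t start = 0;
 *	vm_map_entry_t  prev;
 *	kern_return_t   kr;
 *
 *	kr = vm_map_locate_space(map, size, mask,
 *	    VM_MAP_KERNEL_FLAGS_NONE, &start, &prev);
 *
 * On KERN_SUCCESS, the located range is [start, start + size) and
 * "prev" is the entry to link the new entry after.
 */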
2135 
2136 
2137 /*
2138  *	Routine:	vm_map_find_space
2139  *	Purpose:
2140  *		Allocate a range in the specified virtual address map,
2141  *		returning the entry allocated for that range.
2142  *		Used by kmem_alloc, etc.
2143  *
2144 	 *		The map must NOT be locked.  It will be returned locked
2145  *		on KERN_SUCCESS, unlocked on failure.
2146  *
2147  *		If an entry is allocated, the object/offset fields
2148  *		are initialized to zero.
2149  */
2150 kern_return_t
2151 vm_map_find_space(
2152 	vm_map_t                map,
2153 	vm_map_offset_t         hint_address,
2154 	vm_map_size_t           size,
2155 	vm_map_offset_t         mask,
2156 	vm_map_kernel_flags_t   vmk_flags,
2157 	vm_map_entry_t          *o_entry)       /* OUT */
2158 {
2159 	vm_map_entry_t          new_entry, entry;
2160 	kern_return_t           kr;
2161 
2162 	if (size == 0) {
2163 		return KERN_INVALID_ARGUMENT;
2164 	}
2165 
2166 	new_entry = vm_map_entry_create(map);
2167 	new_entry->use_pmap = true;
2168 	new_entry->protection = VM_PROT_DEFAULT;
2169 	new_entry->max_protection = VM_PROT_ALL;
2170 
2171 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2172 		new_entry->map_aligned = true;
2173 	}
2174 	if (vmk_flags.vmkf_permanent) {
2175 		new_entry->permanent = true;
2176 	}
2177 
2178 	vm_map_lock(map);
2179 
2180 	kr = vm_map_locate_space(map, size, mask, vmk_flags,
2181 	    &hint_address, &entry);
2182 	if (kr != KERN_SUCCESS) {
2183 		vm_map_unlock(map);
2184 		vm_map_entry_dispose(new_entry);
2185 		return kr;
2186 	}
2187 	new_entry->vme_start = hint_address;
2188 	new_entry->vme_end = hint_address + size;
2189 
2190 	/*
2191 	 *	At this point,
2192 	 *
2193 	 *	- new_entry's "vme_start" and "vme_end" should define
2194 	 *	  the endpoints of the available new range,
2195 	 *
2196 	 *	- and "entry" should refer to the region before
2197 	 *	  the new range,
2198 	 *
2199 	 *	- and the map should still be locked.
2200 	 */
2201 
2202 	assert(page_aligned(new_entry->vme_start));
2203 	assert(page_aligned(new_entry->vme_end));
2204 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2205 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2206 
2207 	/*
2208 	 *	Insert the new entry into the list
2209 	 */
2210 
2211 	vm_map_store_entry_link(map, entry, new_entry, VM_MAP_KERNEL_FLAGS_NONE);
2212 	map->size += size;
2213 
2214 	/*
2215 	 *	Update the lookup hint
2216 	 */
2217 	SAVE_HINT_MAP_WRITE(map, new_entry);
2218 
2219 	*o_entry = new_entry;
2220 	return KERN_SUCCESS;
2221 }
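/*
 * Illustrative sketch of a vm_map_find_space() caller ("some_object"
 * is hypothetical).  The map is passed in unlocked and comes back
 * locked on success, with the object/offset fields left zeroed for the
 * caller to fill in:
 *
 *	vm_map_entry_t entry;
 *	kern_return_t  kr;
 *
 *	kr = vm_map_find_space(kernel_map, 0, PAGE_SIZE, 0,
 *	    VM_MAP_KERNEL_FLAGS_NONE, &entry);
 *	if (kr == KERN_SUCCESS) {
 *		VME_OBJECT_SET(entry, some_object, false, 0);
 *		vm_map_unlock(kernel_map);
 *	}
 */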
2222 
2223 int vm_map_pmap_enter_print = FALSE;
2224 int vm_map_pmap_enter_enable = FALSE;
2225 
2226 /*
2227  *	Routine:	vm_map_pmap_enter [internal only]
2228  *
2229  *	Description:
2230  *		Force pages from the specified object to be entered into
2231  *		the pmap at the specified address if they are present.
2232 	 *		As soon as a page is not found in the object, the scan ends.
2233  *
2234  *	Returns:
2235  *		Nothing.
2236  *
2237  *	In/out conditions:
2238  *		The source map should not be locked on entry.
2239  */
2240 __unused static void
2241 vm_map_pmap_enter(
2242 	vm_map_t                map,
2243 	vm_map_offset_t         addr,
2244 	vm_map_offset_t         end_addr,
2245 	vm_object_t             object,
2246 	vm_object_offset_t      offset,
2247 	vm_prot_t               protection)
2248 {
2249 	int                     type_of_fault;
2250 	kern_return_t           kr;
2251 	struct vm_object_fault_info fault_info = {};
2252 
2253 	if (map->pmap == 0) {
2254 		return;
2255 	}
2256 
2257 	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2258 
2259 	while (addr < end_addr) {
2260 		vm_page_t       m;
2261 
2262 
2263 		/*
2264 		 * TODO:
2265 		 * From vm_map_enter(), we come into this function without the map
2266 		 * lock held or the object lock held.
2267 		 * We haven't taken a reference on the object either.
2268 		 * We should do a proper lookup on the map to make sure
2269 		 * that things are sane before we go locking objects that
2270 		 * could have been deallocated from under us.
2271 		 */
2272 
2273 		vm_object_lock(object);
2274 
2275 		m = vm_page_lookup(object, offset);
2276 
2277 		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
2278 		    (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_absent))) {
2279 			vm_object_unlock(object);
2280 			return;
2281 		}
2282 
2283 		if (vm_map_pmap_enter_print) {
2284 			printf("vm_map_pmap_enter:");
2285 			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2286 			    map, (unsigned long long)addr, object, (unsigned long long)offset);
2287 		}
2288 		type_of_fault = DBG_CACHE_HIT_FAULT;
2289 		kr = vm_fault_enter(m, map->pmap,
2290 		    addr,
2291 		    PAGE_SIZE, 0,
2292 		    protection, protection,
2293 		    VM_PAGE_WIRED(m),
2294 		    FALSE,                 /* change_wiring */
2295 		    VM_KERN_MEMORY_NONE,                 /* tag - not wiring */
2296 		    &fault_info,
2297 		    NULL,                  /* need_retry */
2298 		    &type_of_fault);
2299 
2300 		vm_object_unlock(object);
2301 
2302 		offset += PAGE_SIZE_64;
2303 		addr += PAGE_SIZE;
2304 	}
2305 }
2306 
2307 #define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
2308 kern_return_t
2309 vm_map_random_address_for_size(
2310 	vm_map_t                map,
2311 	vm_map_offset_t        *address,
2312 	vm_map_size_t           size,
2313 	vm_map_kernel_flags_t   vmk_flags)
2314 {
2315 	kern_return_t   kr = KERN_SUCCESS;
2316 	int             tries = 0;
2317 	vm_map_offset_t random_addr = 0;
2318 	vm_map_offset_t hole_end;
2319 
2320 	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
2321 	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
2322 	vm_map_size_t   vm_hole_size = 0;
2323 	vm_map_size_t   addr_space_size;
2324 	struct kmem_range effective_range = vm_map_get_range(map, address, &vmk_flags, size);
2325 
2326 	addr_space_size = effective_range.max_address - effective_range.min_address;
2327 	if (size >= addr_space_size) {
2328 		return KERN_NO_SPACE;
2329 	}
2330 	addr_space_size -= size;
2331 
2332 	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));
2333 
2334 	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2335 		if (startup_phase < STARTUP_SUB_ZALLOC) {
2336 			random_addr = (vm_map_offset_t)early_random();
2337 		} else {
2338 			random_addr = (vm_map_offset_t)random();
2339 		}
2340 		random_addr <<= VM_MAP_PAGE_SHIFT(map);
2341 		random_addr = vm_map_trunc_page(
2342 			effective_range.min_address + (random_addr % addr_space_size),
2343 			VM_MAP_PAGE_MASK(map));
2344 
2345 #if CONFIG_PROB_GZALLOC
2346 		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
2347 			continue;
2348 		}
2349 #endif /* CONFIG_PROB_GZALLOC */
2350 
2351 		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
2352 			if (prev_entry == vm_map_to_entry(map)) {
2353 				next_entry = vm_map_first_entry(map);
2354 			} else {
2355 				next_entry = prev_entry->vme_next;
2356 			}
2357 			if (next_entry == vm_map_to_entry(map)) {
2358 				hole_end = vm_map_max(map);
2359 			} else {
2360 				hole_end = next_entry->vme_start;
2361 			}
2362 			vm_hole_size = hole_end - random_addr;
2363 			if (vm_hole_size >= size) {
2364 				*address = random_addr;
2365 				break;
2366 			}
2367 		}
2368 		tries++;
2369 	}
2370 
2371 	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2372 		kr = KERN_NO_SPACE;
2373 	}
2374 	return kr;
2375 }
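/*
 * Illustrative sketch: vm_map_locate_space() above is the typical
 * caller, roughly
 *
 *	kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
 *	if (kr != KERN_SUCCESS) {
 *		return kr;
 *	}
 *
 * after which it searches for free space starting at the randomized
 * "hint".
 */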
2376 
2377 static boolean_t
2378 vm_memory_malloc_no_cow(
2379 	int alias)
2380 {
2381 	uint64_t alias_mask;
2382 
2383 	if (alias > 63) {
2384 		return FALSE;
2385 	}
2386 
2387 	alias_mask = 1ULL << alias;
2388 	if (alias_mask & vm_memory_malloc_no_cow_mask) {
2389 		return TRUE;
2390 	}
2391 	return FALSE;
2392 }
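/*
 * Illustrative example (hypothetical mask value): with
 * vm_memory_malloc_no_cow_mask == (1ULL << VM_MEMORY_MALLOC_MEDIUM),
 * only that one alias is reported as "no copy-on-write".  Aliases
 * above 63 never are, since they fall outside the 64-bit mask.
 */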
2393 
2394 uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2395 uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2396 /*
2397  *	Routine:	vm_map_enter
2398  *
2399  *	Description:
2400  *		Allocate a range in the specified virtual address map.
2401  *		The resulting range will refer to memory defined by
2402  *		the given memory object and offset into that object.
2403  *
2404  *		Arguments are as defined in the vm_map call.
2405  */
2406 static unsigned int vm_map_enter_restore_successes = 0;
2407 static unsigned int vm_map_enter_restore_failures = 0;
2408 kern_return_t
2409 vm_map_enter(
2410 	vm_map_t                map,
2411 	vm_map_offset_t         *address,       /* IN/OUT */
2412 	vm_map_size_t           size,
2413 	vm_map_offset_t         mask,
2414 	int                     flags,
2415 	vm_map_kernel_flags_t   vmk_flags,
2416 	vm_tag_t                alias,
2417 	vm_object_t             object,
2418 	vm_object_offset_t      offset,
2419 	boolean_t               needs_copy,
2420 	vm_prot_t               cur_protection,
2421 	vm_prot_t               max_protection,
2422 	vm_inherit_t            inheritance)
2423 {
2424 	vm_map_entry_t          entry, new_entry;
2425 	vm_map_offset_t         start, tmp_start, tmp_offset;
2426 	vm_map_offset_t         end, tmp_end;
2427 	vm_map_offset_t         tmp2_start, tmp2_end;
2428 	vm_map_offset_t         step;
2429 	kern_return_t           result = KERN_SUCCESS;
2430 	boolean_t               map_locked = FALSE;
2431 	boolean_t               pmap_empty = TRUE;
2432 	boolean_t               new_mapping_established = FALSE;
2433 	boolean_t               keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2434 	boolean_t               anywhere = ((flags & VM_FLAGS_ANYWHERE) != 0);
2435 	boolean_t               purgable = ((flags & VM_FLAGS_PURGABLE) != 0);
2436 	boolean_t               overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0);
2437 	boolean_t               no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0);
2438 	const boolean_t         is_submap = vmk_flags.vmkf_submap;
2439 	boolean_t               permanent = (((flags & VM_FLAGS_PERMANENT) != 0) || vmk_flags.vmkf_permanent);
2440 	const boolean_t         no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2441 	const boolean_t         entry_for_jit = vmk_flags.vmkf_map_jit;
2442 	boolean_t               iokit_acct = vmk_flags.vmkf_iokit_acct;
2443 	boolean_t               resilient_codesign = ((flags & VM_FLAGS_RESILIENT_CODESIGN) != 0);
2444 	boolean_t               resilient_media = ((flags & VM_FLAGS_RESILIENT_MEDIA) != 0);
2445 	unsigned int            superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT);
2446 	vm_tag_t                user_alias;
2447 	kern_return_t           kr;
2448 	boolean_t               clear_map_aligned = FALSE;
2449 	vm_map_size_t           chunk_size = 0;
2450 	vm_object_t             caller_object;
2451 	VM_MAP_ZAP_DECLARE(zap_old_list);
2452 	VM_MAP_ZAP_DECLARE(zap_new_list);
2453 
2454 	caller_object = object;
2455 
2456 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2457 
2458 	if (flags & VM_FLAGS_4GB_CHUNK) {
2459 #if defined(__LP64__)
2460 		chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2461 #else /* __LP64__ */
2462 		chunk_size = ANON_CHUNK_SIZE;
2463 #endif /* __LP64__ */
2464 	} else {
2465 		chunk_size = ANON_CHUNK_SIZE;
2466 	}
2467 
2468 	if (superpage_size) {
2469 		switch (superpage_size) {
2470 			/*
2471 			 * Note that the current implementation only supports
2472 			 * a single size for superpages, SUPERPAGE_SIZE, per
2473 			 * architecture. As soon as more sizes are supposed
2474 			 * to be supported, SUPERPAGE_SIZE has to be replaced
2475 			 * with a lookup of the size depending on superpage_size.
2476 			 */
2477 #ifdef __x86_64__
2478 		case SUPERPAGE_SIZE_ANY:
2479 			/* handle it like 2 MB and round up to page size */
2480 			size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2481 			OS_FALLTHROUGH;
2482 		case SUPERPAGE_SIZE_2MB:
2483 			break;
2484 #endif
2485 		default:
2486 			return KERN_INVALID_ARGUMENT;
2487 		}
2488 		mask = SUPERPAGE_SIZE - 1;
2489 		if (size & (SUPERPAGE_SIZE - 1)) {
2490 			return KERN_INVALID_ARGUMENT;
2491 		}
2492 		inheritance = VM_INHERIT_NONE;  /* fork() children won't inherit superpages */
2493 	}
2494 
2495 
2496 	if ((cur_protection & VM_PROT_WRITE) &&
2497 	    (cur_protection & VM_PROT_EXECUTE) &&
2498 #if XNU_TARGET_OS_OSX
2499 	    map->pmap != kernel_pmap &&
2500 	    (cs_process_global_enforcement() ||
2501 	    (vmk_flags.vmkf_cs_enforcement_override
2502 	    ? vmk_flags.vmkf_cs_enforcement
2503 	    : (vm_map_cs_enforcement(map)
2504 #if __arm64__
2505 	    || !VM_MAP_IS_EXOTIC(map)
2506 #endif /* __arm64__ */
2507 	    ))) &&
2508 #endif /* XNU_TARGET_OS_OSX */
2509 	    (VM_MAP_POLICY_WX_FAIL(map) ||
2510 	    VM_MAP_POLICY_WX_STRIP_X(map)) &&
2511 	    !entry_for_jit) {
2512 		boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2513 
2514 		DTRACE_VM3(cs_wx,
2515 		    uint64_t, 0,
2516 		    uint64_t, 0,
2517 		    vm_prot_t, cur_protection);
2518 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2519 		    proc_selfpid(),
2520 		    (current_task()->bsd_info
2521 		    ? proc_name_address(current_task()->bsd_info)
2522 		    : "?"),
2523 		    __FUNCTION__,
2524 		    (vm_protect_wx_fail ? "failing" : "turning off execute"));
2525 		cur_protection &= ~VM_PROT_EXECUTE;
2526 		if (vm_protect_wx_fail) {
2527 			return KERN_PROTECTION_FAILURE;
2528 		}
2529 	}
2530 
2531 	/*
2532 	 * If the task has requested executable lockdown,
2533 	 * deny any new executable mapping.
2534 	 */
2535 	if (map->map_disallow_new_exec == TRUE) {
2536 		if (cur_protection & VM_PROT_EXECUTE) {
2537 			return KERN_PROTECTION_FAILURE;
2538 		}
2539 	}
2540 
2541 	if (resilient_codesign) {
2542 		assert(!is_submap);
2543 		int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2544 		if ((cur_protection | max_protection) & reject_prot) {
2545 			return KERN_PROTECTION_FAILURE;
2546 		}
2547 	}
2548 
2549 	if (resilient_media) {
2550 		assert(!is_submap);
2551 //		assert(!needs_copy);
2552 		if (object != VM_OBJECT_NULL &&
2553 		    !object->internal) {
2554 			/*
2555 			 * This mapping is directly backed by an external
2556 			 * memory manager (e.g. a vnode pager for a file):
2557 			 * we would not have any safe place to inject
2558 			 * a zero-filled page if an actual page is not
2559 			 * available, without possibly impacting the actual
2560 			 * contents of the mapped object (e.g. the file),
2561 			 * so we can't provide any media resiliency here.
2562 			 */
2563 			return KERN_INVALID_ARGUMENT;
2564 		}
2565 	}
2566 
2567 	if (is_submap) {
2568 		if (purgable) {
2569 			/* submaps can not be purgeable */
2570 			return KERN_INVALID_ARGUMENT;
2571 		}
2572 		if (object == VM_OBJECT_NULL) {
2573 			/* submaps can not be created lazily */
2574 			return KERN_INVALID_ARGUMENT;
2575 		}
2576 	}
2577 	if (vmk_flags.vmkf_already) {
2578 		/*
2579 		 * VM_FLAGS_ALREADY says that it's OK if the same mapping
2580 		 * is already present.  For it to be meaningful, the requested
2581 		 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
2582 		 * we shouldn't try to remove what was mapped there first
2583 		 * (!VM_FLAGS_OVERWRITE).
2584 		 */
2585 		if ((flags & VM_FLAGS_ANYWHERE) ||
2586 		    (flags & VM_FLAGS_OVERWRITE)) {
2587 			return KERN_INVALID_ARGUMENT;
2588 		}
2589 	}
2590 
2591 	if (size == 0 ||
2592 	    (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
2593 		*address = 0;
2594 		return KERN_INVALID_ARGUMENT;
2595 	}
2596 
2597 	if (map->pmap == kernel_pmap) {
2598 		user_alias = VM_KERN_MEMORY_NONE;
2599 	} else {
2600 		user_alias = alias;
2601 	}
2602 
2603 	if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
2604 		chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
2605 	}
2606 
2607 #define RETURN(value)   { result = value; goto BailOut; }
2608 
2609 	assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
2610 	assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
2611 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
2612 		assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
2613 		assertf(page_aligned(size), "0x%llx", (uint64_t)size);
2614 	}
2615 
2616 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2617 	    !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
2618 		/*
2619 		 * In most cases, the caller rounds the size up to the
2620 		 * map's page size.
2621 		 * If we get a size that is explicitly not map-aligned here,
2622 		 * we'll have to respect the caller's wish and mark the
2623 		 * mapping as "not map-aligned" to avoid tripping the
2624 		 * map alignment checks later.
2625 		 */
2626 		clear_map_aligned = TRUE;
2627 	}
2628 	if (!anywhere &&
2629 	    VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2630 	    !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
2631 		/*
2632 		 * We've been asked to map at a fixed address and that
2633 		 * address is not aligned to the map's specific alignment.
2634 		 * The caller should know what it's doing (i.e. most likely
2635 		 * mapping some fragmented copy map, transferring memory from
2636 		 * a VM map with a different alignment), so clear map_aligned
2637 		 * for this new VM map entry and proceed.
2638 		 */
2639 		clear_map_aligned = TRUE;
2640 	}
2641 
2642 	/*
2643 	 * Only zero-fill objects are allowed to be purgable.
2644 	 * LP64todo - limit purgable objects to 32-bits for now
2645 	 */
2646 	if (purgable &&
2647 	    (offset != 0 ||
2648 	    (object != VM_OBJECT_NULL &&
2649 	    (object->vo_size != size ||
2650 	    object->purgable == VM_PURGABLE_DENY))
2651 	    || size > ANON_MAX_SIZE)) { /* LP64todo: remove when dp capable */
2652 		return KERN_INVALID_ARGUMENT;
2653 	}
2654 
2655 	start = *address;
2656 
2657 	if (anywhere) {
2658 		vm_map_lock(map);
2659 		map_locked = TRUE;
2660 
2661 		if (flags & VM_FLAGS_RANDOM_ADDR) {
2662 			vmk_flags.vmkf_random_address = true;
2663 		}
2664 
2665 		/*
2666 		 * Default to data range when an explicit range id isn't specified
2667 		 */
2668 		if ((vmk_flags.vmkf_range_id == KMEM_RANGE_ID_NONE) &&
2669 		    (map == kernel_map)) {
2670 			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
2671 		}
2672 
2673 		result = vm_map_locate_space(map, size, mask, vmk_flags,
2674 		    &start, &entry);
2675 		if (result != KERN_SUCCESS) {
2676 			goto BailOut;
2677 		}
2678 
2679 		*address = start;
2680 		end = start + size;
2681 		assert(VM_MAP_PAGE_ALIGNED(*address,
2682 		    VM_MAP_PAGE_MASK(map)));
2683 	} else {
2684 		vm_map_offset_t effective_min_offset, effective_max_offset;
2685 
2686 		effective_min_offset = map->min_offset;
2687 		effective_max_offset = map->max_offset;
2688 
2689 		if (vmk_flags.vmkf_beyond_max) {
2690 			/*
2691 			 * Allow an insertion beyond the map's max offset.
2692 			 */
2693 			effective_max_offset = 0x00000000FFFFF000ULL;
2694 #if !defined(__arm__)
2695 			if (vm_map_is_64bit(map)) {
2696 				effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2697 			}
2698 #endif  /* __arm__ */
2699 #if XNU_TARGET_OS_OSX
2700 		} else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2701 			effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2702 #endif /* XNU_TARGET_OS_OSX */
2703 		}
2704 
2705 		if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2706 		    !overwrite &&
2707 		    user_alias == VM_MEMORY_REALLOC) {
2708 			/*
2709 			 * Force realloc() to switch to a new allocation,
2710 			 * to prevent 4k-fragmented virtual ranges.
2711 			 */
2712 //			DEBUG4K_ERROR("no realloc in place");
2713 			return KERN_NO_SPACE;
2714 		}
2715 
2716 		/*
2717 		 *	Verify that:
2718 		 *		the address doesn't itself violate
2719 		 *		the mask requirement.
2720 		 */
2721 
2722 		vm_map_lock(map);
2723 		map_locked = TRUE;
2724 		if ((start & mask) != 0) {
2725 			RETURN(KERN_NO_SPACE);
2726 		}
2727 
2728 		/*
2729 		 *	...	the address is within bounds
2730 		 */
2731 
2732 		end = start + size;
2733 
2734 		if ((start < effective_min_offset) ||
2735 		    (end > effective_max_offset) ||
2736 		    (start >= end)) {
2737 			RETURN(KERN_INVALID_ADDRESS);
2738 		}
2739 
2740 		if (overwrite) {
2741 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
2742 
2743 			/*
2744 			 * Fixed mapping and "overwrite" flag: attempt to
2745 			 * remove all existing mappings in the specified
2746 			 * address range, saving them in our "zap_old_list".
2747 			 *
2748 			 * This avoids releasing the VM map lock in
2749 			 * vm_map_entry_delete() and allows atomicity
2750 			 * when we want to replace some mappings with a new one.
2751 			 * It also allows us to restore the old VM mappings if the
2752 			 * new mapping fails.
2753 			 */
2754 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2755 
2756 			if (vmk_flags.vmkf_overwrite_immutable) {
2757 				/* we can overwrite immutable mappings */
2758 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2759 			}
2760 			(void)vm_map_delete(map, start, end, remove_flags,
2761 			    KMEM_GUARD_NONE, &zap_old_list);
2762 		}
2763 
2764 		/*
2765 		 *	...	the starting address isn't allocated
2766 		 */
2767 
2768 		if (vm_map_lookup_entry(map, start, &entry)) {
2769 			if (!(vmk_flags.vmkf_already)) {
2770 				RETURN(KERN_NO_SPACE);
2771 			}
2772 			/*
2773 			 * Check if what's already there is what we want.
2774 			 */
2775 			tmp_start = start;
2776 			tmp_offset = offset;
2777 			if (entry->vme_start < start) {
2778 				tmp_start -= start - entry->vme_start;
2779 				tmp_offset -= start - entry->vme_start;
2780 			}
2781 			for (; entry->vme_start < end;
2782 			    entry = entry->vme_next) {
2783 				/*
2784 				 * Check if the mapping's attributes
2785 				 * match the existing map entry.
2786 				 */
2787 				if (entry == vm_map_to_entry(map) ||
2788 				    entry->vme_start != tmp_start ||
2789 				    entry->is_sub_map != is_submap ||
2790 				    VME_OFFSET(entry) != tmp_offset ||
2791 				    entry->needs_copy != needs_copy ||
2792 				    entry->protection != cur_protection ||
2793 				    entry->max_protection != max_protection ||
2794 				    entry->inheritance != inheritance ||
2795 				    entry->iokit_acct != iokit_acct ||
2796 				    VME_ALIAS(entry) != alias) {
2797 					/* not the same mapping ! */
2798 					RETURN(KERN_NO_SPACE);
2799 				}
2800 				/*
2801 				 * Check if the same object is being mapped.
2802 				 */
2803 				if (is_submap) {
2804 					if (VME_SUBMAP(entry) !=
2805 					    (vm_map_t) object) {
2806 						/* not the same submap */
2807 						RETURN(KERN_NO_SPACE);
2808 					}
2809 				} else {
2810 					if (VME_OBJECT(entry) != object) {
2811 						/* not the same VM object... */
2812 						vm_object_t obj2;
2813 
2814 						obj2 = VME_OBJECT(entry);
2815 						if ((obj2 == VM_OBJECT_NULL ||
2816 						    obj2->internal) &&
2817 						    (object == VM_OBJECT_NULL ||
2818 						    object->internal)) {
2819 							/*
2820 							 * ... but both are
2821 							 * anonymous memory,
2822 							 * so equivalent.
2823 							 */
2824 						} else {
2825 							RETURN(KERN_NO_SPACE);
2826 						}
2827 					}
2828 				}
2829 
2830 				tmp_offset += entry->vme_end - entry->vme_start;
2831 				tmp_start += entry->vme_end - entry->vme_start;
2832 				if (entry->vme_end >= end) {
2833 					/* reached the end of our mapping */
2834 					break;
2835 				}
2836 			}
2837 			/* it all matches:  let's use what's already there ! */
2838 			RETURN(KERN_MEMORY_PRESENT);
2839 		}
2840 
2841 		/*
2842 		 *	...	the next region doesn't overlap the
2843 		 *		end point.
2844 		 */
2845 
2846 		if ((entry->vme_next != vm_map_to_entry(map)) &&
2847 		    (entry->vme_next->vme_start < end)) {
2848 			RETURN(KERN_NO_SPACE);
2849 		}
2850 	}
2851 
2852 	/*
2853 	 *	At this point,
2854 	 *		"start" and "end" should define the endpoints of the
2855 	 *			available new range, and
2856 	 *		"entry" should refer to the region before the new
2857 	 *			range, and
2858 	 *
2859 	 *		the map should be locked.
2860 	 */
2861 
2862 	/*
2863 	 *	See whether we can avoid creating a new entry (and object) by
2864 	 *	extending one of our neighbors.  [So far, we only attempt to
2865 	 *	extend from below.]  Note that we can never extend/join
2866 	 *	purgable objects because they need to remain distinct
2867 	 *	entities in order to implement their "volatile object"
2868 	 *	semantics.
2869 	 */
2870 
2871 	if (purgable ||
2872 	    entry_for_jit ||
2873 	    vm_memory_malloc_no_cow(user_alias)) {
2874 		if (object == VM_OBJECT_NULL) {
2875 			object = vm_object_allocate(size);
2876 			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2877 			object->true_share = FALSE;
2878 			if (purgable) {
2879 				task_t owner;
2880 				object->purgable = VM_PURGABLE_NONVOLATILE;
2881 				if (map->pmap == kernel_pmap) {
2882 					/*
2883 					 * Purgeable mappings made in a kernel
2884 					 * map are "owned" by the kernel itself
2885 					 * rather than the current user task
2886 					 * because they're likely to be used by
2887 					 * more than this user task (see
2888 					 * execargs_purgeable_allocate(), for
2889 					 * example).
2890 					 */
2891 					owner = kernel_task;
2892 				} else {
2893 					owner = current_task();
2894 				}
2895 				assert(object->vo_owner == NULL);
2896 				assert(object->resident_page_count == 0);
2897 				assert(object->wired_page_count == 0);
2898 				vm_object_lock(object);
2899 				vm_purgeable_nonvolatile_enqueue(object, owner);
2900 				vm_object_unlock(object);
2901 			}
2902 			offset = (vm_object_offset_t)0;
2903 		}
2904 	} else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
2905 		/* no coalescing if address space uses sub-pages */
2906 	} else if ((is_submap == FALSE) &&
2907 	    (object == VM_OBJECT_NULL) &&
2908 	    (entry != vm_map_to_entry(map)) &&
2909 	    (entry->vme_end == start) &&
2910 	    (!entry->is_shared) &&
2911 	    (!entry->is_sub_map) &&
2912 	    (!entry->in_transition) &&
2913 	    (!entry->needs_wakeup) &&
2914 	    (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
2915 	    (entry->protection == cur_protection) &&
2916 	    (entry->max_protection == max_protection) &&
2917 	    (entry->inheritance == inheritance) &&
2918 	    ((user_alias == VM_MEMORY_REALLOC) ||
2919 	    (VME_ALIAS(entry) == alias)) &&
2920 	    (entry->no_cache == no_cache) &&
2921 	    (entry->permanent == permanent) &&
2922 	    /* no coalescing for immutable executable mappings */
2923 	    !((entry->protection & VM_PROT_EXECUTE) &&
2924 	    entry->permanent) &&
2925 	    (!entry->superpage_size && !superpage_size) &&
2926 	    /*
2927 	     * No coalescing if not map-aligned, to avoid propagating
2928 	     * that condition any further than needed:
2929 	     */
2930 	    (!entry->map_aligned || !clear_map_aligned) &&
2931 	    (!entry->zero_wired_pages) &&
2932 	    (!entry->used_for_jit && !entry_for_jit) &&
2933 	    (!entry->pmap_cs_associated) &&
2934 	    (entry->iokit_acct == iokit_acct) &&
2935 	    (!entry->vme_resilient_codesign) &&
2936 	    (!entry->vme_resilient_media) &&
2937 	    (!entry->vme_atomic) &&
2938 	    (entry->vme_no_copy_on_read == no_copy_on_read) &&
2939 
2940 	    ((entry->vme_end - entry->vme_start) + size <=
2941 	    (user_alias == VM_MEMORY_REALLOC ?
2942 	    ANON_CHUNK_SIZE :
2943 	    NO_COALESCE_LIMIT)) &&
2944 
2945 	    (entry->wired_count == 0)) {        /* implies user_wired_count == 0 */
2946 		if (vm_object_coalesce(VME_OBJECT(entry),
2947 		    VM_OBJECT_NULL,
2948 		    VME_OFFSET(entry),
2949 		    (vm_object_offset_t) 0,
2950 		    (vm_map_size_t)(entry->vme_end - entry->vme_start),
2951 		    (vm_map_size_t)(end - entry->vme_end))) {
2952 			/*
2953 			 *	Coalesced the two objects - can extend
2954 			 *	the previous map entry to include the
2955 			 *	new range.
2956 			 */
2957 			map->size += (end - entry->vme_end);
2958 			assert(entry->vme_start < end);
2959 			assert(VM_MAP_PAGE_ALIGNED(end,
2960 			    VM_MAP_PAGE_MASK(map)));
2961 			if (__improbable(vm_debug_events)) {
2962 				DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
2963 			}
2964 			entry->vme_end = end;
2965 			if (map->holelistenabled) {
2966 				vm_map_store_update_first_free(map, entry, TRUE);
2967 			} else {
2968 				vm_map_store_update_first_free(map, map->first_free, TRUE);
2969 			}
2970 			new_mapping_established = TRUE;
2971 			RETURN(KERN_SUCCESS);
2972 		}
2973 	}
2974 
2975 	step = superpage_size ? SUPERPAGE_SIZE : (end - start);
2976 	new_entry = NULL;
2977 
2978 	if (vmk_flags.vmkf_submap_adjust) {
2979 		vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
2980 		offset = start;
2981 	}
2982 
2983 	for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
2984 		tmp2_end = tmp2_start + step;
2985 		/*
2986 		 *	Create a new entry
2987 		 *
2988 		 * XXX FBDP
2989 		 * The reserved "page zero" in each process's address space can
2990 		 * be arbitrarily large.  Splitting it into separate objects and
2991 		 * therefore different VM map entries serves no purpose and just
2992 		 * slows down operations on the VM map, so let's not split the
2993 		 * allocation into chunks if the max protection is NONE.  That
2994 		 * memory should never be accessible, so it will never get to the
2995 		 * default pager.
2996 		 */
2997 		tmp_start = tmp2_start;
2998 		if (object == VM_OBJECT_NULL &&
2999 		    size > chunk_size &&
3000 		    max_protection != VM_PROT_NONE &&
3001 		    superpage_size == 0) {
3002 			tmp_end = tmp_start + chunk_size;
3003 		} else {
3004 			tmp_end = tmp2_end;
3005 		}
3006 		do {
3007 			if (!is_submap &&
3008 			    object != VM_OBJECT_NULL &&
3009 			    object->internal &&
3010 			    offset + (tmp_end - tmp_start) > object->vo_size) {
3011 //				printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3012 				DTRACE_VM5(vm_map_enter_overmap,
3013 				    vm_map_t, map,
3014 				    vm_map_address_t, tmp_start,
3015 				    vm_map_address_t, tmp_end,
3016 				    vm_object_offset_t, offset,
3017 				    vm_object_size_t, object->vo_size);
3018 			}
3019 			new_entry = vm_map_entry_insert(map,
3020 			    entry, tmp_start, tmp_end,
3021 			    object, offset, vmk_flags,
3022 			    needs_copy,
3023 			    cur_protection, max_protection,
3024 			    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3025 			    VM_INHERIT_NONE : inheritance),
3026 			    no_cache,
3027 			    permanent,
3028 			    superpage_size,
3029 			    clear_map_aligned,
3030 			    alias);
3031 
3032 			assert((object != kernel_object) || (VM_KERN_MEMORY_NONE != alias));
3033 
3034 			if (resilient_codesign) {
3035 				int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3036 				if (!((cur_protection | max_protection) & reject_prot)) {
3037 					new_entry->vme_resilient_codesign = TRUE;
3038 				}
3039 			}
3040 
3041 			if (resilient_media &&
3042 			    (object == VM_OBJECT_NULL ||
3043 			    object->internal)) {
3044 				new_entry->vme_resilient_media = TRUE;
3045 			}
3046 
3047 			assert(!new_entry->iokit_acct);
3048 			if (!is_submap &&
3049 			    object != VM_OBJECT_NULL &&
3050 			    (object->purgable != VM_PURGABLE_DENY ||
3051 			    object->vo_ledger_tag)) {
3052 				assert(new_entry->use_pmap);
3053 				assert(!new_entry->iokit_acct);
3054 				/*
3055 				 * Turn off pmap accounting since
3056 				 * purgeable (or tagged) objects have their
3057 				 * own ledgers.
3058 				 */
3059 				new_entry->use_pmap = FALSE;
3060 			} else if (!is_submap &&
3061 			    iokit_acct &&
3062 			    object != VM_OBJECT_NULL &&
3063 			    object->internal) {
3064 				/* alternate accounting */
3065 				assert(!new_entry->iokit_acct);
3066 				assert(new_entry->use_pmap);
3067 				new_entry->iokit_acct = TRUE;
3068 				new_entry->use_pmap = FALSE;
3069 				DTRACE_VM4(
3070 					vm_map_iokit_mapped_region,
3071 					vm_map_t, map,
3072 					vm_map_offset_t, new_entry->vme_start,
3073 					vm_map_offset_t, new_entry->vme_end,
3074 					int, VME_ALIAS(new_entry));
3075 				vm_map_iokit_mapped_region(
3076 					map,
3077 					(new_entry->vme_end -
3078 					new_entry->vme_start));
3079 			} else if (!is_submap) {
3080 				assert(!new_entry->iokit_acct);
3081 				assert(new_entry->use_pmap);
3082 			}
3083 
3084 			if (is_submap) {
3085 				vm_map_t        submap;
3086 				boolean_t       submap_is_64bit;
3087 				boolean_t       use_pmap;
3088 
3089 				assert(new_entry->is_sub_map);
3090 				assert(!new_entry->use_pmap);
3091 				assert(!new_entry->iokit_acct);
3092 				submap = (vm_map_t) object;
3093 				submap_is_64bit = vm_map_is_64bit(submap);
3094 				use_pmap = vmk_flags.vmkf_nested_pmap;
3095 #ifndef NO_NESTED_PMAP
3096 				if (use_pmap && submap->pmap == NULL) {
3097 					ledger_t ledger = map->pmap->ledger;
3098 					/* we need a sub pmap to nest... */
3099 					submap->pmap = pmap_create_options(ledger, 0,
3100 					    submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3101 					if (submap->pmap == NULL) {
3102 						/* let's proceed without nesting... */
3103 					}
3104 #if     defined(__arm__) || defined(__arm64__)
3105 					else {
3106 						pmap_set_nested(submap->pmap);
3107 					}
3108 #endif
3109 				}
3110 				if (use_pmap && submap->pmap != NULL) {
3111 					if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3112 						DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3113 						kr = KERN_FAILURE;
3114 					} else {
3115 						kr = pmap_nest(map->pmap,
3116 						    submap->pmap,
3117 						    tmp_start,
3118 						    tmp_end - tmp_start);
3119 					}
3120 					if (kr != KERN_SUCCESS) {
3121 						printf("vm_map_enter: "
3122 						    "pmap_nest(0x%llx,0x%llx) "
3123 						    "error 0x%x\n",
3124 						    (long long)tmp_start,
3125 						    (long long)tmp_end,
3126 						    kr);
3127 					} else {
3128 						/* we're now nested ! */
3129 						new_entry->use_pmap = TRUE;
3130 						pmap_empty = FALSE;
3131 					}
3132 				}
3133 #endif /* NO_NESTED_PMAP */
3134 			}
3135 			entry = new_entry;
3136 
3137 			if (superpage_size) {
3138 				vm_page_t pages, m;
3139 				vm_object_t sp_object;
3140 				vm_object_offset_t sp_offset;
3141 
3142 				VME_OFFSET_SET(entry, 0);
3143 
3144 				/* allocate one superpage */
3145 				kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3146 				if (kr != KERN_SUCCESS) {
3147 					/* deallocate whole range... */
3148 					new_mapping_established = TRUE;
3149 					/* ... but only up to "tmp_end" */
3150 					size -= end - tmp_end;
3151 					RETURN(kr);
3152 				}
3153 
3154 				/* create one vm_object per superpage */
3155 				sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3156 				sp_object->phys_contiguous = TRUE;
3157 				sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3158 				VME_OBJECT_SET(entry, sp_object, false, 0);
3159 				assert(entry->use_pmap);
3160 
3161 				/* enter the base pages into the object */
3162 				vm_object_lock(sp_object);
3163 				for (sp_offset = 0;
3164 				    sp_offset < SUPERPAGE_SIZE;
3165 				    sp_offset += PAGE_SIZE) {
3166 					m = pages;
3167 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3168 					pages = NEXT_PAGE(m);
3169 					*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3170 					vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3171 				}
3172 				vm_object_unlock(sp_object);
3173 			}
3174 		} while (tmp_end != tmp2_end &&
3175 		    (tmp_start = tmp_end) &&
3176 		    (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3177 		    tmp_end + chunk_size : tmp2_end));
3178 	}
3179 
3180 	new_mapping_established = TRUE;
3181 
3182 BailOut:
3183 	assert(map_locked == TRUE);
3184 
3185 	/*
3186 	 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3187 	 * If we have identified and possibly established the new mapping(s),
3188 	 * make sure we did not go beyond the address space limit.
3189 	 */
3190 	if (result == KERN_SUCCESS) {
3191 		if (map->size_limit != RLIM_INFINITY &&
3192 		    map->size > map->size_limit) {
3193 			/*
3194 			 * Establishing the requested mappings would exceed
3195 			 * the process's RLIMIT_AS limit: fail with
3196 			 * KERN_NO_SPACE.
3197 			 */
3198 			result = KERN_NO_SPACE;
3199 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3200 			    proc_selfpid(),
3201 			    (current_task()->bsd_info
3202 			    ? proc_name_address(current_task()->bsd_info)
3203 			    : "?"),
3204 			    __FUNCTION__,
3205 			    (uint64_t) map->size,
3206 			    (uint64_t) map->size_limit);
3207 			DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3208 			    vm_map_size_t, map->size,
3209 			    uint64_t, map->size_limit);
3210 			vm_map_enter_RLIMIT_AS_count++;
3211 		} else if (map->data_limit != RLIM_INFINITY &&
3212 		    map->size > map->data_limit) {
3213 			/*
3214 			 * Establishing the requested mappings would exceed
3215 			 * the process's RLIMIT_DATA limit: fail with
3216 			 * KERN_NO_SPACE.
3217 			 */
3218 			result = KERN_NO_SPACE;
3219 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3220 			    proc_selfpid(),
3221 			    (current_task()->bsd_info
3222 			    ? proc_name_address(current_task()->bsd_info)
3223 			    : "?"),
3224 			    __FUNCTION__,
3225 			    (uint64_t) map->size,
3226 			    (uint64_t) map->data_limit);
3227 			DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3228 			    vm_map_size_t, map->size,
3229 			    uint64_t, map->data_limit);
3230 			vm_map_enter_RLIMIT_DATA_count++;
3231 		}
3232 	}
3233 
3234 	if (result == KERN_SUCCESS) {
3235 		vm_prot_t pager_prot;
3236 		memory_object_t pager;
3237 
3238 #if DEBUG
3239 		if (pmap_empty &&
3240 		    !(vmk_flags.vmkf_no_pmap_check)) {
3241 			assert(pmap_is_empty(map->pmap,
3242 			    *address,
3243 			    *address + size));
3244 		}
3245 #endif /* DEBUG */
3246 
3247 		/*
3248 		 * For "named" VM objects, let the pager know that the
3249 		 * memory object is being mapped.  Some pagers need to keep
3250 		 * track of this, to know when they can reclaim the memory
3251 		 * object, for example.
3252 		 * VM calls memory_object_map() for each mapping (specifying
3253 		 * the protection of each mapping) and calls
3254 		 * memory_object_last_unmap() when all the mappings are gone.
3255 		 */
3256 		pager_prot = max_protection;
3257 		if (needs_copy) {
3258 			/*
3259 			 * Copy-On-Write mapping: won't modify
3260 			 * the memory object.
3261 			 */
3262 			pager_prot &= ~VM_PROT_WRITE;
3263 		}
3264 		if (!is_submap &&
3265 		    object != VM_OBJECT_NULL &&
3266 		    object->named &&
3267 		    object->pager != MEMORY_OBJECT_NULL) {
3268 			vm_object_lock(object);
3269 			pager = object->pager;
3270 			if (object->named &&
3271 			    pager != MEMORY_OBJECT_NULL) {
3272 				assert(object->pager_ready);
3273 				vm_object_mapping_wait(object, THREAD_UNINT);
3274 				vm_object_mapping_begin(object);
3275 				vm_object_unlock(object);
3276 
3277 				kr = memory_object_map(pager, pager_prot);
3278 				assert(kr == KERN_SUCCESS);
3279 
3280 				vm_object_lock(object);
3281 				vm_object_mapping_end(object);
3282 			}
3283 			vm_object_unlock(object);
3284 		}
3285 	}
3286 
3287 	assert(map_locked == TRUE);
3288 
3289 	if (!keep_map_locked) {
3290 		vm_map_unlock(map);
3291 		map_locked = FALSE;
3292 	}
3293 
3294 	/*
3295 	 * We must not be holding the map lock when we enter this block.
3296 	 */
3297 
3298 	if (result == KERN_SUCCESS) {
3299 		/*	Wire down the new entry if the user
3300 		 *	requested all new map entries be wired.
3301 		 */
3302 		if ((map->wiring_required) || (superpage_size)) {
3303 			assert(!keep_map_locked);
3304 			pmap_empty = FALSE; /* pmap won't be empty */
3305 			kr = vm_map_wire_kernel(map, start, end,
3306 			    new_entry->protection, VM_KERN_MEMORY_MLOCK,
3307 			    TRUE);
3308 			result = kr;
3309 		}
3310 
3311 	}
3312 
3313 	if (result != KERN_SUCCESS) {
3314 		if (new_mapping_established) {
3315 			/*
3316 			 * The caller had an extra reference on the VM object
3317 			 * it gave us.
3318 			 * We've transferred that reference to the mapping we
3319 			 * just established but we're about to undo that mapping
3320 			 * and release that reference.
3321 			 * The caller expects its reference to be consumed on
3322 			 * success only, so we have to get the extra reference
3323 			 * back for the caller.
3324 			 */
3325 			vm_object_reference(caller_object);
3326 
3327 			/*
3328 			 * We have to get rid of the new mappings since we
3329 			 * won't make them available to the user.
3330 			 * Try to do that atomically, to minimize the risk
3331 			 * that someone else creates new mappings in that range.
3332 			 */
3333 
3334 			if (!map_locked) {
3335 				vm_map_lock(map);
3336 				map_locked = TRUE;
3337 			}
3338 			(void)vm_map_delete(map, *address, *address + size,
3339 			    VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
3340 			    KMEM_GUARD_NONE, &zap_new_list);
3341 		}
3342 
3343 		if (vm_map_zap_first_entry(&zap_old_list)) {
3344 			vm_map_entry_t entry1, entry2;
3345 
3346 			/*
3347 			 * The new mapping failed.  Attempt to restore
3348 			 * the old mappings, saved in the "zap_old_map".
3349 			 */
3350 			if (!map_locked) {
3351 				vm_map_lock(map);
3352 				map_locked = TRUE;
3353 			}
3354 
3355 			/* first check if the coast is still clear */
3356 			start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3357 			end   = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3358 
3359 			if (vm_map_lookup_entry(map, start, &entry1) ||
3360 			    vm_map_lookup_entry(map, end, &entry2) ||
3361 			    entry1 != entry2) {
3362 				/*
3363 				 * Part of that range has already been
3364 				 * re-mapped:  we can't restore the old
3365 				 * mappings...
3366 				 */
3367 				vm_map_enter_restore_failures++;
3368 			} else {
3369 				/*
3370 				 * Transfer the saved map entries from
3371 				 * "zap_old_map" to the original "map",
3372 				 * inserting them all after "entry1".
3373 				 */
3374 				while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3375 					vm_map_size_t entry_size;
3376 
3377 					entry_size = (entry2->vme_end -
3378 					    entry2->vme_start);
3379 					vm_map_store_entry_link(map, entry1, entry2,
3380 					    VM_MAP_KERNEL_FLAGS_NONE);
3381 					map->size += entry_size;
3382 					entry1 = entry2;
3383 				}
3384 				if (map->wiring_required) {
3385 					/*
3386 					 * XXX TODO: we should rewire the
3387 					 * old pages here...
3388 					 */
3389 				}
3390 				vm_map_enter_restore_successes++;
3391 			}
3392 		}
3393 	}
3394 
3395 	/*
3396 	 * The caller is responsible for releasing the lock if it requested to
3397 	 * keep the map locked.
3398 	 */
3399 	if (map_locked && !keep_map_locked) {
3400 		vm_map_unlock(map);
3401 	}
3402 
3403 	vm_map_zap_dispose(&zap_old_list);
3404 	vm_map_zap_dispose(&zap_new_list);
3405 
3406 	return result;
3407 
3408 #undef  RETURN
3409 }
3410 
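/*
 * The RLIMIT_AS / RLIMIT_DATA enforcement in the BailOut path of
 * vm_map_enter() above boils down to a simple predicate evaluated once the
 * mapping has been entered and "map->size" updated.  A rough standalone
 * sketch of that predicate (hypothetical helper name, for illustration
 * only):
 *
 *	static inline bool
 *	vm_map_size_over_limit(vm_map_size_t map_size, uint64_t limit)
 *	{
 *		return limit != RLIM_INFINITY && (uint64_t)map_size > limit;
 *	}
 *
 * When the predicate fires, vm_map_enter()'s error path tears the new
 * mapping down again via the zap lists, so the caller never observes a
 * partially established, over-limit mapping.
 */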
3411 #if __arm64__
3412 extern const struct memory_object_pager_ops fourk_pager_ops;
3413 kern_return_t
3414 vm_map_enter_fourk(
3415 	vm_map_t                map,
3416 	vm_map_offset_t         *address,       /* IN/OUT */
3417 	vm_map_size_t           size,
3418 	vm_map_offset_t         mask,
3419 	int                     flags,
3420 	vm_map_kernel_flags_t   vmk_flags,
3421 	vm_tag_t                alias,
3422 	vm_object_t             object,
3423 	vm_object_offset_t      offset,
3424 	boolean_t               needs_copy,
3425 	vm_prot_t               cur_protection,
3426 	vm_prot_t               max_protection,
3427 	vm_inherit_t            inheritance)
3428 {
3429 	vm_map_entry_t          entry, new_entry;
3430 	vm_map_offset_t         start, fourk_start;
3431 	vm_map_offset_t         end, fourk_end;
3432 	vm_map_size_t           fourk_size;
3433 	kern_return_t           result = KERN_SUCCESS;
3434 	boolean_t               map_locked = FALSE;
3435 	boolean_t               pmap_empty = TRUE;
3436 	boolean_t               new_mapping_established = FALSE;
3437 	boolean_t               keep_map_locked = vmk_flags.vmkf_keep_map_locked;
3438 	boolean_t               anywhere = ((flags & VM_FLAGS_ANYWHERE) != 0);
3439 	boolean_t               purgable = ((flags & VM_FLAGS_PURGABLE) != 0);
3440 	boolean_t               overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0);
3441 	boolean_t               no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0);
3442 	const boolean_t         is_submap = vmk_flags.vmkf_submap;
3443 	boolean_t               permanent = (((flags & VM_FLAGS_PERMANENT) != 0) || vmk_flags.vmkf_permanent);
3444 	const boolean_t         entry_for_jit = vmk_flags.vmkf_map_jit;
3445 //	boolean_t		iokit_acct = vmk_flags.vmkf_iokit_acct;
3446 	unsigned int            superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT);
3447 	vm_map_offset_t         effective_min_offset, effective_max_offset;
3448 	kern_return_t           kr;
3449 	boolean_t               clear_map_aligned = FALSE;
3450 	memory_object_t         fourk_mem_obj;
3451 	vm_object_t             fourk_object;
3452 	vm_map_offset_t         fourk_pager_offset;
3453 	int                     fourk_pager_index_start, fourk_pager_index_num;
3454 	int                     cur_idx;
3455 	boolean_t               fourk_copy;
3456 	vm_object_t             copy_object;
3457 	vm_object_offset_t      copy_offset;
3458 	VM_MAP_ZAP_DECLARE(zap_list);
3459 
3460 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
3461 		panic("%s:%d", __FUNCTION__, __LINE__);
3462 	}
3463 	fourk_mem_obj = MEMORY_OBJECT_NULL;
3464 	fourk_object = VM_OBJECT_NULL;
3465 
3466 	if (superpage_size) {
3467 		return KERN_NOT_SUPPORTED;
3468 	}
3469 
3470 	if ((cur_protection & VM_PROT_WRITE) &&
3471 	    (cur_protection & VM_PROT_EXECUTE) &&
3472 #if XNU_TARGET_OS_OSX
3473 	    map->pmap != kernel_pmap &&
3474 	    (vm_map_cs_enforcement(map)
3475 #if __arm64__
3476 	    || !VM_MAP_IS_EXOTIC(map)
3477 #endif /* __arm64__ */
3478 	    ) &&
3479 #endif /* XNU_TARGET_OS_OSX */
3480 	    !entry_for_jit) {
3481 		DTRACE_VM3(cs_wx,
3482 		    uint64_t, 0,
3483 		    uint64_t, 0,
3484 		    vm_prot_t, cur_protection);
3485 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
3486 		    "turning off execute\n",
3487 		    proc_selfpid(),
3488 		    (current_task()->bsd_info
3489 		    ? proc_name_address(current_task()->bsd_info)
3490 		    : "?"),
3491 		    __FUNCTION__);
3492 		cur_protection &= ~VM_PROT_EXECUTE;
3493 	}
3494 
3495 	/*
3496 	 * If the task has requested executable lockdown,
3497 	 * deny any new executable mapping.
3498 	 */
3499 	if (map->map_disallow_new_exec == TRUE) {
3500 		if (cur_protection & VM_PROT_EXECUTE) {
3501 			return KERN_PROTECTION_FAILURE;
3502 		}
3503 	}
3504 
3505 	if (is_submap) {
3506 		return KERN_NOT_SUPPORTED;
3507 	}
3508 	if (vmk_flags.vmkf_already) {
3509 		return KERN_NOT_SUPPORTED;
3510 	}
3511 	if (purgable || entry_for_jit) {
3512 		return KERN_NOT_SUPPORTED;
3513 	}
3514 
3515 	effective_min_offset = map->min_offset;
3516 
3517 	if (vmk_flags.vmkf_beyond_max) {
3518 		return KERN_NOT_SUPPORTED;
3519 	} else {
3520 		effective_max_offset = map->max_offset;
3521 	}
3522 
3523 	if (size == 0 ||
3524 	    (offset & FOURK_PAGE_MASK) != 0) {
3525 		*address = 0;
3526 		return KERN_INVALID_ARGUMENT;
3527 	}
3528 
3529 #define RETURN(value)   { result = value; goto BailOut; }
3530 
3531 	assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK));
3532 	assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK));
3533 
3534 	if (!anywhere && overwrite) {
3535 		return KERN_NOT_SUPPORTED;
3536 	}
3537 
3538 	fourk_start = *address;
3539 	fourk_size = size;
3540 	fourk_end = fourk_start + fourk_size;
3541 
3542 	start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map));
3543 	end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map));
3544 	size = end - start;
3545 
3546 	if (anywhere) {
3547 		return KERN_NOT_SUPPORTED;
3548 	} else {
3549 		/*
3550 		 *	Verify that:
3551 		 *		the address doesn't itself violate
3552 		 *		the mask requirement.
3553 		 */
3554 
3555 		vm_map_lock(map);
3556 		map_locked = TRUE;
3557 		if ((start & mask) != 0) {
3558 			RETURN(KERN_NO_SPACE);
3559 		}
3560 
3561 		/*
3562 		 *	...	the address is within bounds
3563 		 */
3564 
3565 		end = start + size;
3566 
3567 		if ((start < effective_min_offset) ||
3568 		    (end > effective_max_offset) ||
3569 		    (start >= end)) {
3570 			RETURN(KERN_INVALID_ADDRESS);
3571 		}
3572 
3573 		/*
3574 		 *	...	the starting address isn't allocated
3575 		 */
3576 		if (vm_map_lookup_entry(map, start, &entry)) {
3577 			vm_object_t cur_object, shadow_object;
3578 
3579 			/*
3580 			 * We might already some 4K mappings
3581 			 * We might already have some 4K mappings
3582 			 */
3583 
3584 			if (entry->vme_end - entry->vme_start
3585 			    != SIXTEENK_PAGE_SIZE) {
3586 				RETURN(KERN_NO_SPACE);
3587 			}
3588 			if (entry->is_sub_map) {
3589 				RETURN(KERN_NO_SPACE);
3590 			}
3591 			if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
3592 				RETURN(KERN_NO_SPACE);
3593 			}
3594 
3595 			/* go all the way down the shadow chain */
3596 			cur_object = VME_OBJECT(entry);
3597 			vm_object_lock(cur_object);
3598 			while (cur_object->shadow != VM_OBJECT_NULL) {
3599 				shadow_object = cur_object->shadow;
3600 				vm_object_lock(shadow_object);
3601 				vm_object_unlock(cur_object);
3602 				cur_object = shadow_object;
3603 				shadow_object = VM_OBJECT_NULL;
3604 			}
3605 			if (cur_object->internal ||
3606 			    cur_object->pager == NULL) {
3607 				vm_object_unlock(cur_object);
3608 				RETURN(KERN_NO_SPACE);
3609 			}
3610 			if (cur_object->pager->mo_pager_ops
3611 			    != &fourk_pager_ops) {
3612 				vm_object_unlock(cur_object);
3613 				RETURN(KERN_NO_SPACE);
3614 			}
3615 			fourk_object = cur_object;
3616 			fourk_mem_obj = fourk_object->pager;
3617 
3618 			/* keep the "4K" object alive */
3619 			vm_object_reference_locked(fourk_object);
3620 			memory_object_reference(fourk_mem_obj);
3621 			vm_object_unlock(fourk_object);
3622 
3623 			/* merge permissions */
3624 			entry->protection |= cur_protection;
3625 			entry->max_protection |= max_protection;
3626 
3627 			if ((entry->protection & VM_PROT_WRITE) &&
3628 			    (entry->protection & VM_PROT_ALLEXEC) &&
3629 			    fourk_binary_compatibility_unsafe &&
3630 			    fourk_binary_compatibility_allow_wx) {
3631 				/* write+execute: need to be "jit" */
3632 				entry->used_for_jit = TRUE;
3633 			}
3634 			goto map_in_fourk_pager;
3635 		}
3636 
3637 		/*
3638 		 *	...	the next region doesn't overlap the
3639 		 *		end point.
3640 		 */
3641 
3642 		if ((entry->vme_next != vm_map_to_entry(map)) &&
3643 		    (entry->vme_next->vme_start < end)) {
3644 			RETURN(KERN_NO_SPACE);
3645 		}
3646 	}
3647 
3648 	/*
3649 	 *	At this point,
3650 	 *		"start" and "end" should define the endpoints of the
3651 	 *			available new range, and
3652 	 *		"entry" should refer to the region before the new
3653 	 *			range, and
3654 	 *
3655 	 *		the map should be locked.
3656 	 */
3657 
3658 	/* create a new "4K" pager */
3659 	fourk_mem_obj = fourk_pager_create();
3660 	fourk_object = fourk_pager_to_vm_object(fourk_mem_obj);
3661 	assert(fourk_object);
3662 
3663 	/* keep the "4K" object alive */
3664 	vm_object_reference(fourk_object);
3665 
3666 	/* create a "copy" object, to map the "4K" object copy-on-write */
3667 	fourk_copy = TRUE;
3668 	result = vm_object_copy_strategically(fourk_object,
3669 	    0,
3670 	    end - start,
3671 	    &copy_object,
3672 	    &copy_offset,
3673 	    &fourk_copy);
3674 	assert(result == KERN_SUCCESS);
3675 	assert(copy_object != VM_OBJECT_NULL);
3676 	assert(copy_offset == 0);
3677 
3678 	/* map the "4K" pager's copy object */
3679 	new_entry = vm_map_entry_insert(map,
3680 	    entry,
3681 	    vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)),
3682 	    vm_map_round_page(end, VM_MAP_PAGE_MASK(map)),
3683 	    copy_object,
3684 	    0,                      /* offset */
3685 	    vmk_flags,
3686 	    FALSE,                  /* needs_copy */
3687 	    cur_protection, max_protection,
3688 	    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3689 	    VM_INHERIT_NONE : inheritance),
3690 	    no_cache,
3691 	    permanent,
3692 	    superpage_size,
3693 	    clear_map_aligned,
3694 	    alias);
3695 	entry = new_entry;
3696 
3697 #if VM_MAP_DEBUG_FOURK
3698 	if (vm_map_debug_fourk) {
3699 		printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n",
3700 		    map,
3701 		    (uint64_t) entry->vme_start,
3702 		    (uint64_t) entry->vme_end,
3703 		    fourk_mem_obj);
3704 	}
3705 #endif /* VM_MAP_DEBUG_FOURK */
3706 
3707 	new_mapping_established = TRUE;
3708 
3709 map_in_fourk_pager:
3710 	/* "map" the original "object" where it belongs in the "4K" pager */
3711 	fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK);
3712 	fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE);
3713 	if (fourk_size > SIXTEENK_PAGE_SIZE) {
3714 		fourk_pager_index_num = 4;
3715 	} else {
3716 		fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE);
3717 	}
3718 	if (fourk_pager_index_start + fourk_pager_index_num > 4) {
3719 		fourk_pager_index_num = 4 - fourk_pager_index_start;
3720 	}
3721 	for (cur_idx = 0;
3722 	    cur_idx < fourk_pager_index_num;
3723 	    cur_idx++) {
3724 		vm_object_t             old_object;
3725 		vm_object_offset_t      old_offset;
3726 
3727 		kr = fourk_pager_populate(fourk_mem_obj,
3728 		    TRUE,                       /* overwrite */
3729 		    fourk_pager_index_start + cur_idx,
3730 		    object,
3731 		    (object
3732 		    ? (offset +
3733 		    (cur_idx * FOURK_PAGE_SIZE))
3734 		    : 0),
3735 		    &old_object,
3736 		    &old_offset);
3737 #if VM_MAP_DEBUG_FOURK
3738 		if (vm_map_debug_fourk) {
3739 			if (old_object == (vm_object_t) -1 &&
3740 			    old_offset == (vm_object_offset_t) -1) {
3741 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3742 				    "pager [%p:0x%llx] "
3743 				    "populate[%d] "
3744 				    "[object:%p,offset:0x%llx]\n",
3745 				    map,
3746 				    (uint64_t) entry->vme_start,
3747 				    (uint64_t) entry->vme_end,
3748 				    fourk_mem_obj,
3749 				    VME_OFFSET(entry),
3750 				    fourk_pager_index_start + cur_idx,
3751 				    object,
3752 				    (object
3753 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3754 				    : 0));
3755 			} else {
3756 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3757 				    "pager [%p:0x%llx] "
3758 				    "populate[%d] [object:%p,offset:0x%llx] "
3759 				    "old [%p:0x%llx]\n",
3760 				    map,
3761 				    (uint64_t) entry->vme_start,
3762 				    (uint64_t) entry->vme_end,
3763 				    fourk_mem_obj,
3764 				    VME_OFFSET(entry),
3765 				    fourk_pager_index_start + cur_idx,
3766 				    object,
3767 				    (object
3768 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3769 				    : 0),
3770 				    old_object,
3771 				    old_offset);
3772 			}
3773 		}
3774 #endif /* VM_MAP_DEBUG_FOURK */
3775 
3776 		assert(kr == KERN_SUCCESS);
3777 		if (object != old_object &&
3778 		    object != VM_OBJECT_NULL &&
3779 		    object != (vm_object_t) -1) {
3780 			vm_object_reference(object);
3781 		}
3782 		if (object != old_object &&
3783 		    old_object != VM_OBJECT_NULL &&
3784 		    old_object != (vm_object_t) -1) {
3785 			vm_object_deallocate(old_object);
3786 		}
3787 	}
3788 
3789 BailOut:
3790 	assert(map_locked == TRUE);
3791 
3792 	if (result == KERN_SUCCESS) {
3793 		vm_prot_t pager_prot;
3794 		memory_object_t pager;
3795 
3796 #if DEBUG
3797 		if (pmap_empty &&
3798 		    !(vmk_flags.vmkf_no_pmap_check)) {
3799 			assert(pmap_is_empty(map->pmap,
3800 			    *address,
3801 			    *address + size));
3802 		}
3803 #endif /* DEBUG */
3804 
3805 		/*
3806 		 * For "named" VM objects, let the pager know that the
3807 		 * memory object is being mapped.  Some pagers need to keep
3808 		 * track of this, to know when they can reclaim the memory
3809 		 * object, for example.
3810 		 * VM calls memory_object_map() for each mapping (specifying
3811 		 * the protection of each mapping) and calls
3812 		 * memory_object_last_unmap() when all the mappings are gone.
3813 		 */
3814 		pager_prot = max_protection;
3815 		if (needs_copy) {
3816 			/*
3817 			 * Copy-On-Write mapping: won't modify
3818 			 * the memory object.
3819 			 */
3820 			pager_prot &= ~VM_PROT_WRITE;
3821 		}
3822 		if (!is_submap &&
3823 		    object != VM_OBJECT_NULL &&
3824 		    object->named &&
3825 		    object->pager != MEMORY_OBJECT_NULL) {
3826 			vm_object_lock(object);
3827 			pager = object->pager;
3828 			if (object->named &&
3829 			    pager != MEMORY_OBJECT_NULL) {
3830 				assert(object->pager_ready);
3831 				vm_object_mapping_wait(object, THREAD_UNINT);
3832 				vm_object_mapping_begin(object);
3833 				vm_object_unlock(object);
3834 
3835 				kr = memory_object_map(pager, pager_prot);
3836 				assert(kr == KERN_SUCCESS);
3837 
3838 				vm_object_lock(object);
3839 				vm_object_mapping_end(object);
3840 			}
3841 			vm_object_unlock(object);
3842 		}
3843 		if (!is_submap &&
3844 		    fourk_object != VM_OBJECT_NULL &&
3845 		    fourk_object->named &&
3846 		    fourk_object->pager != MEMORY_OBJECT_NULL) {
3847 			vm_object_lock(fourk_object);
3848 			pager = fourk_object->pager;
3849 			if (fourk_object->named &&
3850 			    pager != MEMORY_OBJECT_NULL) {
3851 				assert(fourk_object->pager_ready);
3852 				vm_object_mapping_wait(fourk_object,
3853 				    THREAD_UNINT);
3854 				vm_object_mapping_begin(fourk_object);
3855 				vm_object_unlock(fourk_object);
3856 
3857 				kr = memory_object_map(pager, VM_PROT_READ);
3858 				assert(kr == KERN_SUCCESS);
3859 
3860 				vm_object_lock(fourk_object);
3861 				vm_object_mapping_end(fourk_object);
3862 			}
3863 			vm_object_unlock(fourk_object);
3864 		}
3865 	}
3866 
3867 	if (fourk_object != VM_OBJECT_NULL) {
3868 		vm_object_deallocate(fourk_object);
3869 		fourk_object = VM_OBJECT_NULL;
3870 		memory_object_deallocate(fourk_mem_obj);
3871 		fourk_mem_obj = MEMORY_OBJECT_NULL;
3872 	}
3873 
3874 	assert(map_locked == TRUE);
3875 
3876 	if (!keep_map_locked) {
3877 		vm_map_unlock(map);
3878 		map_locked = FALSE;
3879 	}
3880 
3881 	/*
3882 	 * We must not be holding the map lock when we enter this block.
3883 	 */
3884 
3885 	if (result == KERN_SUCCESS) {
3886 		/*	Wire down the new entry if the user
3887 		 *	requested all new map entries be wired.
3888 		 */
3889 		if ((map->wiring_required) || (superpage_size)) {
3890 			assert(!keep_map_locked);
3891 			pmap_empty = FALSE; /* pmap won't be empty */
3892 			kr = vm_map_wire_kernel(map, start, end,
3893 			    new_entry->protection, VM_KERN_MEMORY_MLOCK,
3894 			    TRUE);
3895 			result = kr;
3896 		}
3897 
3898 	}
3899 
3900 	if (result != KERN_SUCCESS) {
3901 		if (new_mapping_established) {
3902 			/*
3903 			 * We have to get rid of the new mappings since we
3904 			 * won't make them available to the user.
3905 			 * Try to do that atomically, to minimize the risk
3906 			 * that someone else creates new mappings in that range.
3907 			 */
3908 
3909 			if (!map_locked) {
3910 				vm_map_lock(map);
3911 				map_locked = TRUE;
3912 			}
3913 			(void)vm_map_delete(map, *address, *address + size,
3914 			    VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
3915 			    KMEM_GUARD_NONE, &zap_list);
3916 		}
3917 	}
3918 
3919 	/*
3920 	 * The caller is responsible for releasing the lock if it requested to
3921 	 * keep the map locked.
3922 	 */
3923 	if (map_locked && !keep_map_locked) {
3924 		vm_map_unlock(map);
3925 	}
3926 
3927 	vm_map_zap_dispose(&zap_list);
3928 
3929 	return result;
3930 
3931 #undef  RETURN
3932 }
3933 #endif /* __arm64__ */
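/*
 * The slot arithmetic in vm_map_enter_fourk() above maps each 4K sub-page
 * of a 16K native page to one of four pager slots.  Roughly (with
 * SIXTEENK_PAGE_MASK == 0x3fff and FOURK_PAGE_SIZE == 0x1000, and MIN used
 * only for brevity):
 *
 *	index_start = (fourk_start & SIXTEENK_PAGE_MASK) / FOURK_PAGE_SIZE;
 *	index_num   = MIN(4, fourk_size / FOURK_PAGE_SIZE);
 *	if (index_start + index_num > 4)
 *		index_num = 4 - index_start;	// clamp to one 16K page
 *
 * For example, a 4K mapping at offset 0x2000 within a 16K page populates
 * slot 2 only.
 */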
3934 
3935 /*
3936  * Counters for the prefault optimization.
3937  */
3938 int64_t vm_prefault_nb_pages = 0;
3939 int64_t vm_prefault_nb_bailout = 0;
3940 
3941 static kern_return_t
3942 vm_map_enter_mem_object_helper(
3943 	vm_map_t                target_map,
3944 	vm_map_offset_t         *address,
3945 	vm_map_size_t           initial_size,
3946 	vm_map_offset_t         mask,
3947 	int                     flags,
3948 	vm_map_kernel_flags_t   vmk_flags,
3949 	vm_tag_t                tag,
3950 	ipc_port_t              port,
3951 	vm_object_offset_t      offset,
3952 	boolean_t               copy,
3953 	vm_prot_t               cur_protection,
3954 	vm_prot_t               max_protection,
3955 	vm_inherit_t            inheritance,
3956 	upl_page_list_ptr_t     page_list,
3957 	unsigned int            page_list_count)
3958 {
3959 	vm_map_address_t        map_addr;
3960 	vm_map_size_t           map_size;
3961 	vm_object_t             object;
3962 	vm_object_size_t        size;
3963 	kern_return_t           result;
3964 	boolean_t               mask_cur_protection, mask_max_protection;
3965 	boolean_t               kernel_prefault, try_prefault = (page_list_count != 0);
3966 	vm_map_offset_t         offset_in_mapping = 0;
3967 #if __arm64__
3968 	boolean_t               fourk = vmk_flags.vmkf_fourk;
3969 #endif /* __arm64__ */
3970 
3971 	if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
3972 		/* XXX TODO4K prefaulting depends on page size... */
3973 		try_prefault = FALSE;
3974 	}
3975 
3976 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
3977 
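	/*
	 * VM_PROT_IS_MASK in cur/max_protection means "intersect the
	 * requested protections with the named entry's protections"
	 * rather than demanding them outright; the flag is stripped
	 * here and honored further down, once the named entry has
	 * been looked up.
	 */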
3978 	mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
3979 	mask_max_protection = max_protection & VM_PROT_IS_MASK;
3980 	cur_protection &= ~VM_PROT_IS_MASK;
3981 	max_protection &= ~VM_PROT_IS_MASK;
3982 
3983 	/*
3984 	 * Check arguments for validity
3985 	 */
3986 	if ((target_map == VM_MAP_NULL) ||
3987 	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
3988 	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
3989 	    (inheritance > VM_INHERIT_LAST_VALID) ||
3990 	    (try_prefault && (copy || !page_list)) ||
3991 	    initial_size == 0) {
3992 		return KERN_INVALID_ARGUMENT;
3993 	}
3994 
3995 	/*
3996 	 * Redirect kernel_map mappings to the kmem data range (KMEM_RANGE_ID_DATA)
3997 	 */
3998 	if (target_map == kernel_map) {
3999 		vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
4000 	}
4001 
4002 #if __arm64__
4003 	if (fourk && VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4004 		/* no "fourk" if map is using a sub-page page size */
4005 		fourk = FALSE;
4006 	}
4007 	if (fourk) {
4008 		map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK);
4009 		map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK);
4010 	} else
4011 #endif /* __arm64__ */
4012 	{
4013 		map_addr = vm_map_trunc_page(*address,
4014 		    VM_MAP_PAGE_MASK(target_map));
4015 		map_size = vm_map_round_page(initial_size,
4016 		    VM_MAP_PAGE_MASK(target_map));
4017 	}
4018 	size = vm_object_round_page(initial_size);
4019 
4020 	/*
4021 	 * Find the vm object (if any) corresponding to this port.
4022 	 */
4023 	if (!IP_VALID(port)) {
4024 		object = VM_OBJECT_NULL;
4025 		offset = 0;
4026 		copy = FALSE;
4027 	} else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4028 		vm_named_entry_t        named_entry;
4029 		vm_object_offset_t      data_offset;
4030 
4031 		named_entry = mach_memory_entry_from_port(port);
4032 
4033 		if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4034 		    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4035 			data_offset = named_entry->data_offset;
4036 			offset += named_entry->data_offset;
4037 		} else {
4038 			data_offset = 0;
4039 		}
4040 
4041 		/* a few checks to make sure user is obeying rules */
4042 		if (size == 0) {
4043 			if (offset >= named_entry->size) {
4044 				return KERN_INVALID_RIGHT;
4045 			}
4046 			size = named_entry->size - offset;
4047 		}
4048 		if (mask_max_protection) {
4049 			max_protection &= named_entry->protection;
4050 		}
4051 		if (mask_cur_protection) {
4052 			cur_protection &= named_entry->protection;
4053 		}
4054 		if ((named_entry->protection & max_protection) !=
4055 		    max_protection) {
4056 			return KERN_INVALID_RIGHT;
4057 		}
4058 		if ((named_entry->protection & cur_protection) !=
4059 		    cur_protection) {
4060 			return KERN_INVALID_RIGHT;
4061 		}
4062 		if (offset + size < offset) {
4063 			/* overflow */
4064 			return KERN_INVALID_ARGUMENT;
4065 		}
4066 		if (named_entry->size < (offset + initial_size)) {
4067 			return KERN_INVALID_ARGUMENT;
4068 		}
4069 
4070 		if (named_entry->is_copy) {
4071 			/* for a vm_map_copy, we can only map it whole */
4072 			if ((size != named_entry->size) &&
4073 			    (vm_map_round_page(size,
4074 			    VM_MAP_PAGE_MASK(target_map)) ==
4075 			    named_entry->size)) {
4076 				/* XXX FBDP use the rounded size... */
4077 				size = vm_map_round_page(
4078 					size,
4079 					VM_MAP_PAGE_MASK(target_map));
4080 			}
4081 		}
4082 
4083 		/* the caller's "offset" parameter is relative to the start */
4084 		/* of the named entry: convert it to an offset in the object */
4085 		offset = offset + named_entry->offset;
4086 
4087 		if (!VM_MAP_PAGE_ALIGNED(size,
4088 		    VM_MAP_PAGE_MASK(target_map))) {
4089 			/*
4090 			 * Let's not map more than requested;
4091 			 * vm_map_enter() will handle this "not map-aligned"
4092 			 * case.
4093 			 */
4094 			map_size = size;
4095 		}
4096 
4097 		named_entry_lock(named_entry);
4098 		if (named_entry->is_sub_map) {
4099 			vm_map_t                submap;
4100 
4101 			if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4102 			    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4103 				panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4104 			}
4105 
4106 			submap = named_entry->backing.map;
4107 			vm_map_reference(submap);
4108 			named_entry_unlock(named_entry);
4109 
4110 			vmk_flags.vmkf_submap = TRUE;
4111 
4112 			result = vm_map_enter(target_map,
4113 			    &map_addr,
4114 			    map_size,
4115 			    mask,
4116 			    flags,
4117 			    vmk_flags,
4118 			    tag,
4119 			    (vm_object_t)(uintptr_t) submap,
4120 			    offset,
4121 			    copy,
4122 			    cur_protection,
4123 			    max_protection,
4124 			    inheritance);
4125 			if (result != KERN_SUCCESS) {
4126 				vm_map_deallocate(submap);
4127 			} else {
4128 				/*
4129 				 * No need to lock "submap" just to check its
4130 				 * "mapped" flag: that flag is never reset
4131 				 * once it's been set and if we race, we'll
4132 				 * just end up setting it twice, which is OK.
4133 				 */
4134 				if (submap->mapped_in_other_pmaps == FALSE &&
4135 				    vm_map_pmap(submap) != PMAP_NULL &&
4136 				    vm_map_pmap(submap) !=
4137 				    vm_map_pmap(target_map)) {
4138 					/*
4139 					 * This submap is being mapped in a map
4140 					 * that uses a different pmap.
4141 					 * Set its "mapped_in_other_pmaps" flag
4142 					 * to indicate that we now need to
4143 					 * remove mappings from all pmaps rather
4144 					 * than just the submap's pmap.
4145 					 */
4146 					vm_map_lock(submap);
4147 					submap->mapped_in_other_pmaps = TRUE;
4148 					vm_map_unlock(submap);
4149 				}
4150 				*address = map_addr;
4151 			}
4152 			return result;
4153 		} else if (named_entry->is_copy) {
4154 			kern_return_t   kr;
4155 			vm_map_copy_t   copy_map;
4156 			vm_map_entry_t  copy_entry;
4157 			vm_map_offset_t copy_addr;
4158 			vm_map_copy_t   target_copy_map;
4159 			vm_map_offset_t overmap_start, overmap_end;
4160 			vm_map_offset_t trimmed_start;
4161 			vm_map_size_t   target_size;
4162 
4163 			if (flags & ~(VM_FLAGS_FIXED |
4164 			    VM_FLAGS_ANYWHERE |
4165 			    VM_FLAGS_OVERWRITE |
4166 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4167 			    VM_FLAGS_RETURN_DATA_ADDR |
4168 			    VM_FLAGS_ALIAS_MASK)) {
4169 				named_entry_unlock(named_entry);
4170 				return KERN_INVALID_ARGUMENT;
4171 			}
4172 
4173 			copy_map = named_entry->backing.copy;
4174 			assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4175 			if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4176 				/* unsupported type; should not happen */
4177 				printf("vm_map_enter_mem_object: "
4178 				    "memory_entry->backing.copy "
4179 				    "unsupported type 0x%x\n",
4180 				    copy_map->type);
4181 				named_entry_unlock(named_entry);
4182 				return KERN_INVALID_ARGUMENT;
4183 			}
4184 
4185 			if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4186 				DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4187 			}
4188 
4189 			if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4190 			    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4191 				offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map);
4192 				if (flags & VM_FLAGS_RETURN_4K_DATA_ADDR) {
4193 					offset_in_mapping &= ~((signed)(0xFFF));
4194 				}
4195 			}
4196 
4197 			target_copy_map = VM_MAP_COPY_NULL;
4198 			target_size = copy_map->size;
4199 			overmap_start = 0;
4200 			overmap_end = 0;
4201 			trimmed_start = 0;
4202 			if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4203 				DEBUG4K_ADJUST("adjusting...\n");
4204 				kr = vm_map_copy_adjust_to_target(
4205 					copy_map,
4206 					offset /* includes data_offset */,
4207 					initial_size,
4208 					target_map,
4209 					copy,
4210 					&target_copy_map,
4211 					&overmap_start,
4212 					&overmap_end,
4213 					&trimmed_start);
4214 				if (kr != KERN_SUCCESS) {
4215 					named_entry_unlock(named_entry);
4216 					return kr;
4217 				}
4218 				target_size = target_copy_map->size;
4219 				if (trimmed_start >= data_offset) {
4220 					data_offset = offset & VM_MAP_PAGE_MASK(target_map);
4221 				} else {
4222 					data_offset -= trimmed_start;
4223 				}
4224 			} else {
4225 				/*
4226 				 * Assert that the vm_map_copy is coming from the right
4227 				 * zone and hasn't been forged
4228 				 */
4229 				vm_map_copy_require(copy_map);
4230 				target_copy_map = copy_map;
4231 			}
4232 
4233 			/* reserve a contiguous range */
4234 			kr = vm_map_enter(target_map,
4235 			    &map_addr,
4236 			    vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)),
4237 			    mask,
4238 			    flags & (VM_FLAGS_ANYWHERE |
4239 			    VM_FLAGS_OVERWRITE |
4240 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4241 			    VM_FLAGS_RETURN_DATA_ADDR),
4242 			    vmk_flags,
4243 			    tag,
4244 			    VM_OBJECT_NULL,
4245 			    0,
4246 			    FALSE,               /* copy */
4247 			    cur_protection,
4248 			    max_protection,
4249 			    inheritance);
4250 			if (kr != KERN_SUCCESS) {
4251 				DEBUG4K_ERROR("kr 0x%x\n", kr);
4252 				if (target_copy_map != copy_map) {
4253 					vm_map_copy_discard(target_copy_map);
4254 					target_copy_map = VM_MAP_COPY_NULL;
4255 				}
4256 				named_entry_unlock(named_entry);
4257 				return kr;
4258 			}
4259 
4260 			copy_addr = map_addr;
4261 
4262 			for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4263 			    copy_entry != vm_map_copy_to_entry(target_copy_map);
4264 			    copy_entry = copy_entry->vme_next) {
4265 				int                     remap_flags;
4266 				vm_map_kernel_flags_t   vmk_remap_flags;
4267 				vm_map_t                copy_submap = VM_MAP_NULL;
4268 				vm_object_t             copy_object = VM_OBJECT_NULL;
4269 				vm_map_size_t           copy_size;
4270 				vm_object_offset_t      copy_offset;
4271 				int                     copy_vm_alias;
4272 				boolean_t               do_copy;
4273 
4274 				do_copy = FALSE;
4275 				remap_flags = 0;
4276 				vmk_remap_flags = VM_MAP_KERNEL_FLAGS_NONE;
4277 
4278 				if (copy_entry->is_sub_map) {
4279 					copy_submap = VME_SUBMAP(copy_entry);
4280 					copy_object = (vm_object_t)copy_submap;
4281 				} else {
4282 					copy_object = VME_OBJECT(copy_entry);
4283 				}
4284 				copy_offset = VME_OFFSET(copy_entry);
4285 				copy_size = (copy_entry->vme_end -
4286 				    copy_entry->vme_start);
4287 				VM_GET_FLAGS_ALIAS(flags, copy_vm_alias);
4288 				if (copy_vm_alias == 0) {
4289 					/*
4290 					 * Caller does not want a specific
4291 					 * alias for this new mapping:  use
4292 					 * the alias of the original mapping.
4293 					 */
4294 					copy_vm_alias = VME_ALIAS(copy_entry);
4295 				}
4296 
4297 				/* sanity check */
4298 				if ((copy_addr + copy_size) >
4299 				    (map_addr +
4300 				    overmap_start + overmap_end +
4301 				    named_entry->size /* XXX full size */)) {
4302 					/* over-mapping too much !? */
4303 					kr = KERN_INVALID_ARGUMENT;
4304 					DEBUG4K_ERROR("kr 0x%x\n", kr);
4305 					/* abort */
4306 					break;
4307 				}
4308 
4309 				/* take a reference on the object */
4310 				if (copy_entry->is_sub_map) {
4311 					vmk_remap_flags.vmkf_submap = TRUE;
4312 					vm_map_reference(copy_submap);
4313 					copy_object = (vm_object_t)(uintptr_t) copy_submap;
4314 				} else {
4315 					if (!copy &&
4316 					    copy_object != VM_OBJECT_NULL &&
4317 					    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4318 						/*
4319 						 * We need to resolve our side of this
4320 						 * "symmetric" copy-on-write now; we
4321 						 * need a new object to map and share,
4322 						 * instead of the current one which
4323 						 * might still be shared with the
4324 						 * original mapping.
4325 						 *
4326 						 * Note: A "vm_map_copy_t" does not
4327 						 * have a lock but we're protected by
4328 						 * the named entry's lock here.
4329 						 */
4330 						// assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4331 						VME_OBJECT_SHADOW(copy_entry, copy_size);
4332 						assert(copy_object != VME_OBJECT(copy_entry));
4333 						if (!copy_entry->needs_copy &&
4334 						    copy_entry->protection & VM_PROT_WRITE) {
4335 							vm_prot_t prot;
4336 
4337 							prot = copy_entry->protection & ~VM_PROT_WRITE;
4338 							vm_object_pmap_protect(copy_object,
4339 							    copy_offset,
4340 							    copy_size,
4341 							    PMAP_NULL,
4342 							    PAGE_SIZE,
4343 							    0,
4344 							    prot);
4345 						}
4346 
4347 						copy_entry->needs_copy = FALSE;
4348 						copy_entry->is_shared = TRUE;
4349 						copy_object = VME_OBJECT(copy_entry);
4350 						copy_offset = VME_OFFSET(copy_entry);
4351 						vm_object_lock(copy_object);
4352 						/* we're about to make a shared mapping of this object */
4353 						copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4354 						copy_object->true_share = TRUE;
4355 						vm_object_unlock(copy_object);
4356 					}
4357 
4358 					if (copy_object != VM_OBJECT_NULL &&
4359 					    copy_object->named &&
4360 					    copy_object->pager != MEMORY_OBJECT_NULL &&
4361 					    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4362 						memory_object_t pager;
4363 						vm_prot_t       pager_prot;
4364 
4365 						/*
4366 						 * For "named" VM objects, let the pager know that the
4367 						 * memory object is being mapped.  Some pagers need to keep
4368 						 * track of this, to know when they can reclaim the memory
4369 						 * object, for example.
4370 						 * VM calls memory_object_map() for each mapping (specifying
4371 						 * the protection of each mapping) and calls
4372 						 * memory_object_last_unmap() when all the mappings are gone.
4373 						 */
4374 						pager_prot = max_protection;
4375 						if (copy) {
4376 							/*
4377 							 * Copy-On-Write mapping: won't modify the
4378 							 * memory object.
4379 							 */
4380 							pager_prot &= ~VM_PROT_WRITE;
4381 						}
4382 						vm_object_lock(copy_object);
4383 						pager = copy_object->pager;
4384 						if (copy_object->named &&
4385 						    pager != MEMORY_OBJECT_NULL &&
4386 						    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4387 							assert(copy_object->pager_ready);
4388 							vm_object_mapping_wait(copy_object, THREAD_UNINT);
4389 							vm_object_mapping_begin(copy_object);
4390 							vm_object_unlock(copy_object);
4391 
4392 							kr = memory_object_map(pager, pager_prot);
4393 							assert(kr == KERN_SUCCESS);
4394 
4395 							vm_object_lock(copy_object);
4396 							vm_object_mapping_end(copy_object);
4397 						}
4398 						vm_object_unlock(copy_object);
4399 					}
4400 
4401 					/*
4402 					 *	Perform the copy if requested
4403 					 */
4404 
4405 					if (copy && copy_object != VM_OBJECT_NULL) {
4406 						vm_object_t             new_object;
4407 						vm_object_offset_t      new_offset;
4408 
4409 						result = vm_object_copy_strategically(copy_object, copy_offset,
4410 						    copy_size,
4411 						    &new_object, &new_offset,
4412 						    &do_copy);
4413 
4414 
4415 						if (result == KERN_MEMORY_RESTART_COPY) {
4416 							boolean_t success;
4417 							boolean_t src_needs_copy;
4418 
4419 							/*
4420 							 * XXX
4421 							 * We currently ignore src_needs_copy.
4422 							 * This really is the issue of how to make
4423 							 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4424 							 * non-kernel users to use. Solution forthcoming.
4425 							 * In the meantime, since we don't allow non-kernel
4426 							 * memory managers to specify symmetric copy,
4427 							 * we won't run into problems here.
4428 							 */
4429 							new_object = copy_object;
4430 							new_offset = copy_offset;
4431 							success = vm_object_copy_quickly(new_object,
4432 							    new_offset,
4433 							    copy_size,
4434 							    &src_needs_copy,
4435 							    &do_copy);
4436 							assert(success);
4437 							result = KERN_SUCCESS;
4438 						}
4439 						if (result != KERN_SUCCESS) {
4440 							kr = result;
4441 							break;
4442 						}
4443 
4444 						copy_object = new_object;
4445 						copy_offset = new_offset;
4446 						/*
4447 						 * No extra object reference for the mapping:
4448 						 * the mapping should be the only thing keeping
4449 						 * this new object alive.
4450 						 */
4451 					} else {
4452 						/*
4453 						 * We already have the right object
4454 						 * to map.
4455 						 */
4456 						copy_object = VME_OBJECT(copy_entry);
4457 						/* take an extra ref for the mapping below */
4458 						vm_object_reference(copy_object);
4459 					}
4460 				}
4461 
4462 				/* over-map the object into destination */
4463 				remap_flags |= flags;
4464 				remap_flags |= VM_FLAGS_FIXED;
4465 				remap_flags |= VM_FLAGS_OVERWRITE;
4466 				remap_flags &= ~VM_FLAGS_ANYWHERE;
4467 				if (!copy && !copy_entry->is_sub_map) {
4468 					/*
4469 					 * copy-on-write should have been
4470 					 * resolved at this point, or we would
4471 					 * end up sharing instead of copying.
4472 					 */
4473 					assert(!copy_entry->needs_copy);
4474 				}
4475 #if XNU_TARGET_OS_OSX
4476 				if (copy_entry->used_for_jit) {
4477 					vmk_remap_flags.vmkf_map_jit = TRUE;
4478 				}
4479 #endif /* XNU_TARGET_OS_OSX */
4480 
4481 				assertf((copy_vm_alias & VME_ALIAS_MASK) == copy_vm_alias,
4482 				    "VM Tag truncated from 0x%x to 0x%x\n", copy_vm_alias, (copy_vm_alias & VME_ALIAS_MASK));
4483 				kr = vm_map_enter(target_map,
4484 				    &copy_addr,
4485 				    copy_size,
4486 				    (vm_map_offset_t) 0,
4487 				    remap_flags,
4488 				    vmk_remap_flags,
4489 				    (vm_tag_t) copy_vm_alias, /* see comment at end of vm_fault_unwire re. cast*/
4490 				    copy_object,
4491 				    copy_offset,
4492 				    ((copy_object == NULL)
4493 				    ? FALSE
4494 				    : (copy || copy_entry->needs_copy)),
4495 				    cur_protection,
4496 				    max_protection,
4497 				    inheritance);
4498 				if (kr != KERN_SUCCESS) {
4499 					DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4500 					if (copy_entry->is_sub_map) {
4501 						vm_map_deallocate(copy_submap);
4502 					} else {
4503 						vm_object_deallocate(copy_object);
4504 					}
4505 					/* abort */
4506 					break;
4507 				}
4508 
4509 				/* next mapping */
4510 				copy_addr += copy_size;
4511 			}
4512 
4513 			if (kr == KERN_SUCCESS) {
4514 				if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4515 				    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4516 					*address = map_addr + offset_in_mapping;
4517 				} else {
4518 					*address = map_addr;
4519 				}
4520 				if (overmap_start) {
4521 					*address += overmap_start;
4522 					DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address);
4523 				}
4524 			}
4525 			named_entry_unlock(named_entry);
4526 			if (target_copy_map != copy_map) {
4527 				vm_map_copy_discard(target_copy_map);
4528 				target_copy_map = VM_MAP_COPY_NULL;
4529 			}
4530 
4531 			if (kr != KERN_SUCCESS) {
4532 				if (!(flags & VM_FLAGS_OVERWRITE)) {
4533 					/* deallocate the contiguous range */
4534 					(void) vm_deallocate(target_map,
4535 					    map_addr,
4536 					    map_size);
4537 				}
4538 			}
4539 
4540 			return kr;
4541 		}
4542 
4543 		if (named_entry->is_object) {
4544 			unsigned int    access;
4545 			vm_prot_t       protections;
4546 			unsigned int    wimg_mode;
4547 
4548 			/* we are mapping a VM object */
4549 
4550 			protections = named_entry->protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
4551 			access = GET_MAP_MEM(named_entry->protection);
4552 
4553 			if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4554 			    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4555 				offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4556 				if (flags & VM_FLAGS_RETURN_4K_DATA_ADDR) {
4557 					offset_in_mapping &= ~((signed)(0xFFF));
4558 				}
4559 				offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4560 				map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map));
4561 			}
4562 
4563 			object = vm_named_entry_to_vm_object(named_entry);
4564 			assert(object != VM_OBJECT_NULL);
4565 			vm_object_lock(object);
4566 			named_entry_unlock(named_entry);
4567 
4568 			vm_object_reference_locked(object);
4569 
4570 			wimg_mode = object->wimg_bits;
4571 			vm_prot_to_wimg(access, &wimg_mode);
4572 			if (object->wimg_bits != wimg_mode) {
4573 				vm_object_change_wimg_mode(object, wimg_mode);
4574 			}
4575 
4576 			vm_object_unlock(object);
4577 		} else {
4578 			panic("invalid VM named entry %p", named_entry);
4579 		}
4580 	} else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4581 		/*
4582 		 * JMM - This is temporary until we unify named entries
4583 		 * and raw memory objects.
4584 		 *
4585 		 * Detected fake ip_kotype for a memory object.  In
4586 		 * this case, the port isn't really a port at all, but
4587 		 * instead is just a raw memory object.
4588 		 */
4589 		if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4590 		    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4591 			panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4592 		}
4593 
4594 		object = memory_object_to_vm_object((memory_object_t)port);
4595 		if (object == VM_OBJECT_NULL) {
4596 			return KERN_INVALID_OBJECT;
4597 		}
4598 		vm_object_reference(object);
4599 
4600 		/* wait for object (if any) to be ready */
4601 		if (object != VM_OBJECT_NULL) {
4602 			if (object == kernel_object) {
4603 				printf("Warning: Attempt to map kernel object"
4604 				    " by a non-private kernel entity\n");
4605 				return KERN_INVALID_OBJECT;
4606 			}
4607 			if (!object->pager_ready) {
4608 				vm_object_lock(object);
4609 
4610 				while (!object->pager_ready) {
4611 					vm_object_wait(object,
4612 					    VM_OBJECT_EVENT_PAGER_READY,
4613 					    THREAD_UNINT);
4614 					vm_object_lock(object);
4615 				}
4616 				vm_object_unlock(object);
4617 			}
4618 		}
4619 	} else {
4620 		return KERN_INVALID_OBJECT;
4621 	}
4622 
4623 	if (object != VM_OBJECT_NULL &&
4624 	    object->named &&
4625 	    object->pager != MEMORY_OBJECT_NULL &&
4626 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4627 		memory_object_t pager;
4628 		vm_prot_t       pager_prot;
4629 		kern_return_t   kr;
4630 
4631 		/*
4632 		 * For "named" VM objects, let the pager know that the
4633 		 * memory object is being mapped.  Some pagers need to keep
4634 		 * track of this, to know when they can reclaim the memory
4635 		 * object, for example.
4636 		 * VM calls memory_object_map() for each mapping (specifying
4637 		 * the protection of each mapping) and calls
4638 		 * memory_object_last_unmap() when all the mappings are gone.
4639 		 */
4640 		pager_prot = max_protection;
4641 		if (copy) {
4642 			/*
4643 			 * Copy-On-Write mapping: won't modify the
4644 			 * memory object.
4645 			 */
4646 			pager_prot &= ~VM_PROT_WRITE;
4647 		}
4648 		vm_object_lock(object);
4649 		pager = object->pager;
4650 		if (object->named &&
4651 		    pager != MEMORY_OBJECT_NULL &&
4652 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4653 			assert(object->pager_ready);
4654 			vm_object_mapping_wait(object, THREAD_UNINT);
4655 			vm_object_mapping_begin(object);
4656 			vm_object_unlock(object);
4657 
4658 			kr = memory_object_map(pager, pager_prot);
4659 			assert(kr == KERN_SUCCESS);
4660 
4661 			vm_object_lock(object);
4662 			vm_object_mapping_end(object);
4663 		}
4664 		vm_object_unlock(object);
4665 	}
4666 
4667 	/*
4668 	 *	Perform the copy if requested
4669 	 */
4670 
4671 	if (copy) {
4672 		vm_object_t             new_object;
4673 		vm_object_offset_t      new_offset;
4674 
4675 		result = vm_object_copy_strategically(object, offset,
4676 		    map_size,
4677 		    &new_object, &new_offset,
4678 		    &copy);
4679 
4680 
4681 		if (result == KERN_MEMORY_RESTART_COPY) {
4682 			boolean_t success;
4683 			boolean_t src_needs_copy;
4684 
4685 			/*
4686 			 * XXX
4687 			 * We currently ignore src_needs_copy.
4688 			 * This really is the issue of how to make
4689 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4690 			 * non-kernel users to use. Solution forthcoming.
4691 			 * In the meantime, since we don't allow non-kernel
4692 			 * memory managers to specify symmetric copy,
4693 			 * we won't run into problems here.
4694 			 */
4695 			new_object = object;
4696 			new_offset = offset;
4697 			success = vm_object_copy_quickly(new_object,
4698 			    new_offset,
4699 			    map_size,
4700 			    &src_needs_copy,
4701 			    &copy);
4702 			assert(success);
4703 			result = KERN_SUCCESS;
4704 		}
4705 		/*
4706 		 *	Throw away the reference to the
4707 		 *	original object, as it won't be mapped.
4708 		 */
4709 
4710 		vm_object_deallocate(object);
4711 
4712 		if (result != KERN_SUCCESS) {
4713 			return result;
4714 		}
4715 
4716 		object = new_object;
4717 		offset = new_offset;
4718 	}
4719 
4720 	/*
4721 	 * If non-kernel users want to try to prefault pages, the mapping and the
4722 	 * prefault need to be atomic.
4723 	 */
4724 	kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4725 	vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4726 
4727 #if __arm64__
4728 	if (fourk) {
4729 		/* map this object in a "4K" pager */
4730 		result = vm_map_enter_fourk(target_map,
4731 		    &map_addr,
4732 		    map_size,
4733 		    (vm_map_offset_t) mask,
4734 		    flags,
4735 		    vmk_flags,
4736 		    tag,
4737 		    object,
4738 		    offset,
4739 		    copy,
4740 		    cur_protection,
4741 		    max_protection,
4742 		    inheritance);
4743 	} else
4744 #endif /* __arm64__ */
4745 	{
4746 		result = vm_map_enter(target_map,
4747 		    &map_addr, map_size,
4748 		    (vm_map_offset_t)mask,
4749 		    flags,
4750 		    vmk_flags,
4751 		    tag,
4752 		    object, offset,
4753 		    copy,
4754 		    cur_protection, max_protection,
4755 		    inheritance);
4756 	}
4757 	if (result != KERN_SUCCESS) {
4758 		vm_object_deallocate(object);
4759 	}
4760 
4761 	/*
4762 	 * Try to prefault, and do not forget to release the vm map lock.
4763 	 */
4764 	if (result == KERN_SUCCESS && try_prefault) {
4765 		mach_vm_address_t va = map_addr;
4766 		kern_return_t kr = KERN_SUCCESS;
4767 		unsigned int i = 0;
4768 		int pmap_options;
4769 
4770 		pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4771 		if (object->internal) {
4772 			pmap_options |= PMAP_OPTIONS_INTERNAL;
4773 		}
4774 
4775 		for (i = 0; i < page_list_count; ++i) {
4776 			if (!UPL_VALID_PAGE(page_list, i)) {
4777 				if (kernel_prefault) {
4778 					assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4779 					result = KERN_MEMORY_ERROR;
4780 					break;
4781 				}
4782 			} else {
4783 				/*
4784 				 * If this call fails, we should stop trying
4785 				 * to optimize: the remaining calls are likely
4786 				 * to fail too.
4787 				 *
4788 				 * We do not report an error for such a failure,
4789 				 * though: prefaulting is an optimization, not
4790 				 * something critical.
4791 				 */
4792 				kr = pmap_enter_options(target_map->pmap,
4793 				    va, UPL_PHYS_PAGE(page_list, i),
4794 				    cur_protection, VM_PROT_NONE,
4795 				    0, TRUE, pmap_options, NULL);
4796 				if (kr != KERN_SUCCESS) {
4797 					OSIncrementAtomic64(&vm_prefault_nb_bailout);
4798 					if (kernel_prefault) {
4799 						result = kr;
4800 					}
4801 					break;
4802 				}
4803 				OSIncrementAtomic64(&vm_prefault_nb_pages);
4804 			}
4805 
4806 			/* Next virtual address */
4807 			va += PAGE_SIZE;
4808 		}
4809 		if (vmk_flags.vmkf_keep_map_locked) {
4810 			vm_map_unlock(target_map);
4811 		}
4812 	}
4813 
4814 	if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4815 	    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4816 		*address = map_addr + offset_in_mapping;
4817 	} else {
4818 		*address = map_addr;
4819 	}
4820 	return result;
4821 }
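/*
 * The prefault path in vm_map_enter_mem_object_helper() above is
 * best-effort for user maps.  In rough outline (eliding the counters,
 * PMAP_OPTIONS_INTERNAL and the kernel_prefault special case):
 *
 *	va = map_addr;
 *	for (i = 0; i < page_list_count; i++, va += PAGE_SIZE) {
 *		if (!UPL_VALID_PAGE(page_list, i))
 *			continue;
 *		kr = pmap_enter_options(target_map->pmap, va,
 *		    UPL_PHYS_PAGE(page_list, i), cur_protection,
 *		    VM_PROT_NONE, 0, TRUE, PMAP_OPTIONS_NOWAIT, NULL);
 *		if (kr != KERN_SUCCESS)
 *			break;	// optimization only: failure is not reported
 *	}
 *
 * For non-kernel callers the map is entered with vmkf_keep_map_locked so
 * that the mapping and the prefault appear atomic; the lock is dropped
 * once the loop finishes.
 */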
4822 
4823 kern_return_t
4824 vm_map_enter_mem_object(
4825 	vm_map_t                target_map,
4826 	vm_map_offset_t         *address,
4827 	vm_map_size_t           initial_size,
4828 	vm_map_offset_t         mask,
4829 	int                     flags,
4830 	vm_map_kernel_flags_t   vmk_flags,
4831 	vm_tag_t                tag,
4832 	ipc_port_t              port,
4833 	vm_object_offset_t      offset,
4834 	boolean_t               copy,
4835 	vm_prot_t               cur_protection,
4836 	vm_prot_t               max_protection,
4837 	vm_inherit_t            inheritance)
4838 {
4839 	kern_return_t ret;
4840 
4841 	ret = vm_map_enter_mem_object_helper(target_map,
4842 	    address,
4843 	    initial_size,
4844 	    mask,
4845 	    flags,
4846 	    vmk_flags,
4847 	    tag,
4848 	    port,
4849 	    offset,
4850 	    copy,
4851 	    cur_protection,
4852 	    max_protection,
4853 	    inheritance,
4854 	    NULL,
4855 	    0);
4856 
4857 #if KASAN
4858 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4859 		kasan_notify_address(*address, initial_size);
4860 	}
4861 #endif
4862 
4863 	return ret;
4864 }
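/*
 * A typical in-kernel use of vm_map_enter_mem_object() above maps a memory
 * entry port into a task's map and lets the VM pick the address.  The names
 * "task_map", "mem_entry_port" and "size" are placeholders; the call shape
 * is illustrative only:
 *
 *	vm_map_offset_t addr = 0;
 *	kr = vm_map_enter_mem_object(task_map, &addr, size,
 *	    0,                          // mask: no alignment constraint
 *	    VM_FLAGS_ANYWHERE,          // flags
 *	    VM_MAP_KERNEL_FLAGS_NONE,   // vmk_flags
 *	    VM_KERN_MEMORY_NONE,        // tag
 *	    mem_entry_port,             // named entry (ipc_port_t)
 *	    0,                          // offset
 *	    FALSE,                      // copy
 *	    VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_INHERIT_DEFAULT);
 *
 * On success the chosen address is returned through "addr".
 */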
4865 
4866 kern_return_t
4867 vm_map_enter_mem_object_prefault(
4868 	vm_map_t                target_map,
4869 	vm_map_offset_t         *address,
4870 	vm_map_size_t           initial_size,
4871 	vm_map_offset_t         mask,
4872 	int                     flags,
4873 	vm_map_kernel_flags_t   vmk_flags,
4874 	vm_tag_t                tag,
4875 	ipc_port_t              port,
4876 	vm_object_offset_t      offset,
4877 	vm_prot_t               cur_protection,
4878 	vm_prot_t               max_protection,
4879 	upl_page_list_ptr_t     page_list,
4880 	unsigned int            page_list_count)
4881 {
4882 	kern_return_t ret;
4883 
4884 	ret = vm_map_enter_mem_object_helper(target_map,
4885 	    address,
4886 	    initial_size,
4887 	    mask,
4888 	    flags,
4889 	    vmk_flags,
4890 	    tag,
4891 	    port,
4892 	    offset,
4893 	    FALSE,
4894 	    cur_protection,
4895 	    max_protection,
4896 	    VM_INHERIT_DEFAULT,
4897 	    page_list,
4898 	    page_list_count);
4899 
4900 #if KASAN
4901 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4902 		kasan_notify_address(*address, initial_size);
4903 	}
4904 #endif
4905 
4906 	return ret;
4907 }
4908 
4909 
4910 kern_return_t
4911 vm_map_enter_mem_object_control(
4912 	vm_map_t                target_map,
4913 	vm_map_offset_t         *address,
4914 	vm_map_size_t           initial_size,
4915 	vm_map_offset_t         mask,
4916 	int                     flags,
4917 	vm_map_kernel_flags_t   vmk_flags,
4918 	vm_tag_t                tag,
4919 	memory_object_control_t control,
4920 	vm_object_offset_t      offset,
4921 	boolean_t               copy,
4922 	vm_prot_t               cur_protection,
4923 	vm_prot_t               max_protection,
4924 	vm_inherit_t            inheritance)
4925 {
4926 	vm_map_address_t        map_addr;
4927 	vm_map_size_t           map_size;
4928 	vm_object_t             object;
4929 	vm_object_size_t        size;
4930 	kern_return_t           result;
4931 	memory_object_t         pager;
4932 	vm_prot_t               pager_prot;
4933 	kern_return_t           kr;
4934 #if __arm64__
4935 	boolean_t               fourk = vmk_flags.vmkf_fourk;
4936 #endif /* __arm64__ */
4937 
4938 	/*
4939 	 * Check arguments for validity
4940 	 */
4941 	if ((target_map == VM_MAP_NULL) ||
4942 	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4943 	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4944 	    (inheritance > VM_INHERIT_LAST_VALID) ||
4945 	    initial_size == 0) {
4946 		return KERN_INVALID_ARGUMENT;
4947 	}
4948 
4949 #if __arm64__
4950 	if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) {
4951 		fourk = FALSE;
4952 	}
4953 
4954 	if (fourk) {
4955 		map_addr = vm_map_trunc_page(*address,
4956 		    FOURK_PAGE_MASK);
4957 		map_size = vm_map_round_page(initial_size,
4958 		    FOURK_PAGE_MASK);
4959 	} else
4960 #endif /* __arm64__ */
4961 	{
4962 		map_addr = vm_map_trunc_page(*address,
4963 		    VM_MAP_PAGE_MASK(target_map));
4964 		map_size = vm_map_round_page(initial_size,
4965 		    VM_MAP_PAGE_MASK(target_map));
4966 	}
4967 	size = vm_object_round_page(initial_size);
4968 
4969 	object = memory_object_control_to_vm_object(control);
4970 
4971 	if (object == VM_OBJECT_NULL) {
4972 		return KERN_INVALID_OBJECT;
4973 	}
4974 
4975 	if (object == kernel_object) {
4976 		printf("Warning: Attempt to map kernel object"
4977 		    " by a non-private kernel entity\n");
4978 		return KERN_INVALID_OBJECT;
4979 	}
4980 
4981 	vm_object_lock(object);
4982 	object->ref_count++;
4983 
4984 	/*
4985 	 * For "named" VM objects, let the pager know that the
4986 	 * memory object is being mapped.  Some pagers need to keep
4987 	 * track of this, to know when they can reclaim the memory
4988 	 * object, for example.
4989 	 * VM calls memory_object_map() for each mapping (specifying
4990 	 * the protection of each mapping) and calls
4991 	 * memory_object_last_unmap() when all the mappings are gone.
4992 	 */
4993 	pager_prot = max_protection;
4994 	if (copy) {
4995 		pager_prot &= ~VM_PROT_WRITE;
4996 	}
4997 	pager = object->pager;
4998 	if (object->named &&
4999 	    pager != MEMORY_OBJECT_NULL &&
5000 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5001 		assert(object->pager_ready);
5002 		vm_object_mapping_wait(object, THREAD_UNINT);
5003 		vm_object_mapping_begin(object);
5004 		vm_object_unlock(object);
5005 
5006 		kr = memory_object_map(pager, pager_prot);
5007 		assert(kr == KERN_SUCCESS);
5008 
5009 		vm_object_lock(object);
5010 		vm_object_mapping_end(object);
5011 	}
5012 	vm_object_unlock(object);
5013 
5014 	/*
5015 	 *	Perform the copy if requested
5016 	 */
5017 
5018 	if (copy) {
5019 		vm_object_t             new_object;
5020 		vm_object_offset_t      new_offset;
5021 
5022 		result = vm_object_copy_strategically(object, offset, size,
5023 		    &new_object, &new_offset,
5024 		    &copy);
5025 
5026 
5027 		if (result == KERN_MEMORY_RESTART_COPY) {
5028 			boolean_t success;
5029 			boolean_t src_needs_copy;
5030 
5031 			/*
5032 			 * XXX
5033 			 * We currently ignore src_needs_copy.
5034 			 * This really is the issue of how to make
5035 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5036 			 * non-kernel users to use. Solution forthcoming.
5037 			 * In the meantime, since we don't allow non-kernel
5038 			 * memory managers to specify symmetric copy,
5039 			 * we won't run into problems here.
5040 			 */
5041 			new_object = object;
5042 			new_offset = offset;
5043 			success = vm_object_copy_quickly(new_object,
5044 			    new_offset, size,
5045 			    &src_needs_copy,
5046 			    &copy);
5047 			assert(success);
5048 			result = KERN_SUCCESS;
5049 		}
5050 		/*
5051 		 *	Throw away the reference to the
5052 		 *	original object, as it won't be mapped.
5053 		 */
5054 
5055 		vm_object_deallocate(object);
5056 
5057 		if (result != KERN_SUCCESS) {
5058 			return result;
5059 		}
5060 
5061 		object = new_object;
5062 		offset = new_offset;
5063 	}
5064 
5065 #if __arm64__
5066 	if (fourk) {
5067 		result = vm_map_enter_fourk(target_map,
5068 		    &map_addr,
5069 		    map_size,
5070 		    (vm_map_offset_t)mask,
5071 		    flags,
5072 		    vmk_flags,
5073 		    tag,
5074 		    object, offset,
5075 		    copy,
5076 		    cur_protection, max_protection,
5077 		    inheritance);
5078 	} else
5079 #endif /* __arm64__ */
5080 	{
5081 		result = vm_map_enter(target_map,
5082 		    &map_addr, map_size,
5083 		    (vm_map_offset_t)mask,
5084 		    flags,
5085 		    vmk_flags,
5086 		    tag,
5087 		    object, offset,
5088 		    copy,
5089 		    cur_protection, max_protection,
5090 		    inheritance);
5091 	}
5092 	if (result != KERN_SUCCESS) {
5093 		vm_object_deallocate(object);
5094 	}
5095 	*address = map_addr;
5096 
5097 	return result;
5098 }
5099 
5100 
5101 #if     VM_CPM
5102 
5103 #ifdef MACH_ASSERT
5104 extern pmap_paddr_t     avail_start, avail_end;
5105 #endif
5106 
5107 /*
5108  *	Allocate memory in the specified map, with the caveat that
5109  *	the memory is physically contiguous.  This call may fail
5110  *	if the system can't find sufficient contiguous memory.
5111  *	This call may cause or lead to heart-stopping amounts of
5112  *	paging activity.
5113  *
5114  *	Memory obtained from this call should be freed in the
5115  *	normal way, viz., via vm_deallocate.
5116  */
5117 kern_return_t
5118 vm_map_enter_cpm(
5119 	vm_map_t                map,
5120 	vm_map_offset_t *addr,
5121 	vm_map_size_t           size,
5122 	int                     flags)
5123 {
5124 	vm_object_t             cpm_obj;
5125 	pmap_t                  pmap;
5126 	vm_page_t               m, pages;
5127 	kern_return_t           kr;
5128 	vm_map_offset_t         va, start, end, offset;
5129 #if     MACH_ASSERT
5130 	vm_map_offset_t         prev_addr = 0;
5131 #endif  /* MACH_ASSERT */
5132 
5133 	boolean_t               anywhere = ((VM_FLAGS_ANYWHERE & flags) != 0);
5134 	vm_tag_t tag;
5135 
5136 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
5137 		/* XXX TODO4K do we need to support this? */
5138 		*addr = 0;
5139 		return KERN_NOT_SUPPORTED;
5140 	}
5141 
5142 	VM_GET_FLAGS_ALIAS(flags, tag);
5143 
5144 	if (size == 0) {
5145 		*addr = 0;
5146 		return KERN_SUCCESS;
5147 	}
5148 	if (anywhere) {
5149 		*addr = vm_map_min(map);
5150 	} else {
5151 		*addr = vm_map_trunc_page(*addr,
5152 		    VM_MAP_PAGE_MASK(map));
5153 	}
5154 	size = vm_map_round_page(size,
5155 	    VM_MAP_PAGE_MASK(map));
5156 
5157 	/*
5158 	 * LP64todo - cpm_allocate should probably allow
5159 	 * allocations of >4GB, but not with the current
5160 	 * algorithm, so just cast down the size for now.
5161 	 */
5162 	if (size > VM_MAX_ADDRESS) {
5163 		return KERN_RESOURCE_SHORTAGE;
5164 	}
5165 	if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size),
5166 	    &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) {
5167 		return kr;
5168 	}
5169 
5170 	cpm_obj = vm_object_allocate((vm_object_size_t)size);
5171 	assert(cpm_obj != VM_OBJECT_NULL);
5172 	assert(cpm_obj->internal);
5173 	assert(cpm_obj->vo_size == (vm_object_size_t)size);
5174 	assert(cpm_obj->can_persist == FALSE);
5175 	assert(cpm_obj->pager_created == FALSE);
5176 	assert(cpm_obj->pageout == FALSE);
5177 	assert(cpm_obj->shadow == VM_OBJECT_NULL);
5178 
5179 	/*
5180 	 *	Insert pages into object.
5181 	 */
5182 
5183 	vm_object_lock(cpm_obj);
5184 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
5185 		m = pages;
5186 		pages = NEXT_PAGE(m);
5187 		*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
5188 
5189 		assert(!m->vmp_gobbled);
5190 		assert(!m->vmp_wanted);
5191 		assert(!m->vmp_pageout);
5192 		assert(!m->vmp_tabled);
5193 		assert(VM_PAGE_WIRED(m));
5194 		assert(m->vmp_busy);
5195 		assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT));
5196 
5197 		m->vmp_busy = FALSE;
5198 		vm_page_insert(m, cpm_obj, offset);
5199 	}
5200 	assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
5201 	vm_object_unlock(cpm_obj);
5202 
5203 	/*
5204 	 *	Hang onto a reference on the object in case a
5205 	 *	multi-threaded application for some reason decides
5206 	 *	to deallocate the portion of the address space into
5207 	 *	which we will insert this object.
5208 	 *
5209 	 *	Unfortunately, we must insert the object now before
5210 	 *	we can talk to the pmap module about which addresses
5211 	 *	must be wired down.  Hence, the race with a multi-
5212 	 *	threaded app.
5213 	 */
5214 	vm_object_reference(cpm_obj);
5215 
5216 	/*
5217 	 *	Insert object into map.
5218 	 */
5219 
5220 	kr = vm_map_enter(
5221 		map,
5222 		addr,
5223 		size,
5224 		(vm_map_offset_t)0,
5225 		flags,
5226 		VM_MAP_KERNEL_FLAGS_NONE,
5227 		cpm_obj,
5228 		(vm_object_offset_t)0,
5229 		FALSE,
5230 		VM_PROT_ALL,
5231 		VM_PROT_ALL,
5232 		VM_INHERIT_DEFAULT);
5233 
5234 	if (kr != KERN_SUCCESS) {
5235 		/*
5236 		 *	A CPM object doesn't have can_persist set,
5237 		 *	so all we have to do is deallocate it to
5238 		 *	free up these pages.
5239 		 */
5240 		assert(cpm_obj->pager_created == FALSE);
5241 		assert(cpm_obj->can_persist == FALSE);
5242 		assert(cpm_obj->pageout == FALSE);
5243 		assert(cpm_obj->shadow == VM_OBJECT_NULL);
5244 		vm_object_deallocate(cpm_obj); /* kill acquired ref */
5245 		vm_object_deallocate(cpm_obj); /* kill creation ref */
5246 	}
5247 
5248 	/*
5249 	 *	Inform the physical mapping system that the
5250 	 *	range of addresses may not fault, so that
5251 	 *	page tables and such can be locked down as well.
5252 	 */
5253 	start = *addr;
5254 	end = start + size;
5255 	pmap = vm_map_pmap(map);
5256 	pmap_pageable(pmap, start, end, FALSE);
5257 
5258 	/*
5259 	 *	Enter each page into the pmap, to avoid faults.
5260 	 *	Note that this loop could be coded more efficiently,
5261 	 *	if the need arose, rather than looking up each page
5262 	 *	again.
5263 	 */
5264 	for (offset = 0, va = start; offset < size;
5265 	    va += PAGE_SIZE, offset += PAGE_SIZE) {
5266 		int type_of_fault;
5267 
5268 		vm_object_lock(cpm_obj);
5269 		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5270 		assert(m != VM_PAGE_NULL);
5271 
5272 		vm_page_zero_fill(m);
5273 
5274 		type_of_fault = DBG_ZERO_FILL_FAULT;
5275 
5276 		vm_fault_enter(m, pmap, va,
5277 		    PAGE_SIZE, 0,
5278 		    VM_PROT_ALL, VM_PROT_WRITE,
5279 		    VM_PAGE_WIRED(m),
5280 		    FALSE,                             /* change_wiring */
5281 		    VM_KERN_MEMORY_NONE,                             /* tag - not wiring */
5282 		    FALSE,                             /* no_cache */
5283 		    FALSE,                             /* cs_bypass */
5284 		    0,                                 /* user_tag */
5285 		    0,                             /* pmap_options */
5286 		    NULL,                              /* need_retry */
5287 		    &type_of_fault);
5288 
5289 		vm_object_unlock(cpm_obj);
5290 	}
5291 
5292 #if     MACH_ASSERT
5293 	/*
5294 	 *	Verify ordering in address space.
5295 	 */
5296 	for (offset = 0; offset < size; offset += PAGE_SIZE) {
5297 		vm_object_lock(cpm_obj);
5298 		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5299 		vm_object_unlock(cpm_obj);
5300 		if (m == VM_PAGE_NULL) {
5301 			panic("vm_allocate_cpm:  obj %p off 0x%llx no page",
5302 			    cpm_obj, (uint64_t)offset);
5303 		}
5304 		assert(m->vmp_tabled);
5305 		assert(!m->vmp_busy);
5306 		assert(!m->vmp_wanted);
5307 		assert(!m->vmp_fictitious);
5308 		assert(!m->vmp_private);
5309 		assert(!m->vmp_absent);
5310 		assert(!m->vmp_error);
5311 		assert(!m->vmp_cleaning);
5312 		assert(!m->vmp_laundry);
5313 		assert(!m->vmp_precious);
5314 		assert(!m->vmp_clustered);
5315 		if (offset != 0) {
5316 			if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
5317 				printf("start 0x%llx end 0x%llx va 0x%llx\n",
5318 				    (uint64_t)start, (uint64_t)end, (uint64_t)va);
5319 				printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset);
5320 				printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr);
5321 				panic("vm_allocate_cpm:  pages not contig!");
5322 			}
5323 		}
5324 		prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
5325 	}
5326 #endif  /* MACH_ASSERT */
5327 
5328 	vm_object_deallocate(cpm_obj); /* kill extra ref */
5329 
5330 	return kr;
5331 }
5332 
5333 
5334 #else   /* VM_CPM */
5335 
5336 /*
5337  *	Interface is defined in all cases, but unless the kernel
5338  *	is built explicitly for this option, the interface does
5339  *	nothing.
5340  */
5341 
5342 kern_return_t
5343 vm_map_enter_cpm(
5344 	__unused vm_map_t       map,
5345 	__unused vm_map_offset_t        *addr,
5346 	__unused vm_map_size_t  size,
5347 	__unused int            flags)
5348 {
5349 	return KERN_FAILURE;
5350 }
5351 #endif /* VM_CPM */
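/*
 *	Hypothetical usage sketch for vm_map_enter_cpm() (only meaningful on
 *	kernels built with VM_CPM; the caller's variable names below are
 *	illustrative):
 *
 *	vm_map_offset_t addr = 0;
 *	kern_return_t kr = vm_map_enter_cpm(kernel_map, &addr,
 *	    (vm_map_size_t)(4 * PAGE_SIZE), VM_FLAGS_ANYWHERE);
 *	if (kr == KERN_SUCCESS) {
 *	        ... use the physically contiguous range at addr ...
 *	        (void) vm_deallocate(kernel_map, addr, 4 * PAGE_SIZE);
 *	}
 */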
5352 
5353 /* Not used without nested pmaps */
5354 #ifndef NO_NESTED_PMAP
5355 /*
5356  * Clip and unnest a portion of a nested submap mapping.
5357  */
5358 
5359 
5360 static void
5361 vm_map_clip_unnest(
5362 	vm_map_t        map,
5363 	vm_map_entry_t  entry,
5364 	vm_map_offset_t start_unnest,
5365 	vm_map_offset_t end_unnest)
5366 {
5367 	vm_map_offset_t old_start_unnest = start_unnest;
5368 	vm_map_offset_t old_end_unnest = end_unnest;
5369 
5370 	assert(entry->is_sub_map);
5371 	assert(VME_SUBMAP(entry) != NULL);
5372 	assert(entry->use_pmap);
5373 
5374 	/*
5375 	 * Query the platform for the optimal unnest range.
5376 	 * DRK: There's some duplication of effort here, since
5377 	 * callers may have adjusted the range to some extent. This
5378 	 * routine was introduced to support 1GiB subtree nesting
5379 	 * for x86 platforms, which can also nest on 2MiB boundaries
5380 	 * depending on size/alignment.
5381 	 */
5382 	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
5383 		assert(VME_SUBMAP(entry)->is_nested_map);
5384 		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
5385 		log_unnest_badness(map,
5386 		    old_start_unnest,
5387 		    old_end_unnest,
5388 		    VME_SUBMAP(entry)->is_nested_map,
5389 		    (entry->vme_start +
5390 		    VME_SUBMAP(entry)->lowest_unnestable_start -
5391 		    VME_OFFSET(entry)));
5392 	}
5393 
5394 	if (entry->vme_start > start_unnest ||
5395 	    entry->vme_end < end_unnest) {
5396 		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
5397 		    "bad nested entry: start=0x%llx end=0x%llx\n",
5398 		    (long long)start_unnest, (long long)end_unnest,
5399 		    (long long)entry->vme_start, (long long)entry->vme_end);
5400 	}
5401 
5402 	if (start_unnest > entry->vme_start) {
5403 		_vm_map_clip_start(&map->hdr,
5404 		    entry,
5405 		    start_unnest);
5406 		if (map->holelistenabled) {
5407 			vm_map_store_update_first_free(map, NULL, FALSE);
5408 		} else {
5409 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5410 		}
5411 	}
5412 	if (entry->vme_end > end_unnest) {
5413 		_vm_map_clip_end(&map->hdr,
5414 		    entry,
5415 		    end_unnest);
5416 		if (map->holelistenabled) {
5417 			vm_map_store_update_first_free(map, NULL, FALSE);
5418 		} else {
5419 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5420 		}
5421 	}
5422 
5423 	pmap_unnest(map->pmap,
5424 	    entry->vme_start,
5425 	    entry->vme_end - entry->vme_start);
5426 	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
5427 		/* clean up parent map/maps */
5428 		vm_map_submap_pmap_clean(
5429 			map, entry->vme_start,
5430 			entry->vme_end,
5431 			VME_SUBMAP(entry),
5432 			VME_OFFSET(entry));
5433 	}
5434 	entry->use_pmap = FALSE;
5435 	if ((map->pmap != kernel_pmap) &&
5436 	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
5437 		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
5438 	}
5439 }
5440 #endif  /* NO_NESTED_PMAP */
5441 
5442 __abortlike
5443 static void
5444 __vm_map_clip_atomic_entry_panic(
5445 	vm_map_t        map,
5446 	vm_map_entry_t  entry,
5447 	vm_map_offset_t where)
5448 {
5449 	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5450 	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5451 	    (uint64_t)entry->vme_start,
5452 	    (uint64_t)entry->vme_end,
5453 	    (uint64_t)where);
5454 }
5455 
5456 /*
5457  *	vm_map_clip_start:	[ internal use only ]
5458  *
5459  *	Asserts that the given entry begins at or after
5460  *	the specified address; if necessary,
5461  *	it splits the entry into two.
5462  */
5463 void
5464 vm_map_clip_start(
5465 	vm_map_t        map,
5466 	vm_map_entry_t  entry,
5467 	vm_map_offset_t startaddr)
5468 {
5469 #ifndef NO_NESTED_PMAP
5470 	if (entry->is_sub_map &&
5471 	    entry->use_pmap &&
5472 	    startaddr >= entry->vme_start) {
5473 		vm_map_offset_t start_unnest, end_unnest;
5474 
5475 		/*
5476 		 * Make sure "startaddr" is no longer in a nested range
5477 		 * before we clip.  Unnest only the minimum range the platform
5478 		 * can handle.
5479 		 * vm_map_clip_unnest may perform additional adjustments to
5480 		 * the unnest range.
5481 		 */
5482 		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5483 		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5484 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5485 	}
5486 #endif /* NO_NESTED_PMAP */
5487 	if (startaddr > entry->vme_start) {
5488 		if (!entry->is_sub_map &&
5489 		    VME_OBJECT(entry) &&
5490 		    VME_OBJECT(entry)->phys_contiguous) {
5491 			pmap_remove(map->pmap,
5492 			    (addr64_t)(entry->vme_start),
5493 			    (addr64_t)(entry->vme_end));
5494 		}
5495 		if (entry->vme_atomic) {
5496 			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5497 		}
5498 
5499 		DTRACE_VM5(
5500 			vm_map_clip_start,
5501 			vm_map_t, map,
5502 			vm_map_offset_t, entry->vme_start,
5503 			vm_map_offset_t, entry->vme_end,
5504 			vm_map_offset_t, startaddr,
5505 			int, VME_ALIAS(entry));
5506 
5507 		_vm_map_clip_start(&map->hdr, entry, startaddr);
5508 		if (map->holelistenabled) {
5509 			vm_map_store_update_first_free(map, NULL, FALSE);
5510 		} else {
5511 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5512 		}
5513 	}
5514 }
5515 
5516 
5517 #define vm_map_copy_clip_start(copy, entry, startaddr) \
5518 	MACRO_BEGIN \
5519 	if ((startaddr) > (entry)->vme_start) \
5520 	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5521 	MACRO_END
5522 
5523 /*
5524  *	This routine is called only when it is known that
5525  *	the entry must be split.
5526  */
5527 static void
5528 _vm_map_clip_start(
5529 	struct vm_map_header    *map_header,
5530 	vm_map_entry_t          entry,
5531 	vm_map_offset_t         start)
5532 {
5533 	vm_map_entry_t  new_entry;
5534 
5535 	/*
5536 	 *	Split off the front portion --
5537 	 *	note that we must insert the new
5538 	 *	entry BEFORE this one, so that
5539 	 *	this entry has the specified starting
5540 	 *	address.
5541 	 */
5542 
5543 	if (entry->map_aligned) {
5544 		assert(VM_MAP_PAGE_ALIGNED(start,
5545 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5546 	}
5547 
5548 	new_entry = _vm_map_entry_create(map_header);
5549 	vm_map_entry_copy_full(new_entry, entry);
5550 
5551 	new_entry->vme_end = start;
5552 	assert(new_entry->vme_start < new_entry->vme_end);
5553 	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5554 	assert(start < entry->vme_end);
5555 	entry->vme_start = start;
5556 
5557 	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5558 
5559 	if (entry->is_sub_map) {
5560 		vm_map_reference(VME_SUBMAP(new_entry));
5561 	} else {
5562 		vm_object_reference(VME_OBJECT(new_entry));
5563 	}
5564 }
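/*
 *	Example: clipping an entry covering [0x1000, 0x5000) at start 0x3000
 *	links a new entry [0x1000, 0x3000) in front of it and shrinks the
 *	original entry to [0x3000, 0x5000), advancing its VME_OFFSET by
 *	0x2000 so the object offset still corresponds to vme_start.
 */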
5565 
5566 
5567 /*
5568  *	vm_map_clip_end:	[ internal use only ]
5569  *
5570  *	Asserts that the given entry ends at or before
5571  *	the specified address; if necessary,
5572  *	it splits the entry into two.
5573  */
5574 void
5575 vm_map_clip_end(
5576 	vm_map_t        map,
5577 	vm_map_entry_t  entry,
5578 	vm_map_offset_t endaddr)
5579 {
5580 	if (endaddr > entry->vme_end) {
5581 		/*
5582 		 * Within the scope of this clipping, limit "endaddr" to
5583 		 * the end of this map entry...
5584 		 */
5585 		endaddr = entry->vme_end;
5586 	}
5587 #ifndef NO_NESTED_PMAP
5588 	if (entry->is_sub_map && entry->use_pmap) {
5589 		vm_map_offset_t start_unnest, end_unnest;
5590 
5591 		/*
5592 		 * Make sure the range between the start of this entry and
5593 		 * the new "endaddr" is no longer nested before we clip.
5594 		 * Unnest only the minimum range the platform can handle.
5595 		 * vm_map_clip_unnest may perform additional adjustments to
5596 		 * the unnest range.
5597 		 */
5598 		start_unnest = entry->vme_start;
5599 		end_unnest =
5600 		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5601 		    ~(pmap_shared_region_size_min(map->pmap) - 1);
5602 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5603 	}
5604 #endif /* NO_NESTED_PMAP */
5605 	if (endaddr < entry->vme_end) {
5606 		if (!entry->is_sub_map &&
5607 		    VME_OBJECT(entry) &&
5608 		    VME_OBJECT(entry)->phys_contiguous) {
5609 			pmap_remove(map->pmap,
5610 			    (addr64_t)(entry->vme_start),
5611 			    (addr64_t)(entry->vme_end));
5612 		}
5613 		if (entry->vme_atomic) {
5614 			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5615 		}
5616 		DTRACE_VM5(
5617 			vm_map_clip_end,
5618 			vm_map_t, map,
5619 			vm_map_offset_t, entry->vme_start,
5620 			vm_map_offset_t, entry->vme_end,
5621 			vm_map_offset_t, endaddr,
5622 			int, VME_ALIAS(entry));
5623 
5624 		_vm_map_clip_end(&map->hdr, entry, endaddr);
5625 		if (map->holelistenabled) {
5626 			vm_map_store_update_first_free(map, NULL, FALSE);
5627 		} else {
5628 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5629 		}
5630 	}
5631 }
5632 
5633 
5634 #define vm_map_copy_clip_end(copy, entry, endaddr) \
5635 	MACRO_BEGIN \
5636 	if ((endaddr) < (entry)->vme_end) \
5637 	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5638 	MACRO_END
5639 
5640 /*
5641  *	This routine is called only when it is known that
5642  *	the entry must be split.
5643  */
5644 static void
5645 _vm_map_clip_end(
5646 	struct vm_map_header    *map_header,
5647 	vm_map_entry_t          entry,
5648 	vm_map_offset_t         end)
5649 {
5650 	vm_map_entry_t  new_entry;
5651 
5652 	/*
5653 	 *	Create a new entry and insert it
5654 	 *	AFTER the specified entry
5655 	 */
5656 
5657 	if (entry->map_aligned) {
5658 		assert(VM_MAP_PAGE_ALIGNED(end,
5659 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5660 	}
5661 
5662 	new_entry = _vm_map_entry_create(map_header);
5663 	vm_map_entry_copy_full(new_entry, entry);
5664 
5665 	assert(entry->vme_start < end);
5666 	new_entry->vme_start = entry->vme_end = end;
5667 	VME_OFFSET_SET(new_entry,
5668 	    VME_OFFSET(new_entry) + (end - entry->vme_start));
5669 	assert(new_entry->vme_start < new_entry->vme_end);
5670 
5671 	_vm_map_store_entry_link(map_header, entry, new_entry);
5672 
5673 	if (entry->is_sub_map) {
5674 		vm_map_reference(VME_SUBMAP(new_entry));
5675 	} else {
5676 		vm_object_reference(VME_OBJECT(new_entry));
5677 	}
5678 }
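/*
 *	Example: clipping an entry covering [0x1000, 0x5000) at end 0x3000
 *	shrinks the original entry to [0x1000, 0x3000) and links a new entry
 *	[0x3000, 0x5000) after it, with the new entry's VME_OFFSET advanced
 *	by 0x2000 to keep its object offset consistent.
 */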
5679 
5680 
5681 /*
5682  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
5683  *
5684  *	Asserts that the starting and ending region
5685  *	addresses fall within the valid range of the map.
5686  */
5687 #define VM_MAP_RANGE_CHECK(map, start, end)     \
5688 	MACRO_BEGIN                             \
5689 	if (start < vm_map_min(map))            \
5690 	        start = vm_map_min(map);        \
5691 	if (end > vm_map_max(map))              \
5692 	        end = vm_map_max(map);          \
5693 	if (start > end)                        \
5694 	        start = end;                    \
5695 	MACRO_END
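/*
 *	Example: for a map spanning [vm_map_min, vm_map_max) = [0x1000, 0xF000),
 *	VM_MAP_RANGE_CHECK clamps a request for [0x0, 0x10000) to
 *	[0x1000, 0xF000); a request that lies entirely outside the map
 *	collapses to an empty range (start == end).
 */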
5696 
5697 /*
5698  *	vm_map_range_check:	[ internal use only ]
5699  *
5700  *	Check that the region defined by the specified start and
5701  *	end addresses is wholly contained within a single map
5702  *	entry or set of adjacent map entries of the specified map,
5703  *	i.e. the specified region contains no unmapped space.
5704  *	If any or all of the region is unmapped, FALSE is returned.
5705  *	Otherwise, TRUE is returned and if the output argument 'entry'
5706  *	is not NULL it points to the map entry containing the start
5707  *	of the region.
5708  *
5709  *	The map is locked for reading on entry and is left locked.
5710  */
5711 static boolean_t
5712 vm_map_range_check(
5713 	vm_map_t                map,
5714 	vm_map_offset_t         start,
5715 	vm_map_offset_t         end,
5716 	vm_map_entry_t          *entry)
5717 {
5718 	vm_map_entry_t          cur;
5719 	vm_map_offset_t         prev;
5720 
5721 	/*
5722 	 *      Basic sanity checks first
5723 	 */
5724 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5725 		return FALSE;
5726 	}
5727 
5728 	/*
5729 	 *      Check first if the region starts within a valid
5730 	 *	mapping for the map.
5731 	 */
5732 	if (!vm_map_lookup_entry(map, start, &cur)) {
5733 		return FALSE;
5734 	}
5735 
5736 	/*
5737 	 *	Optimize for the case that the region is contained
5738 	 *	in a single map entry.
5739 	 */
5740 	if (entry != (vm_map_entry_t *) NULL) {
5741 		*entry = cur;
5742 	}
5743 	if (end <= cur->vme_end) {
5744 		return TRUE;
5745 	}
5746 
5747 	/*
5748 	 *      If the region is not wholly contained within a
5749 	 *      single entry, walk the entries looking for holes.
5750 	 */
5751 	prev = cur->vme_end;
5752 	cur = cur->vme_next;
5753 	while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5754 		if (end <= cur->vme_end) {
5755 			return TRUE;
5756 		}
5757 		prev = cur->vme_end;
5758 		cur = cur->vme_next;
5759 	}
5760 	return FALSE;
5761 }
5762 
5763 /*
5764  *	vm_map_protect:
5765  *
5766  *	Sets the protection of the specified address
5767  *	region in the target map.  If "set_max" is
5768  *	specified, the maximum protection is to be set;
5769  *	otherwise, only the current protection is affected.
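 *
 *	The routine makes two passes over the range: a first pass that only
 *	checks for holes and protection violations, so a failure leaves the
 *	map untouched, and a second pass that clips the boundary entries,
 *	updates their protections and pushes the change down to the pmap
 *	where required.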
5770  */
5771 kern_return_t
5772 vm_map_protect(
5773 	vm_map_t        map,
5774 	vm_map_offset_t start,
5775 	vm_map_offset_t end,
5776 	vm_prot_t       new_prot,
5777 	boolean_t       set_max)
5778 {
5779 	vm_map_entry_t                  current;
5780 	vm_map_offset_t                 prev;
5781 	vm_map_entry_t                  entry;
5782 	vm_prot_t                       new_max;
5783 	int                             pmap_options = 0;
5784 	kern_return_t                   kr;
5785 
5786 	if (new_prot & VM_PROT_COPY) {
5787 		vm_map_offset_t         new_start;
5788 		vm_prot_t               cur_prot, max_prot;
5789 		vm_map_kernel_flags_t   kflags;
5790 
5791 		/* LP64todo - see below */
5792 		if (start >= map->max_offset) {
5793 			return KERN_INVALID_ADDRESS;
5794 		}
5795 
5796 		if ((new_prot & VM_PROT_ALLEXEC) &&
5797 		    map->pmap != kernel_pmap &&
5798 		    (vm_map_cs_enforcement(map)
5799 #if XNU_TARGET_OS_OSX && __arm64__
5800 		    || !VM_MAP_IS_EXOTIC(map)
5801 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
5802 		    ) &&
5803 		    VM_MAP_POLICY_WX_FAIL(map)) {
5804 			DTRACE_VM3(cs_wx,
5805 			    uint64_t, (uint64_t) start,
5806 			    uint64_t, (uint64_t) end,
5807 			    vm_prot_t, new_prot);
5808 			printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
5809 			    proc_selfpid(),
5810 			    (current_task()->bsd_info
5811 			    ? proc_name_address(current_task()->bsd_info)
5812 			    : "?"),
5813 			    __FUNCTION__);
5814 			return KERN_PROTECTION_FAILURE;
5815 		}
5816 
5817 		/*
5818 		 * Let vm_map_remap_extract() know that it will need to:
5819 		 * + make a copy of the mapping
5820 		 * + add VM_PROT_WRITE to the max protections
5821 		 * + remove any protections that are no longer allowed from the
5822 		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
5823 		 *   example).
5824 		 * Note that "max_prot" is an IN/OUT parameter only for this
5825 		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
5826 		 * only.
5827 		 */
5828 		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
5829 		cur_prot = VM_PROT_NONE;
5830 		kflags = VM_MAP_KERNEL_FLAGS_NONE;
5831 		kflags.vmkf_remap_prot_copy = TRUE;
5832 		kflags.vmkf_overwrite_immutable = TRUE;
5833 		new_start = start;
5834 		kr = vm_map_remap(map,
5835 		    &new_start,
5836 		    end - start,
5837 		    0, /* mask */
5838 		    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
5839 		    kflags,
5840 		    0,
5841 		    map,
5842 		    start,
5843 		    TRUE, /* copy-on-write remapping! */
5844 		    &cur_prot, /* IN/OUT */
5845 		    &max_prot, /* IN/OUT */
5846 		    VM_INHERIT_DEFAULT);
5847 		if (kr != KERN_SUCCESS) {
5848 			return kr;
5849 		}
5850 		new_prot &= ~VM_PROT_COPY;
5851 	}
5852 
5853 	vm_map_lock(map);
5854 
5855 	/* LP64todo - remove this check when vm_map_commpage64()
5856 	 * no longer has to stuff in a map_entry for the commpage
5857 	 * above the map's max_offset.
5858 	 */
5859 	if (start >= map->max_offset) {
5860 		vm_map_unlock(map);
5861 		return KERN_INVALID_ADDRESS;
5862 	}
5863 
5864 	while (1) {
5865 		/*
5866 		 *      Lookup the entry.  If it doesn't start in a valid
5867 		 *	entry, return an error.
5868 		 */
5869 		if (!vm_map_lookup_entry(map, start, &entry)) {
5870 			vm_map_unlock(map);
5871 			return KERN_INVALID_ADDRESS;
5872 		}
5873 
5874 		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
5875 			start = SUPERPAGE_ROUND_DOWN(start);
5876 			continue;
5877 		}
5878 		break;
5879 	}
5880 	if (entry->superpage_size) {
5881 		end = SUPERPAGE_ROUND_UP(end);
5882 	}
5883 
5884 	/*
5885 	 *	Make a first pass to check for protection and address
5886 	 *	violations.
5887 	 */
5888 
5889 	current = entry;
5890 	prev = current->vme_start;
5891 	while ((current != vm_map_to_entry(map)) &&
5892 	    (current->vme_start < end)) {
5893 		/*
5894 		 * If there is a hole, return an error.
5895 		 */
5896 		if (current->vme_start != prev) {
5897 			vm_map_unlock(map);
5898 			return KERN_INVALID_ADDRESS;
5899 		}
5900 
5901 		new_max = current->max_protection;
5902 
5903 #if defined(__x86_64__)
5904 		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
5905 		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
5906 			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
5907 		}
5908 #endif
5909 		if ((new_prot & new_max) != new_prot) {
5910 			vm_map_unlock(map);
5911 			return KERN_PROTECTION_FAILURE;
5912 		}
5913 
5914 		if (current->used_for_jit &&
5915 		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
5916 			vm_map_unlock(map);
5917 			return KERN_PROTECTION_FAILURE;
5918 		}
5919 
5920 		if ((new_prot & VM_PROT_WRITE) &&
5921 		    (new_prot & VM_PROT_ALLEXEC) &&
5922 #if XNU_TARGET_OS_OSX
5923 		    map->pmap != kernel_pmap &&
5924 		    (vm_map_cs_enforcement(map)
5925 #if __arm64__
5926 		    || !VM_MAP_IS_EXOTIC(map)
5927 #endif /* __arm64__ */
5928 		    ) &&
5929 #endif /* XNU_TARGET_OS_OSX */
5930 		    !(current->used_for_jit)) {
5931 			DTRACE_VM3(cs_wx,
5932 			    uint64_t, (uint64_t) current->vme_start,
5933 			    uint64_t, (uint64_t) current->vme_end,
5934 			    vm_prot_t, new_prot);
5935 			printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
5936 			    proc_selfpid(),
5937 			    (current_task()->bsd_info
5938 			    ? proc_name_address(current_task()->bsd_info)
5939 			    : "?"),
5940 			    __FUNCTION__);
5941 			new_prot &= ~VM_PROT_ALLEXEC;
5942 			if (VM_MAP_POLICY_WX_FAIL(map)) {
5943 				vm_map_unlock(map);
5944 				return KERN_PROTECTION_FAILURE;
5945 			}
5946 		}
5947 
5948 		/*
5949 		 * If the task has requested executable lockdown,
5950 		 * deny both:
5951 		 * - adding executable protections OR
5952 		 * - adding write protections to an existing executable mapping.
5953 		 */
5954 		if (map->map_disallow_new_exec == TRUE) {
5955 			if ((new_prot & VM_PROT_ALLEXEC) ||
5956 			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
5957 				vm_map_unlock(map);
5958 				return KERN_PROTECTION_FAILURE;
5959 			}
5960 		}
5961 
5962 		prev = current->vme_end;
5963 		current = current->vme_next;
5964 	}
5965 
5966 #if __arm64__
5967 	if (end > prev &&
5968 	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
5969 		vm_map_entry_t prev_entry;
5970 
5971 		prev_entry = current->vme_prev;
5972 		if (prev_entry != vm_map_to_entry(map) &&
5973 		    !prev_entry->map_aligned &&
5974 		    (vm_map_round_page(prev_entry->vme_end,
5975 		    VM_MAP_PAGE_MASK(map))
5976 		    == end)) {
5977 			/*
5978 			 * The last entry in our range is not "map-aligned"
5979 			 * but it would have reached all the way to "end"
5980 			 * if it had been map-aligned, so this is not really
5981 			 * a hole in the range and we can proceed.
5982 			 */
5983 			prev = end;
5984 		}
5985 	}
5986 #endif /* __arm64__ */
5987 
5988 	if (end > prev) {
5989 		vm_map_unlock(map);
5990 		return KERN_INVALID_ADDRESS;
5991 	}
5992 
5993 	/*
5994 	 *	Go back and fix up protections.
5995 	 *	Clip to start here if the range starts within
5996 	 *	the entry.
5997 	 */
5998 
5999 	current = entry;
6000 	if (current != vm_map_to_entry(map)) {
6001 		/* clip and unnest if necessary */
6002 		vm_map_clip_start(map, current, start);
6003 	}
6004 
6005 	while ((current != vm_map_to_entry(map)) &&
6006 	    (current->vme_start < end)) {
6007 		vm_prot_t       old_prot;
6008 
6009 		vm_map_clip_end(map, current, end);
6010 
6011 		if (current->is_sub_map) {
6012 			/* clipping did unnest if needed */
6013 			assert(!current->use_pmap);
6014 		}
6015 
6016 		old_prot = current->protection;
6017 
6018 		if (set_max) {
6019 			current->max_protection = new_prot;
6020 			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
6021 			current->protection = (new_prot & old_prot);
6022 		} else {
6023 			current->protection = new_prot;
6024 		}
6025 
6026 		/*
6027 		 *	Update physical map if necessary.
6028 		 *	If the request is to turn off write protection,
6029 		 *	we won't do it for real (in pmap). This is because
6030 		 *	it would cause copy-on-write to fail.  We've already
6031 		 *	set the new protection in the map, so if a
6032 		 *	write-protect fault occurs, it will be fixed up
6033 		 *	properly, COW or not.
6034 		 */
6035 		if (current->protection != old_prot) {
6036 			/* Look one level in: we support nested pmaps */
6037 			/* from mapped submaps which are direct entries */
6038 			/* in our map */
6039 
6040 			vm_prot_t prot;
6041 
6042 			prot = current->protection;
6043 			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6044 				prot &= ~VM_PROT_WRITE;
6045 			} else {
6046 				assert(!VME_OBJECT(current)->code_signed);
6047 				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6048 				if (prot & VM_PROT_WRITE) {
6049 					/*
6050 					 * For write requests on the
6051 					 * compressor, we will ask the
6052 					 * pmap layer to prevent us from
6053 					 * taking a write fault when we
6054 					 * attempt to access the mapping
6055 					 * next.
6056 					 */
6057 					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6058 				}
6059 			}
6060 
6061 			if (override_nx(map, VME_ALIAS(current)) && prot) {
6062 				prot |= VM_PROT_EXECUTE;
6063 			}
6064 
6065 #if DEVELOPMENT || DEBUG
6066 			if (!(old_prot & VM_PROT_EXECUTE) &&
6067 			    (prot & VM_PROT_EXECUTE) &&
6068 			    panic_on_unsigned_execute &&
6069 			    (proc_selfcsflags() & CS_KILL)) {
6070 				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6071 			}
6072 #endif /* DEVELOPMENT || DEBUG */
6073 
6074 			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6075 				if (current->wired_count) {
6076 					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6077 					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6078 				}
6079 
6080 				/* If the pmap layer cares about this
6081 				 * protection type, force a fault for
6082 				 * each page so that vm_fault will
6083 				 * repopulate the page with the full
6084 				 * set of protections.
6085 				 */
6086 				/*
6087 				 * TODO: We don't seem to need this,
6088 				 * but this is due to an internal
6089 				 * implementation detail of
6090 				 * pmap_protect.  Do we want to rely
6091 				 * on this?
6092 				 */
6093 				prot = VM_PROT_NONE;
6094 			}
6095 
6096 			if (current->is_sub_map && current->use_pmap) {
6097 				pmap_protect(VME_SUBMAP(current)->pmap,
6098 				    current->vme_start,
6099 				    current->vme_end,
6100 				    prot);
6101 			} else {
6102 				pmap_protect_options(map->pmap,
6103 				    current->vme_start,
6104 				    current->vme_end,
6105 				    prot,
6106 				    pmap_options,
6107 				    NULL);
6108 			}
6109 		}
6110 		current = current->vme_next;
6111 	}
6112 
6113 	current = entry;
6114 	while ((current != vm_map_to_entry(map)) &&
6115 	    (current->vme_start <= end)) {
6116 		vm_map_simplify_entry(map, current);
6117 		current = current->vme_next;
6118 	}
6119 
6120 	vm_map_unlock(map);
6121 	return KERN_SUCCESS;
6122 }
6123 
6124 /*
6125  *	vm_map_inherit:
6126  *
6127  *	Sets the inheritance of the specified address
6128  *	range in the target map.  Inheritance
6129  *	affects how the map will be shared with
6130  *	child maps at the time of vm_map_fork.
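 *
 *	At vm_map_fork() time, VM_INHERIT_SHARE keeps the range shared
 *	between parent and child, VM_INHERIT_COPY gives the child a
 *	copy-on-write copy, and VM_INHERIT_NONE leaves the range unmapped
 *	in the child.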
6131  */
6132 kern_return_t
6133 vm_map_inherit(
6134 	vm_map_t        map,
6135 	vm_map_offset_t start,
6136 	vm_map_offset_t end,
6137 	vm_inherit_t    new_inheritance)
6138 {
6139 	vm_map_entry_t  entry;
6140 	vm_map_entry_t  temp_entry;
6141 
6142 	vm_map_lock(map);
6143 
6144 	VM_MAP_RANGE_CHECK(map, start, end);
6145 
6146 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
6147 		entry = temp_entry;
6148 	} else {
6149 		temp_entry = temp_entry->vme_next;
6150 		entry = temp_entry;
6151 	}
6152 
6153 	/* first check entire range for submaps which can't support the */
6154 	/* given inheritance. */
6155 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6156 		if (entry->is_sub_map) {
6157 			if (new_inheritance == VM_INHERIT_COPY) {
6158 				vm_map_unlock(map);
6159 				return KERN_INVALID_ARGUMENT;
6160 			}
6161 		}
6162 
6163 		entry = entry->vme_next;
6164 	}
6165 
6166 	entry = temp_entry;
6167 	if (entry != vm_map_to_entry(map)) {
6168 		/* clip and unnest if necessary */
6169 		vm_map_clip_start(map, entry, start);
6170 	}
6171 
6172 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6173 		vm_map_clip_end(map, entry, end);
6174 		if (entry->is_sub_map) {
6175 			/* clip did unnest if needed */
6176 			assert(!entry->use_pmap);
6177 		}
6178 
6179 		entry->inheritance = new_inheritance;
6180 
6181 		entry = entry->vme_next;
6182 	}
6183 
6184 	vm_map_unlock(map);
6185 	return KERN_SUCCESS;
6186 }
6187 
6188 /*
6189  * Update the accounting for the amount of wired memory in this map.  If the user has
6190  * exceeded the defined limits, then we fail.  Wiring on behalf of the kernel never fails.
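 *
 * For example (hypothetical numbers): with an effective per-task limit of
 * 64 MB, a map that already has 60 MB user-wired will fail a request to wire
 * another 8 MB with KERN_RESOURCE_SHORTAGE, since size + map->user_wire_size
 * would exceed MIN(map->user_wire_limit, vm_per_task_user_wire_limit); the
 * same kind of check is applied against the system-wide
 * vm_global_user_wire_limit.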
6191  */
6192 
6193 static kern_return_t
6194 add_wire_counts(
6195 	vm_map_t        map,
6196 	vm_map_entry_t  entry,
6197 	boolean_t       user_wire)
6198 {
6199 	vm_map_size_t   size;
6200 
6201 	if (user_wire) {
6202 		unsigned int total_wire_count =  vm_page_wire_count + vm_lopage_free_count;
6203 
6204 		/*
6205 		 * We're wiring memory at the request of the user.  Check if this is the first time the user is wiring
6206 		 * this map entry.
6207 		 */
6208 
6209 		if (entry->user_wired_count == 0) {
6210 			size = entry->vme_end - entry->vme_start;
6211 
6212 			/*
6213 			 * Since this is the first time the user is wiring this map entry, check to see if we're
6214 			 * exceeding the user wire limits.  There is a per map limit which is the smaller of either
6215 			 * exceeding the user wire limits.  There is a per-map limit, which is the smaller of
6216 			 * the process's rlimit and the global vm_per_task_user_wire_limit.  There is also
6217 			 * limit, then we fail.
6218 			 */
6219 
6220 			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6221 			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6222 				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6223 #if DEVELOPMENT || DEBUG
6224 					if (panic_on_mlock_failure) {
6225 						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6226 					}
6227 #endif /* DEVELOPMENT || DEBUG */
6228 					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6229 				} else {
6230 					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6231 #if DEVELOPMENT || DEBUG
6232 					if (panic_on_mlock_failure) {
6233 						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6234 					}
6235 #endif /* DEVELOPMENT || DEBUG */
6236 				}
6237 				return KERN_RESOURCE_SHORTAGE;
6238 			}
6239 
6240 			/*
6241 			 * The first time the user wires an entry, we also increment the wired_count and add this to
6242 			 * the total that has been wired in the map.
6243 			 */
6244 
6245 			if (entry->wired_count >= MAX_WIRE_COUNT) {
6246 				return KERN_FAILURE;
6247 			}
6248 
6249 			entry->wired_count++;
6250 			map->user_wire_size += size;
6251 		}
6252 
6253 		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6254 			return KERN_FAILURE;
6255 		}
6256 
6257 		entry->user_wired_count++;
6258 	} else {
6259 		/*
6260 		 * The kernel's wiring the memory.  Just bump the count and continue.
6261 		 */
6262 
6263 		if (entry->wired_count >= MAX_WIRE_COUNT) {
6264 			panic("vm_map_wire: too many wirings");
6265 		}
6266 
6267 		entry->wired_count++;
6268 	}
6269 
6270 	return KERN_SUCCESS;
6271 }
6272 
6273 /*
6274  * Update the memory wiring accounting now that the given map entry is being unwired.
6275  */
6276 
6277 static void
6278 subtract_wire_counts(
6279 	vm_map_t        map,
6280 	vm_map_entry_t  entry,
6281 	boolean_t       user_wire)
6282 {
6283 	if (user_wire) {
6284 		/*
6285 		 * We're unwiring memory at the request of the user.  See if we're removing the last user wire reference.
6286 		 */
6287 
6288 		if (entry->user_wired_count == 1) {
6289 			/*
6290 			 * We're removing the last user wire reference.  Decrement the wired_count and the total
6291 			 * user wired memory for this map.
6292 			 */
6293 
6294 			assert(entry->wired_count >= 1);
6295 			entry->wired_count--;
6296 			map->user_wire_size -= entry->vme_end - entry->vme_start;
6297 		}
6298 
6299 		assert(entry->user_wired_count >= 1);
6300 		entry->user_wired_count--;
6301 	} else {
6302 		/*
6303 		 * The kernel is unwiring the memory.   Just update the count.
6304 		 */
6305 
6306 		assert(entry->wired_count >= 1);
6307 		entry->wired_count--;
6308 	}
6309 }
6310 
6311 int cs_executable_wire = 0;
6312 
6313 /*
6314  *	vm_map_wire:
6315  *
6316  *	Sets the pageability of the specified address range in the
6317  *	target map as wired.  Regions specified as not pageable require
6318  *	locked-down physical memory and physical page maps.  The
6319  *	access_type variable indicates types of accesses that must not
6320  *	generate page faults.  This is checked against protection of
6321  *	memory being locked-down.
6322  *
6323  *	The map must not be locked, but a reference must remain to the
6324  *	map throughout the call.
6325  */
6326 static kern_return_t
6327 vm_map_wire_nested(
6328 	vm_map_t                map,
6329 	vm_map_offset_t         start,
6330 	vm_map_offset_t         end,
6331 	vm_prot_t               caller_prot,
6332 	vm_tag_t                tag,
6333 	boolean_t               user_wire,
6334 	pmap_t                  map_pmap,
6335 	vm_map_offset_t         pmap_addr,
6336 	ppnum_t                 *physpage_p)
6337 {
6338 	vm_map_entry_t          entry;
6339 	vm_prot_t               access_type;
6340 	struct vm_map_entry     *first_entry, tmp_entry;
6341 	vm_map_t                real_map;
6342 	vm_map_offset_t         s, e;
6343 	kern_return_t           rc;
6344 	boolean_t               need_wakeup;
6345 	boolean_t               main_map = FALSE;
6346 	wait_interrupt_t        interruptible_state;
6347 	thread_t                cur_thread;
6348 	unsigned int            last_timestamp;
6349 	vm_map_size_t           size;
6350 	boolean_t               wire_and_extract;
6351 	vm_prot_t               extra_prots;
6352 
6353 	extra_prots = VM_PROT_COPY;
6354 	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6355 #if XNU_TARGET_OS_OSX
6356 	if (map->pmap == kernel_pmap ||
6357 	    !vm_map_cs_enforcement(map)) {
6358 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6359 	}
6360 #endif /* XNU_TARGET_OS_OSX */
6361 
6362 	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6363 
6364 	wire_and_extract = FALSE;
6365 	if (physpage_p != NULL) {
6366 		/*
6367 		 * The caller wants the physical page number of the
6368 		 * wired page.  We return only one physical page number
6369 		 * so this works for only one page at a time.
6370 		 */
6371 		if ((end - start) != PAGE_SIZE) {
6372 			return KERN_INVALID_ARGUMENT;
6373 		}
6374 		wire_and_extract = TRUE;
6375 		*physpage_p = 0;
6376 	}
6377 
6378 	vm_map_lock(map);
6379 	if (map_pmap == NULL) {
6380 		main_map = TRUE;
6381 	}
6382 	last_timestamp = map->timestamp;
6383 
6384 	VM_MAP_RANGE_CHECK(map, start, end);
6385 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6386 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6387 
6388 	if (start == end) {
6389 		/* We wired what the caller asked for, zero pages */
6390 		vm_map_unlock(map);
6391 		return KERN_SUCCESS;
6392 	}
6393 
6394 	need_wakeup = FALSE;
6395 	cur_thread = current_thread();
6396 
6397 	s = start;
6398 	rc = KERN_SUCCESS;
6399 
6400 	if (vm_map_lookup_entry(map, s, &first_entry)) {
6401 		entry = first_entry;
6402 		/*
6403 		 * vm_map_clip_start will be done later.
6404 		 * We don't want to unnest any nested submaps here !
6405 		 */
6406 	} else {
6407 		/* Start address is not in map */
6408 		rc = KERN_INVALID_ADDRESS;
6409 		goto done;
6410 	}
6411 
6412 	while ((entry != vm_map_to_entry(map)) && (s < end)) {
6413 		/*
6414 		 * At this point, we have wired from "start" to "s".
6415 		 * We still need to wire from "s" to "end".
6416 		 *
6417 		 * "entry" hasn't been clipped, so it could start before "s"
6418 		 * and/or end after "end".
6419 		 */
6420 
6421 		/* "e" is how far we want to wire in this entry */
6422 		e = entry->vme_end;
6423 		if (e > end) {
6424 			e = end;
6425 		}
6426 
6427 		/*
6428 		 * If another thread is wiring/unwiring this entry then
6429 		 * block after informing the other thread to wake us up.
6430 		 */
6431 		if (entry->in_transition) {
6432 			wait_result_t wait_result;
6433 
6434 			/*
6435 			 * We have not clipped the entry.  Make sure that
6436 			 * the start address is in range so that the lookup
6437 			 * below will succeed.
6438 			 * "s" is the current starting point: we've already
6439 			 * wired from "start" to "s" and we still have
6440 			 * to wire from "s" to "end".
6441 			 */
6442 
6443 			entry->needs_wakeup = TRUE;
6444 
6445 			/*
6446 			 * wake up anybody waiting on entries that we have
6447 			 * already wired.
6448 			 */
6449 			if (need_wakeup) {
6450 				vm_map_entry_wakeup(map);
6451 				need_wakeup = FALSE;
6452 			}
6453 			/*
6454 			 * User wiring is interruptible
6455 			 */
6456 			wait_result = vm_map_entry_wait(map,
6457 			    (user_wire) ? THREAD_ABORTSAFE :
6458 			    THREAD_UNINT);
6459 			if (user_wire && wait_result == THREAD_INTERRUPTED) {
6460 				/*
6461 				 * undo the wirings we have done so far.
6462 				 * We do not clear the needs_wakeup flag,
6463 				 * because we cannot tell if we were the
6464 				 * only one waiting.
6465 				 */
6466 				rc = KERN_FAILURE;
6467 				goto done;
6468 			}
6469 
6470 			/*
6471 			 * Cannot avoid a lookup here. reset timestamp.
6472 			 */
6473 			last_timestamp = map->timestamp;
6474 
6475 			/*
6476 			 * The entry could have been clipped, look it up again.
6477 			 * Worst that can happen is that it may not exist anymore.
6478 			 */
6479 			if (!vm_map_lookup_entry(map, s, &first_entry)) {
6480 				/*
6481 				 * User: undo everything up to the previous
6482 				 * entry.  Let vm_map_unwire worry about
6483 				 * checking the validity of the range.
6484 				 */
6485 				rc = KERN_FAILURE;
6486 				goto done;
6487 			}
6488 			entry = first_entry;
6489 			continue;
6490 		}
6491 
6492 		if (entry->is_sub_map) {
6493 			vm_map_offset_t sub_start;
6494 			vm_map_offset_t sub_end;
6495 			vm_map_offset_t local_start;
6496 			vm_map_offset_t local_end;
6497 			pmap_t          pmap;
6498 
6499 			if (wire_and_extract) {
6500 				/*
6501 				 * Wiring would result in copy-on-write
6502 				 * which would not be compatible with
6503 				 * the sharing we have with the original
6504 				 * provider of this memory.
6505 				 */
6506 				rc = KERN_INVALID_ARGUMENT;
6507 				goto done;
6508 			}
6509 
6510 			vm_map_clip_start(map, entry, s);
6511 			vm_map_clip_end(map, entry, end);
6512 
6513 			sub_start = VME_OFFSET(entry);
6514 			sub_end = entry->vme_end;
6515 			sub_end += VME_OFFSET(entry) - entry->vme_start;
6516 
6517 			local_end = entry->vme_end;
6518 			if (map_pmap == NULL) {
6519 				vm_object_t             object;
6520 				vm_object_offset_t      offset;
6521 				vm_prot_t               prot;
6522 				boolean_t               wired;
6523 				vm_map_entry_t          local_entry;
6524 				vm_map_version_t         version;
6525 				vm_map_t                lookup_map;
6526 
6527 				if (entry->use_pmap) {
6528 					pmap = VME_SUBMAP(entry)->pmap;
6529 					/* ppc implementation requires that */
6530 					/* submaps' pmap address ranges line */
6531 					/* up with parent map */
6532 #ifdef notdef
6533 					pmap_addr = sub_start;
6534 #endif
6535 					pmap_addr = s;
6536 				} else {
6537 					pmap = map->pmap;
6538 					pmap_addr = s;
6539 				}
6540 
6541 				if (entry->wired_count) {
6542 					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6543 						goto done;
6544 					}
6545 
6546 					/*
6547 					 * The map was not unlocked:
6548 					 * no need to goto re-lookup.
6549 					 * Just go directly to next entry.
6550 					 */
6551 					entry = entry->vme_next;
6552 					s = entry->vme_start;
6553 					continue;
6554 				}
6555 
6556 				/* call vm_map_lookup_locked to */
6557 				/* cause any needs_copy to be   */
6558 				/* evaluated */
6559 				local_start = entry->vme_start;
6560 				lookup_map = map;
6561 				vm_map_lock_write_to_read(map);
6562 				rc = vm_map_lookup_locked(
6563 					&lookup_map, local_start,
6564 					(access_type | extra_prots),
6565 					OBJECT_LOCK_EXCLUSIVE,
6566 					&version, &object,
6567 					&offset, &prot, &wired,
6568 					NULL,
6569 					&real_map, NULL);
6570 				if (rc != KERN_SUCCESS) {
6571 					vm_map_unlock_read(lookup_map);
6572 					assert(map_pmap == NULL);
6573 					vm_map_unwire(map, start,
6574 					    s, user_wire);
6575 					return rc;
6576 				}
6577 				vm_object_unlock(object);
6578 				if (real_map != lookup_map) {
6579 					vm_map_unlock(real_map);
6580 				}
6581 				vm_map_unlock_read(lookup_map);
6582 				vm_map_lock(map);
6583 
6584 				/* we unlocked, so must re-lookup */
6585 				if (!vm_map_lookup_entry(map,
6586 				    local_start,
6587 				    &local_entry)) {
6588 					rc = KERN_FAILURE;
6589 					goto done;
6590 				}
6591 
6592 				/*
6593 				 * entry could have been "simplified",
6594 				 * so re-clip
6595 				 */
6596 				entry = local_entry;
6597 				assert(s == local_start);
6598 				vm_map_clip_start(map, entry, s);
6599 				vm_map_clip_end(map, entry, end);
6600 				/* re-compute "e" */
6601 				e = entry->vme_end;
6602 				if (e > end) {
6603 					e = end;
6604 				}
6605 
6606 				/* did we have a change of type? */
6607 				if (!entry->is_sub_map) {
6608 					last_timestamp = map->timestamp;
6609 					continue;
6610 				}
6611 			} else {
6612 				local_start = entry->vme_start;
6613 				pmap = map_pmap;
6614 			}
6615 
6616 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6617 				goto done;
6618 			}
6619 
6620 			entry->in_transition = TRUE;
6621 
6622 			vm_map_unlock(map);
6623 			rc = vm_map_wire_nested(VME_SUBMAP(entry),
6624 			    sub_start, sub_end,
6625 			    caller_prot, tag,
6626 			    user_wire, pmap, pmap_addr,
6627 			    NULL);
6628 			vm_map_lock(map);
6629 
6630 			/*
6631 			 * Find the entry again.  It could have been clipped
6632 			 * after we unlocked the map.
6633 			 */
6634 			if (!vm_map_lookup_entry(map, local_start,
6635 			    &first_entry)) {
6636 				panic("vm_map_wire: re-lookup failed");
6637 			}
6638 			entry = first_entry;
6639 
6640 			assert(local_start == s);
6641 			/* re-compute "e" */
6642 			e = entry->vme_end;
6643 			if (e > end) {
6644 				e = end;
6645 			}
6646 
6647 			last_timestamp = map->timestamp;
6648 			while ((entry != vm_map_to_entry(map)) &&
6649 			    (entry->vme_start < e)) {
6650 				assert(entry->in_transition);
6651 				entry->in_transition = FALSE;
6652 				if (entry->needs_wakeup) {
6653 					entry->needs_wakeup = FALSE;
6654 					need_wakeup = TRUE;
6655 				}
6656 				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6657 					subtract_wire_counts(map, entry, user_wire);
6658 				}
6659 				entry = entry->vme_next;
6660 			}
6661 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6662 				goto done;
6663 			}
6664 
6665 			/* no need to relookup again */
6666 			s = entry->vme_start;
6667 			continue;
6668 		}
6669 
6670 		/*
6671 		 * If this entry is already wired then increment
6672 		 * the appropriate wire reference count.
6673 		 */
6674 		if (entry->wired_count) {
6675 			if ((entry->protection & access_type) != access_type) {
6676 				/* found a protection problem */
6677 
6678 				/*
6679 				 * XXX FBDP
6680 				 * We should always return an error
6681 				 * in this case but since we didn't
6682 				 * enforce it before, let's do
6683 				 * it only for the new "wire_and_extract"
6684 				 * code path for now...
6685 				 */
6686 				if (wire_and_extract) {
6687 					rc = KERN_PROTECTION_FAILURE;
6688 					goto done;
6689 				}
6690 			}
6691 
6692 			/*
6693 			 * entry is already wired down, get our reference
6694 			 * after clipping to our range.
6695 			 */
6696 			vm_map_clip_start(map, entry, s);
6697 			vm_map_clip_end(map, entry, end);
6698 
6699 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6700 				goto done;
6701 			}
6702 
6703 			if (wire_and_extract) {
6704 				vm_object_t             object;
6705 				vm_object_offset_t      offset;
6706 				vm_page_t               m;
6707 
6708 				/*
6709 				 * We don't have to "wire" the page again
6710 				 * but we still have to "extract" its
6711 				 * physical page number, after some sanity
6712 				 * checks.
6713 				 */
6714 				assert((entry->vme_end - entry->vme_start)
6715 				    == PAGE_SIZE);
6716 				assert(!entry->needs_copy);
6717 				assert(!entry->is_sub_map);
6718 				assert(VME_OBJECT(entry));
6719 				if (((entry->vme_end - entry->vme_start)
6720 				    != PAGE_SIZE) ||
6721 				    entry->needs_copy ||
6722 				    entry->is_sub_map ||
6723 				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
6724 					rc = KERN_INVALID_ARGUMENT;
6725 					goto done;
6726 				}
6727 
6728 				object = VME_OBJECT(entry);
6729 				offset = VME_OFFSET(entry);
6730 				/* need exclusive lock to update m->dirty */
6731 				if (entry->protection & VM_PROT_WRITE) {
6732 					vm_object_lock(object);
6733 				} else {
6734 					vm_object_lock_shared(object);
6735 				}
6736 				m = vm_page_lookup(object, offset);
6737 				assert(m != VM_PAGE_NULL);
6738 				assert(VM_PAGE_WIRED(m));
6739 				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6740 					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6741 					if (entry->protection & VM_PROT_WRITE) {
6742 						vm_object_lock_assert_exclusive(
6743 							object);
6744 						m->vmp_dirty = TRUE;
6745 					}
6746 				} else {
6747 					/* not already wired !? */
6748 					*physpage_p = 0;
6749 				}
6750 				vm_object_unlock(object);
6751 			}
6752 
6753 			/* map was not unlocked: no need to relookup */
6754 			entry = entry->vme_next;
6755 			s = entry->vme_start;
6756 			continue;
6757 		}
6758 
6759 		/*
6760 		 * Unwired entry or wire request transmitted via submap
6761 		 */
6762 
6763 		/*
6764 		 * Wiring would copy the pages to the shadow object.
6765 		 * The shadow object would not be code-signed so
6766 		 * attempting to execute code from these copied pages
6767 		 * would trigger a code-signing violation.
6768 		 */
6769 
6770 		if ((entry->protection & VM_PROT_EXECUTE)
6771 #if XNU_TARGET_OS_OSX
6772 		    &&
6773 		    map->pmap != kernel_pmap &&
6774 		    (vm_map_cs_enforcement(map)
6775 #if __arm64__
6776 		    || !VM_MAP_IS_EXOTIC(map)
6777 #endif /* __arm64__ */
6778 		    )
6779 #endif /* XNU_TARGET_OS_OSX */
6780 		    ) {
6781 #if MACH_ASSERT
6782 			printf("pid %d[%s] wiring executable range from "
6783 			    "0x%llx to 0x%llx: rejected to preserve "
6784 			    "code-signing\n",
6785 			    proc_selfpid(),
6786 			    (current_task()->bsd_info
6787 			    ? proc_name_address(current_task()->bsd_info)
6788 			    : "?"),
6789 			    (uint64_t) entry->vme_start,
6790 			    (uint64_t) entry->vme_end);
6791 #endif /* MACH_ASSERT */
6792 			DTRACE_VM2(cs_executable_wire,
6793 			    uint64_t, (uint64_t)entry->vme_start,
6794 			    uint64_t, (uint64_t)entry->vme_end);
6795 			cs_executable_wire++;
6796 			rc = KERN_PROTECTION_FAILURE;
6797 			goto done;
6798 		}
6799 
6800 		/*
6801 		 * Perform actions of vm_map_lookup that need the write
6802 		 * lock on the map: create a shadow object for a
6803 		 * copy-on-write region, or an object for a zero-fill
6804 		 * region.
6805 		 */
6806 		size = entry->vme_end - entry->vme_start;
6807 		/*
6808 		 * If wiring a copy-on-write page, we need to copy it now
6809 		 * even if we're only (currently) requesting read access.
6810 		 * This is aggressive, but once it's wired we can't move it.
6811 		 */
6812 		if (entry->needs_copy) {
6813 			if (wire_and_extract) {
6814 				/*
6815 				 * We're supposed to share with the original
6816 				 * provider so should not be "needs_copy"
6817 				 */
6818 				rc = KERN_INVALID_ARGUMENT;
6819 				goto done;
6820 			}
6821 
6822 			VME_OBJECT_SHADOW(entry, size);
6823 			entry->needs_copy = FALSE;
6824 		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6825 			if (wire_and_extract) {
6826 				/*
6827 				 * We're supposed to share with the original
6828 				 * provider so should already have an object.
6829 				 */
6830 				rc = KERN_INVALID_ARGUMENT;
6831 				goto done;
6832 			}
6833 			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
6834 			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6835 			assert(entry->use_pmap);
6836 		}
6837 
6838 		vm_map_clip_start(map, entry, s);
6839 		vm_map_clip_end(map, entry, end);
6840 
6841 		/* re-compute "e" */
6842 		e = entry->vme_end;
6843 		if (e > end) {
6844 			e = end;
6845 		}
6846 
6847 		/*
6848 		 * Check for holes and protection mismatch.
6849 		 * Holes: Next entry should be contiguous unless this
6850 		 *	  is the end of the region.
6851 		 * Protection: Access requested must be allowed, unless
6852 		 *	wiring is by protection class
6853 		 */
6854 		if ((entry->vme_end < end) &&
6855 		    ((entry->vme_next == vm_map_to_entry(map)) ||
6856 		    (entry->vme_next->vme_start > entry->vme_end))) {
6857 			/* found a hole */
6858 			rc = KERN_INVALID_ADDRESS;
6859 			goto done;
6860 		}
6861 		if ((entry->protection & access_type) != access_type) {
6862 			/* found a protection problem */
6863 			rc = KERN_PROTECTION_FAILURE;
6864 			goto done;
6865 		}
6866 
6867 		assert(entry->wired_count == 0 && entry->user_wired_count == 0);
6868 
6869 		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6870 			goto done;
6871 		}
6872 
6873 		entry->in_transition = TRUE;
6874 
6875 		/*
6876 		 * This entry might get split once we unlock the map.
6877 		 * In vm_fault_wire(), we need the current range as
6878 		 * defined by this entry.  In order for this to work
6879 		 * along with a simultaneous clip operation, we make a
6880 		 * temporary copy of this entry and use that for the
6881 		 * wiring.  Note that the underlying objects do not
6882 		 * change during a clip.
6883 		 */
6884 		tmp_entry = *entry;
6885 
6886 		/*
6887 		 * The in_transition state guarantees that the entry
6888 		 * (or entries for this range, if a split occurred) will be
6889 		 * there when the map lock is acquired for the second time.
6890 		 */
6891 		vm_map_unlock(map);
6892 
6893 		if (!user_wire && cur_thread != THREAD_NULL) {
6894 			interruptible_state = thread_interrupt_level(THREAD_UNINT);
6895 		} else {
6896 			interruptible_state = THREAD_UNINT;
6897 		}
6898 
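		/*
		 * Note (descriptive, added): for kernel-initiated wirings,
		 * interruptions were blocked above so the wiring faults below
		 * cannot be aborted partway through; user wirings keep the
		 * thread's current interrupt level.
		 */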
6899 		if (map_pmap) {
6900 			rc = vm_fault_wire(map,
6901 			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
6902 			    physpage_p);
6903 		} else {
6904 			rc = vm_fault_wire(map,
6905 			    &tmp_entry, caller_prot, tag, map->pmap,
6906 			    tmp_entry.vme_start,
6907 			    physpage_p);
6908 		}
6909 
6910 		if (!user_wire && cur_thread != THREAD_NULL) {
6911 			thread_interrupt_level(interruptible_state);
6912 		}
6913 
6914 		vm_map_lock(map);
6915 
6916 		if (last_timestamp + 1 != map->timestamp) {
6917 			/*
6918 			 * Find the entry again.  It could have been clipped
6919 			 * after we unlocked the map.
6920 			 */
6921 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
6922 			    &first_entry)) {
6923 				panic("vm_map_wire: re-lookup failed");
6924 			}
6925 
6926 			entry = first_entry;
6927 		}
6928 
6929 		last_timestamp = map->timestamp;
6930 
6931 		while ((entry != vm_map_to_entry(map)) &&
6932 		    (entry->vme_start < tmp_entry.vme_end)) {
6933 			assert(entry->in_transition);
6934 			entry->in_transition = FALSE;
6935 			if (entry->needs_wakeup) {
6936 				entry->needs_wakeup = FALSE;
6937 				need_wakeup = TRUE;
6938 			}
6939 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6940 				subtract_wire_counts(map, entry, user_wire);
6941 			}
6942 			entry = entry->vme_next;
6943 		}
6944 
6945 		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
6946 			goto done;
6947 		}
6948 
6949 		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
6950 		    (tmp_entry.vme_end != end) &&    /* AND, we are not at the end of the requested range */
6951 		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
6952 			/* found a "new" hole */
6953 			s = tmp_entry.vme_end;
6954 			rc = KERN_INVALID_ADDRESS;
6955 			goto done;
6956 		}
6957 
6958 		s = entry->vme_start;
6959 	} /* end while loop through map entries */
6960 
6961 done:
6962 	if (rc == KERN_SUCCESS) {
6963 		/* repair any damage we may have made to the VM map */
6964 		vm_map_simplify_range(map, start, end);
6965 	}
6966 
6967 	vm_map_unlock(map);
6968 
6969 	/*
6970 	 * wake up anybody waiting on entries we wired.
6971 	 */
6972 	if (need_wakeup) {
6973 		vm_map_entry_wakeup(map);
6974 	}
6975 
6976 	if (rc != KERN_SUCCESS) {
6977 		/* undo what has been wired so far */
6978 		vm_map_unwire_nested(map, start, s, user_wire,
6979 		    map_pmap, pmap_addr);
6980 		if (physpage_p) {
6981 			*physpage_p = 0;
6982 		}
6983 	}
6984 
6985 	return rc;
6986 }
6987 
6988 kern_return_t
6989 vm_map_wire_external(
6990 	vm_map_t                map,
6991 	vm_map_offset_t         start,
6992 	vm_map_offset_t         end,
6993 	vm_prot_t               caller_prot,
6994 	boolean_t               user_wire)
6995 {
6996 	kern_return_t   kret;
6997 
6998 	kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(),
6999 	    user_wire, (pmap_t)NULL, 0, NULL);
7000 	return kret;
7001 }
7002 
7003 kern_return_t
7004 vm_map_wire_kernel(
7005 	vm_map_t                map,
7006 	vm_map_offset_t         start,
7007 	vm_map_offset_t         end,
7008 	vm_prot_t               caller_prot,
7009 	vm_tag_t                tag,
7010 	boolean_t               user_wire)
7011 {
7012 	kern_return_t   kret;
7013 
7014 	kret = vm_map_wire_nested(map, start, end, caller_prot, tag,
7015 	    user_wire, (pmap_t)NULL, 0, NULL);
7016 	return kret;
7017 }
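
/*
 * Illustrative sketch (not a caller in this file): a kernel subsystem that
 * needs a range of a task's map pinned for the duration of an operation
 * would typically bracket it with the wiring calls above, e.g.:
 *
 *	kr = vm_map_wire_kernel(map, addr, addr + size,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_NONE, FALSE);
 *	if (kr == KERN_SUCCESS) {
 *		... operate on the wired range ...
 *		vm_map_unwire(map, addr, addr + size, FALSE);
 *	}
 *
 * "addr" and "size" are placeholders, and VM_KERN_MEMORY_NONE stands in for
 * the caller's real allocation tag.
 */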
7018 
7019 kern_return_t
7020 vm_map_wire_and_extract_external(
7021 	vm_map_t        map,
7022 	vm_map_offset_t start,
7023 	vm_prot_t       caller_prot,
7024 	boolean_t       user_wire,
7025 	ppnum_t         *physpage_p)
7026 {
7027 	kern_return_t   kret;
7028 
7029 	kret = vm_map_wire_nested(map,
7030 	    start,
7031 	    start + VM_MAP_PAGE_SIZE(map),
7032 	    caller_prot,
7033 	    vm_tag_bt(),
7034 	    user_wire,
7035 	    (pmap_t)NULL,
7036 	    0,
7037 	    physpage_p);
7038 	if (kret != KERN_SUCCESS &&
7039 	    physpage_p != NULL) {
7040 		*physpage_p = 0;
7041 	}
7042 	return kret;
7043 }
7044 
7045 kern_return_t
7046 vm_map_wire_and_extract_kernel(
7047 	vm_map_t        map,
7048 	vm_map_offset_t start,
7049 	vm_prot_t       caller_prot,
7050 	vm_tag_t        tag,
7051 	boolean_t       user_wire,
7052 	ppnum_t         *physpage_p)
7053 {
7054 	kern_return_t   kret;
7055 
7056 	kret = vm_map_wire_nested(map,
7057 	    start,
7058 	    start + VM_MAP_PAGE_SIZE(map),
7059 	    caller_prot,
7060 	    tag,
7061 	    user_wire,
7062 	    (pmap_t)NULL,
7063 	    0,
7064 	    physpage_p);
7065 	if (kret != KERN_SUCCESS &&
7066 	    physpage_p != NULL) {
7067 		*physpage_p = 0;
7068 	}
7069 	return kret;
7070 }
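
/*
 * Illustrative sketch (hypothetical caller): the wire-and-extract variants
 * above pin exactly one page of the map and report its physical page
 * number, e.g.:
 *
 *	ppnum_t phys = 0;
 *
 *	kr = vm_map_wire_and_extract_kernel(map, addr,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_NONE, FALSE, &phys);
 *
 * On any failure "phys" is reset to 0, as enforced above; "addr" and the
 * VM_KERN_MEMORY_NONE tag are placeholders.
 */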
7071 
7072 /*
7073  *	vm_map_unwire:
7074  *
7075  *	Marks the specified address range in the target map as pageable.
7076  *	Regions specified must have been wired previously.
7077  *
7078  *	The map must not be locked, but a reference must remain to the map
7079  *	throughout the call.
7080  *
7081  *	Kernel will panic on failures.  User unwire ignores holes and
7082  *	unwired and in-transition entries to avoid losing memory by leaving
7083  *	it unwired.
7084  */
7085 static kern_return_t
7086 vm_map_unwire_nested(
7087 	vm_map_t                map,
7088 	vm_map_offset_t         start,
7089 	vm_map_offset_t         end,
7090 	boolean_t               user_wire,
7091 	pmap_t                  map_pmap,
7092 	vm_map_offset_t         pmap_addr)
7093 {
7094 	vm_map_entry_t          entry;
7095 	struct vm_map_entry     *first_entry, tmp_entry;
7096 	boolean_t               need_wakeup;
7097 	boolean_t               main_map = FALSE;
7098 	unsigned int            last_timestamp;
7099 
7100 	vm_map_lock(map);
7101 	if (map_pmap == NULL) {
7102 		main_map = TRUE;
7103 	}
7104 	last_timestamp = map->timestamp;
7105 
7106 	VM_MAP_RANGE_CHECK(map, start, end);
7107 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7108 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7109 
7110 	if (start == end) {
7111 		/* We unwired what the caller asked for: zero pages */
7112 		vm_map_unlock(map);
7113 		return KERN_SUCCESS;
7114 	}
7115 
7116 	if (vm_map_lookup_entry(map, start, &first_entry)) {
7117 		entry = first_entry;
7118 		/*
7119 		 * vm_map_clip_start will be done later.
7120 		 * We don't want to unnest any nested sub maps here !
7121 		 */
7122 	} else {
7123 		if (!user_wire) {
7124 			panic("vm_map_unwire: start not found");
7125 		}
7126 		/*	Start address is not in map. */
7127 		vm_map_unlock(map);
7128 		return KERN_INVALID_ADDRESS;
7129 	}
7130 
7131 	if (entry->superpage_size) {
7132 		/* superpages are always wired */
7133 		vm_map_unlock(map);
7134 		return KERN_INVALID_ADDRESS;
7135 	}
7136 
7137 	need_wakeup = FALSE;
7138 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7139 		if (entry->in_transition) {
7140 			/*
7141 			 * 1)
7142 			 * Another thread is wiring down this entry. Note
7143 			 * that, were it not for the other thread, we would
7144 			 * be unwiring an unwired entry.  This is not
7145 			 * permitted.  If we wait, we will be unwiring memory
7146 			 * we did not wire.
7147 			 *
7148 			 * 2)
7149 			 * Another thread is unwiring this entry.  We did not
7150 			 * have a reference to it, because if we did, this
7151 			 * entry will not be getting unwired now.
7152 			 */
7153 			if (!user_wire) {
7154 				/*
7155 				 * XXX FBDP
7156 				 * This could happen:  there could be some
7157 				 * overlapping vslock/vsunlock operations
7158 				 * going on.
7159 				 * We should probably just wait and retry,
7160 				 * but then we have to be careful that this
7161 				 * entry could get "simplified" after
7162 				 * "in_transition" gets unset and before
7163 				 * we re-lookup the entry, so we would
7164 				 * have to re-clip the entry to avoid
7165 				 * re-unwiring what we have already unwired...
7166 				 * See vm_map_wire_nested().
7167 				 *
7168 				 * Or we could just ignore "in_transition"
7169 				 * here and proceed to decrement the wired
7170 				 * count(s) on this entry.  That should be fine
7171 				 * as long as "wired_count" doesn't drop all
7172 				 * the way to 0 (and we should panic if THAT
7173 				 * happens).
7174 				 */
7175 				panic("vm_map_unwire: in_transition entry");
7176 			}
7177 
7178 			entry = entry->vme_next;
7179 			continue;
7180 		}
7181 
7182 		if (entry->is_sub_map) {
7183 			vm_map_offset_t sub_start;
7184 			vm_map_offset_t sub_end;
7185 			vm_map_offset_t local_end;
7186 			pmap_t          pmap;
7187 
7188 			vm_map_clip_start(map, entry, start);
7189 			vm_map_clip_end(map, entry, end);
7190 
7191 			sub_start = VME_OFFSET(entry);
7192 			sub_end = entry->vme_end - entry->vme_start;
7193 			sub_end += VME_OFFSET(entry);
7194 			local_end = entry->vme_end;
7195 			if (map_pmap == NULL) {
7196 				if (entry->use_pmap) {
7197 					pmap = VME_SUBMAP(entry)->pmap;
7198 					pmap_addr = sub_start;
7199 				} else {
7200 					pmap = map->pmap;
7201 					pmap_addr = start;
7202 				}
7203 				if (entry->wired_count == 0 ||
7204 				    (user_wire && entry->user_wired_count == 0)) {
7205 					if (!user_wire) {
7206 						panic("vm_map_unwire: entry is unwired");
7207 					}
7208 					entry = entry->vme_next;
7209 					continue;
7210 				}
7211 
7212 				/*
7213 				 * Check for holes
7214 				 * Holes: Next entry should be contiguous unless
7215 				 * this is the end of the region.
7216 				 */
7217 				if (((entry->vme_end < end) &&
7218 				    ((entry->vme_next == vm_map_to_entry(map)) ||
7219 				    (entry->vme_next->vme_start
7220 				    > entry->vme_end)))) {
7221 					if (!user_wire) {
7222 						panic("vm_map_unwire: non-contiguous region");
7223 					}
7224 /*
7225  *                                       entry = entry->vme_next;
7226  *                                       continue;
7227  */
7228 				}
7229 
7230 				subtract_wire_counts(map, entry, user_wire);
7231 
7232 				if (entry->wired_count != 0) {
7233 					entry = entry->vme_next;
7234 					continue;
7235 				}
7236 
7237 				entry->in_transition = TRUE;
7238 				tmp_entry = *entry;/* see comment in vm_map_wire() */
7239 
7240 				/*
7241 				 * We can unlock the map now. The in_transition state
7242 				 * guarantees existence of the entry.
7243 				 */
7244 				vm_map_unlock(map);
7245 				vm_map_unwire_nested(VME_SUBMAP(entry),
7246 				    sub_start, sub_end, user_wire, pmap, pmap_addr);
7247 				vm_map_lock(map);
7248 
7249 				if (last_timestamp + 1 != map->timestamp) {
7250 					/*
7251 					 * Find the entry again.  It could have been
7252 					 * clipped or deleted after we unlocked the map.
7253 					 */
7254 					if (!vm_map_lookup_entry(map,
7255 					    tmp_entry.vme_start,
7256 					    &first_entry)) {
7257 						if (!user_wire) {
7258 							panic("vm_map_unwire: re-lookup failed");
7259 						}
7260 						entry = first_entry->vme_next;
7261 					} else {
7262 						entry = first_entry;
7263 					}
7264 				}
7265 				last_timestamp = map->timestamp;
7266 
7267 				/*
7268 				 * clear transition bit for all constituent entries
7269 				 * that were in the original entry (saved in
7270 				 * tmp_entry).  Also check for waiters.
7271 				 */
7272 				while ((entry != vm_map_to_entry(map)) &&
7273 				    (entry->vme_start < tmp_entry.vme_end)) {
7274 					assert(entry->in_transition);
7275 					entry->in_transition = FALSE;
7276 					if (entry->needs_wakeup) {
7277 						entry->needs_wakeup = FALSE;
7278 						need_wakeup = TRUE;
7279 					}
7280 					entry = entry->vme_next;
7281 				}
7282 				continue;
7283 			} else {
7284 				tmp_entry = *entry;
7285 				vm_map_unlock(map);
7286 				vm_map_unwire_nested(VME_SUBMAP(entry),
7287 				    sub_start, sub_end, user_wire, map_pmap,
7288 				    pmap_addr);
7289 				vm_map_lock(map);
7290 
7291 				if (last_timestamp + 1 != map->timestamp) {
7292 					/*
7293 					 * Find the entry again.  It could have been
7294 					 * clipped or deleted after we unlocked the map.
7295 					 */
7296 					if (!vm_map_lookup_entry(map,
7297 					    tmp_entry.vme_start,
7298 					    &first_entry)) {
7299 						if (!user_wire) {
7300 							panic("vm_map_unwire: re-lookup failed");
7301 						}
7302 						entry = first_entry->vme_next;
7303 					} else {
7304 						entry = first_entry;
7305 					}
7306 				}
7307 				last_timestamp = map->timestamp;
7308 			}
7309 		}
7310 
7311 
7312 		if ((entry->wired_count == 0) ||
7313 		    (user_wire && entry->user_wired_count == 0)) {
7314 			if (!user_wire) {
7315 				panic("vm_map_unwire: entry is unwired");
7316 			}
7317 
7318 			entry = entry->vme_next;
7319 			continue;
7320 		}
7321 
7322 		assert(entry->wired_count > 0 &&
7323 		    (!user_wire || entry->user_wired_count > 0));
7324 
7325 		vm_map_clip_start(map, entry, start);
7326 		vm_map_clip_end(map, entry, end);
7327 
7328 		/*
7329 		 * Check for holes
7330 		 * Holes: Next entry should be contiguous unless
7331 		 *	  this is the end of the region.
7332 		 */
7333 		if (((entry->vme_end < end) &&
7334 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7335 		    (entry->vme_next->vme_start > entry->vme_end)))) {
7336 			if (!user_wire) {
7337 				panic("vm_map_unwire: non-contiguous region");
7338 			}
7339 			entry = entry->vme_next;
7340 			continue;
7341 		}
7342 
7343 		subtract_wire_counts(map, entry, user_wire);
7344 
7345 		if (entry->wired_count != 0) {
7346 			entry = entry->vme_next;
7347 			continue;
7348 		}
7349 
7350 		if (entry->zero_wired_pages) {
7351 			entry->zero_wired_pages = FALSE;
7352 		}
7353 
7354 		entry->in_transition = TRUE;
7355 		tmp_entry = *entry;     /* see comment in vm_map_wire() */
7356 
7357 		/*
7358 		 * We can unlock the map now. The in_transition state
7359 		 * guarantees existence of the entry.
7360 		 */
7361 		vm_map_unlock(map);
7362 		if (map_pmap) {
7363 			vm_fault_unwire(map,
7364 			    &tmp_entry, FALSE, map_pmap, pmap_addr);
7365 		} else {
7366 			vm_fault_unwire(map,
7367 			    &tmp_entry, FALSE, map->pmap,
7368 			    tmp_entry.vme_start);
7369 		}
7370 		vm_map_lock(map);
7371 
7372 		if (last_timestamp + 1 != map->timestamp) {
7373 			/*
7374 			 * Find the entry again.  It could have been clipped
7375 			 * or deleted after we unlocked the map.
7376 			 */
7377 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7378 			    &first_entry)) {
7379 				if (!user_wire) {
7380 					panic("vm_map_unwire: re-lookup failed");
7381 				}
7382 				entry = first_entry->vme_next;
7383 			} else {
7384 				entry = first_entry;
7385 			}
7386 		}
7387 		last_timestamp = map->timestamp;
7388 
7389 		/*
7390 		 * clear transition bit for all constituent entries that
7391 		 * were in the original entry (saved in tmp_entry).  Also
7392 		 * check for waiters.
7393 		 */
7394 		while ((entry != vm_map_to_entry(map)) &&
7395 		    (entry->vme_start < tmp_entry.vme_end)) {
7396 			assert(entry->in_transition);
7397 			entry->in_transition = FALSE;
7398 			if (entry->needs_wakeup) {
7399 				entry->needs_wakeup = FALSE;
7400 				need_wakeup = TRUE;
7401 			}
7402 			entry = entry->vme_next;
7403 		}
7404 	}
7405 
7406 	/*
7407 	 * We might have fragmented the address space when we wired this
7408 	 * range of addresses.  Attempt to re-coalesce these VM map entries
7409 	 * with their neighbors now that they're no longer wired.
7410 	 * Under some circumstances, address space fragmentation can
7411 	 * prevent VM object shadow chain collapsing, which can cause
7412 	 * swap space leaks.
7413 	 */
7414 	vm_map_simplify_range(map, start, end);
7415 
7416 	vm_map_unlock(map);
7417 	/*
7418 	 * wake up anybody waiting on entries that we have unwired.
7419 	 */
7420 	if (need_wakeup) {
7421 		vm_map_entry_wakeup(map);
7422 	}
7423 	return KERN_SUCCESS;
7424 }
7425 
7426 kern_return_t
7427 vm_map_unwire(
7428 	vm_map_t                map,
7429 	vm_map_offset_t         start,
7430 	vm_map_offset_t         end,
7431 	boolean_t               user_wire)
7432 {
7433 	return vm_map_unwire_nested(map, start, end,
7434 	           user_wire, (pmap_t)NULL, 0);
7435 }
7436 
7437 
7438 /*
7439  *	vm_map_entry_zap:	[ internal use only ]
7440  *
7441  *	Remove the entry from the target map
7442  *	and put it on a zap list.
7443  */
7444 static void
7445 vm_map_entry_zap(
7446 	vm_map_t                map,
7447 	vm_map_entry_t          entry,
7448 	vm_map_zap_t            zap)
7449 {
7450 	vm_map_offset_t s, e;
7451 
7452 	s = entry->vme_start;
7453 	e = entry->vme_end;
7454 	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7455 	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7456 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7457 		assert(page_aligned(s));
7458 		assert(page_aligned(e));
7459 	}
7460 	if (entry->map_aligned == TRUE) {
7461 		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7462 		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7463 	}
7464 	assert(entry->wired_count == 0);
7465 	assert(entry->user_wired_count == 0);
7466 	assert(!entry->permanent);
7467 
7468 	vm_map_store_entry_unlink(map, entry);
7469 	map->size -= e - s;
7470 
7471 	vm_map_zap_append(zap, entry);
7472 }
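
/*
 * Sketch of the zap-list flow this helper participates in (see
 * vm_map_remove_and_unlock() below for the real sequence):
 *
 *	VM_MAP_ZAP_DECLARE(zap);
 *
 *	vm_map_lock(map);
 *	vm_map_delete(map, start, end, flags, guard, &zap);
 *	vm_map_unlock(map);
 *	vm_map_zap_dispose(&zap);
 *
 * Entries are unlinked onto the zap list while the map lock is held and
 * are only freed once the lock has been dropped.
 */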
7473 
7474 static void
7475 vm_map_submap_pmap_clean(
7476 	vm_map_t        map,
7477 	vm_map_offset_t start,
7478 	vm_map_offset_t end,
7479 	vm_map_t        sub_map,
7480 	vm_map_offset_t offset)
7481 {
7482 	vm_map_offset_t submap_start;
7483 	vm_map_offset_t submap_end;
7484 	vm_map_size_t   remove_size;
7485 	vm_map_entry_t  entry;
7486 
7487 	submap_end = offset + (end - start);
7488 	submap_start = offset;
7489 
7490 	vm_map_lock_read(sub_map);
7491 	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
7492 		remove_size = (entry->vme_end - entry->vme_start);
7493 		if (offset > entry->vme_start) {
7494 			remove_size -= offset - entry->vme_start;
7495 		}
7496 
7497 
7498 		if (submap_end < entry->vme_end) {
7499 			remove_size -=
7500 			    entry->vme_end - submap_end;
7501 		}
7502 		if (entry->is_sub_map) {
7503 			vm_map_submap_pmap_clean(
7504 				sub_map,
7505 				start,
7506 				start + remove_size,
7507 				VME_SUBMAP(entry),
7508 				VME_OFFSET(entry));
7509 		} else {
7510 			if (map->mapped_in_other_pmaps &&
7511 			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7512 			    VME_OBJECT(entry) != NULL) {
7513 				vm_object_pmap_protect_options(
7514 					VME_OBJECT(entry),
7515 					(VME_OFFSET(entry) +
7516 					offset -
7517 					entry->vme_start),
7518 					remove_size,
7519 					PMAP_NULL,
7520 					PAGE_SIZE,
7521 					entry->vme_start,
7522 					VM_PROT_NONE,
7523 					PMAP_OPTIONS_REMOVE);
7524 			} else {
7525 				pmap_remove(map->pmap,
7526 				    (addr64_t)start,
7527 				    (addr64_t)(start + remove_size));
7528 			}
7529 		}
7530 	}
7531 
7532 	entry = entry->vme_next;
7533 
7534 	while ((entry != vm_map_to_entry(sub_map))
7535 	    && (entry->vme_start < submap_end)) {
7536 		remove_size = (entry->vme_end - entry->vme_start);
7537 		if (submap_end < entry->vme_end) {
7538 			remove_size -= entry->vme_end - submap_end;
7539 		}
7540 		if (entry->is_sub_map) {
7541 			vm_map_submap_pmap_clean(
7542 				sub_map,
7543 				(start + entry->vme_start) - offset,
7544 				((start + entry->vme_start) - offset) + remove_size,
7545 				VME_SUBMAP(entry),
7546 				VME_OFFSET(entry));
7547 		} else {
7548 			if (map->mapped_in_other_pmaps &&
7549 			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7550 			    VME_OBJECT(entry) != NULL) {
7551 				vm_object_pmap_protect_options(
7552 					VME_OBJECT(entry),
7553 					VME_OFFSET(entry),
7554 					remove_size,
7555 					PMAP_NULL,
7556 					PAGE_SIZE,
7557 					entry->vme_start,
7558 					VM_PROT_NONE,
7559 					PMAP_OPTIONS_REMOVE);
7560 			} else {
7561 				pmap_remove(map->pmap,
7562 				    (addr64_t)((start + entry->vme_start)
7563 				    - offset),
7564 				    (addr64_t)(((start + entry->vme_start)
7565 				    - offset) + remove_size));
7566 			}
7567 		}
7568 		entry = entry->vme_next;
7569 	}
7570 	vm_map_unlock_read(sub_map);
7571 	return;
7572 }
7573 
7574 /*
7575  *     virt_memory_guard_ast:
7576  *
7577  *     Handle the AST callout for a virtual memory guard.
7578  *     Raise an EXC_GUARD exception and terminate the task
7579  *     if configured to do so.
7580  */
7581 void
7582 virt_memory_guard_ast(
7583 	thread_t thread,
7584 	mach_exception_data_type_t code,
7585 	mach_exception_data_type_t subcode)
7586 {
7587 	task_t task = get_threadtask(thread);
7588 	assert(task != kernel_task);
7589 	assert(task == current_task());
7590 	kern_return_t sync_exception_result;
7591 	uint32_t behavior;
7592 
7593 	behavior = task->task_exc_guard;
7594 
7595 	/* Is delivery enabled */
7596 	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7597 		return;
7598 	}
7599 
7600 	/* If only once, make sure we're that once */
7601 	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7602 		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7603 
7604 		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7605 			break;
7606 		}
7607 		behavior = task->task_exc_guard;
7608 		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7609 			return;
7610 		}
7611 	}
7612 
7613 	/* Raise exception synchronously and see if handler claimed it */
7614 	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode);
7615 
7616 	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7617 		/*
7618 		 * If Synchronous EXC_GUARD delivery was successful then
7619 		 * kill the process and return, else kill the process
7620 		 * and deliver the exception via EXC_CORPSE_NOTIFY.
7621 		 */
7622 		if (sync_exception_result == KERN_SUCCESS) {
7623 			task_bsdtask_kill(current_task());
7624 		} else {
7625 			exit_with_guard_exception(current_proc(), code, subcode);
7626 		}
7627 	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
7628 		/*
7629 		 * If the synchronous EXC_GUARD delivery was not successful,
7630 		 * raise a simulated crash.
7631 		 */
7632 		if (sync_exception_result != KERN_SUCCESS) {
7633 			task_violated_guard(code, subcode, NULL);
7634 		}
7635 	}
7636 }
7637 
7638 /*
7639  *     vm_map_guard_exception:
7640  *
7641  *     Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7642  *
7643  *     Right now, we do this when we find nothing mapped, or a
7644  *     gap in the mapping when a user address space deallocate
7645  *     was requested. We report the address of the first gap found.
7646  */
7647 static void
7648 vm_map_guard_exception(
7649 	vm_map_offset_t gap_start,
7650 	unsigned reason)
7651 {
7652 	mach_exception_code_t code = 0;
7653 	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7654 	unsigned int target = 0; /* should we pass in pid associated with map? */
7655 	mach_exception_data_type_t subcode = (uint64_t)gap_start;
7656 	boolean_t fatal = FALSE;
7657 
7658 	task_t task = current_task_early();
7659 
7660 	/* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7661 	if (task == NULL || task == kernel_task) {
7662 		return;
7663 	}
7664 
7665 	EXC_GUARD_ENCODE_TYPE(code, guard_type);
7666 	EXC_GUARD_ENCODE_FLAVOR(code, reason);
7667 	EXC_GUARD_ENCODE_TARGET(code, target);
7668 
7669 	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7670 		fatal = TRUE;
7671 	}
7672 	thread_guard_violation(current_thread(), code, subcode, fatal);
7673 }
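
/*
 * Worked example (sketch): for a deallocation that runs into an unmapped
 * gap, the exception payload built above amounts to:
 *
 *	code    = 0;
 *	EXC_GUARD_ENCODE_TYPE(code, GUARD_TYPE_VIRT_MEMORY);
 *	EXC_GUARD_ENCODE_FLAVOR(code, kGUARD_EXC_DEALLOC_GAP);
 *	EXC_GUARD_ENCODE_TARGET(code, 0);
 *	subcode = (uint64_t)gap_start;
 *
 * with the address of the first gap reported in the subcode.
 */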
7674 
7675 __abortlike
7676 static void
7677 __vm_map_delete_misaligned_panic(
7678 	vm_map_t                map,
7679 	vm_map_offset_t         start,
7680 	vm_map_offset_t         end)
7681 {
7682 	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
7683 	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
7684 }
7685 
7686 __abortlike
7687 static void
7688 __vm_map_delete_failed_panic(
7689 	vm_map_t                map,
7690 	vm_map_offset_t         start,
7691 	vm_map_offset_t         end,
7692 	kern_return_t           kr)
7693 {
7694 	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
7695 	    map, (uint64_t)start, (uint64_t)end, kr);
7696 }
7697 
7698 __abortlike
7699 static void
7700 __vm_map_delete_gap_panic(
7701 	vm_map_t                map,
7702 	vm_map_offset_t         where,
7703 	vm_map_offset_t         start,
7704 	vm_map_offset_t         end)
7705 {
7706 	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
7707 	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
7708 }
7709 
7710 __abortlike
7711 static void
7712 __vm_map_delete_permanent_panic(
7713 	vm_map_t                map,
7714 	vm_map_offset_t         start,
7715 	vm_map_offset_t         end,
7716 	vm_map_entry_t          entry)
7717 {
7718 	panic("vm_map_delete(%p,0x%llx,0x%llx): "
7719 	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
7720 	    map, (uint64_t)start, (uint64_t)end, entry,
7721 	    (uint64_t)entry->vme_start,
7722 	    (uint64_t)entry->vme_end);
7723 }
7724 
7725 __options_decl(vm_map_delete_state_t, uint32_t, {
7726 	VMDS_NONE               = 0x0000,
7727 
7728 	VMDS_FOUND_GAP          = 0x0001,
7729 	VMDS_GAPS_OK            = 0x0002,
7730 
7731 	VMDS_KERNEL_PMAP        = 0x0004,
7732 	VMDS_NEEDS_LOOKUP       = 0x0008,
7733 	VMDS_NEEDS_WAKEUP       = 0x0010,
7734 });
7735 
7736 /*
7737  *	vm_map_delete:	[ internal use only ]
7738  *
7739  *	Deallocates the given address range from the target map.
7740  *	Removes all user wirings. Unwires one kernel wiring if
7741  *	VM_MAP_REMOVE_KUNWIRE is set.  Waits for kernel wirings to go
7742  *	away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set.  Sleeps
7743  *	interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
7744  *
7745  *
7746  *	When VM_MAP_REMOVE_RETURN_ERRORS is not passed,
7747  *	then any error in removing mappings will lead to a panic
7748  *	so that clients do not have to repeat the panic code
7749  *	at each call site.  If VM_MAP_REMOVE_INTERRUPTIBLE
7750  *	is also passed, then KERN_ABORTED will not lead to a panic.
7751  *
7752  *	This routine is called with map locked and leaves map locked.
7753  */
7754 static kmem_return_t
7755 vm_map_delete(
7756 	vm_map_t                map,
7757 	vm_map_offset_t         start,
7758 	vm_map_offset_t         end,
7759 	vmr_flags_t             flags,
7760 	kmem_guard_t            guard,
7761 	vm_map_zap_t            zap_list)
7762 {
7763 	vm_map_entry_t          entry, next;
7764 	int                     interruptible;
7765 	vm_map_offset_t         gap_start = 0;
7766 	vm_map_offset_t         clear_in_transition_end = 0;
7767 	__unused vm_map_offset_t save_start = start;
7768 	__unused vm_map_offset_t save_end = end;
7769 	vm_map_delete_state_t   state = VMDS_NONE;
7770 	kmem_return_t           ret = { };
7771 
7772 	if (vm_map_pmap(map) == kernel_pmap) {
7773 		state |= VMDS_KERNEL_PMAP;
7774 	}
7775 
7776 	if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
7777 		state |= VMDS_GAPS_OK;
7778 	}
7779 
7780 	interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
7781 	    THREAD_ABORTSAFE : THREAD_UNINT;
7782 
7783 	if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
7784 	    (start & VM_MAP_PAGE_MASK(map))) {
7785 		__vm_map_delete_misaligned_panic(map, start, end);
7786 	}
7787 
7788 	if ((state & VMDS_GAPS_OK) == 0) {
7789 		/*
7790 		 * If the map isn't terminated then all deletions must have
7791 		 * no gaps, and be within the [min, max) of the map.
7792 		 *
7793 		 * We got here without VM_MAP_RANGE_CHECK() being called,
7794 		 * and hence must validate bounds manually.
7795 		 *
7796 		 * It is worth noting that because vm_deallocate() will
7797 		 * round_page() the deallocation size, it's possible for "end"
7798 		 * to be 0 here due to overflow. We hence must treat it as being
7799 		 * beyond vm_map_max(map).
7800 		 *
7801 		 * Similarly, end < start means some wraparound happened,
7802 		 * which should cause an error or panic.
7803 		 */
7804 		if (end == 0 || end > vm_map_max(map)) {
7805 			state |= VMDS_FOUND_GAP;
7806 			gap_start = vm_map_max(map);
7807 			if (state & VMDS_KERNEL_PMAP) {
7808 				__vm_map_delete_gap_panic(map,
7809 				    gap_start, start, end);
7810 			}
7811 			goto out;
7812 		}
7813 
7814 		if (end < start) {
7815 			if (state & VMDS_KERNEL_PMAP) {
7816 				__vm_map_delete_gap_panic(map,
7817 				    vm_map_max(map), start, end);
7818 			}
7819 			ret.kmr_return = KERN_INVALID_ARGUMENT;
7820 			goto out;
7821 		}
7822 
7823 		if (start < vm_map_min(map)) {
7824 			state |= VMDS_FOUND_GAP;
7825 			gap_start = start;
7826 			if (state & VMDS_KERNEL_PMAP) {
7827 				__vm_map_delete_gap_panic(map,
7828 				    gap_start, start, end);
7829 			}
7830 			goto out;
7831 		}
7832 	} else {
7833 		/*
7834 		 * If the map is terminated, we must accept start/end
7835 		 * being beyond the boundaries of the map as this is
7836 		 * how some of the mappings like commpage mappings
7837 		 * can be destroyed (they're outside of those bounds).
7838 		 *
7839 		 * end < start is still something we can't cope with,
7840 		 * so just bail.
7841 		 */
7842 		if (end < start) {
7843 			goto out;
7844 		}
7845 	}
7846 
7847 
7848 	/*
7849 	 *	Find the start of the region.
7850 	 *
7851 	 *	If in a superpage, extend the range
7852 	 *	to include the start of the mapping.
7853 	 */
7854 	while (vm_map_lookup_entry_or_next(map, start, &entry)) {
7855 		if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
7856 			start = SUPERPAGE_ROUND_DOWN(start);
7857 		} else {
7858 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
7859 			break;
7860 		}
7861 	}
7862 
7863 	if (entry->superpage_size) {
7864 		end = SUPERPAGE_ROUND_UP(end);
7865 	}
7866 
7867 	/*
7868 	 *	Step through all entries in this region
7869 	 */
7870 	for (vm_map_offset_t s = start; s < end;) {
7871 		/*
7872 		 * At this point, we have deleted all the memory entries
7873 		 * in [start, s) and are proceeding with the [s, end) range.
7874 		 *
7875 		 * This loop might drop the map lock, and it is possible that
7876 		 * some memory was already reallocated within [start, s)
7877 		 * and we don't want to mess with those entries.
7878 		 *
7879 		 * Some of those entries could even have been re-assembled
7880 		 * with an entry after "s" (in vm_map_simplify_entry()), so
7881 		 * we may have to vm_map_clip_start() again.
7882 		 *
7883 		 * When clear_in_transition_end is set, it means we had marked
7884 		 * [start, clear_in_transition_end) as "in_transition"
7885 		 * during a previous iteration and we need to clear it.
7886 		 */
7887 
7888 		/*
7889 		 * Step 1: If needed (because we dropped locks),
7890 		 *         lookup the entry again.
7891 		 *
7892 		 *         If we're coming back from unwiring (Step 5),
7893 		 *         we also need to mark the entries as no longer
7894 		 *         in transition after that.
7895 		 */
7896 
7897 		if (state & VMDS_NEEDS_LOOKUP) {
7898 			state &= ~VMDS_NEEDS_LOOKUP;
7899 
7900 			if (vm_map_lookup_entry_or_next(map, s, &entry)) {
7901 				SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
7902 			}
7903 		}
7904 
7905 		if (clear_in_transition_end) {
7906 			for (vm_map_entry_t it = entry;
7907 			    it != vm_map_to_entry(map) &&
7908 			    it->vme_start < clear_in_transition_end;
7909 			    it = it->vme_next) {
7910 				assert(it->in_transition);
7911 				it->in_transition = FALSE;
7912 				if (it->needs_wakeup) {
7913 					it->needs_wakeup = FALSE;
7914 					state |= VMDS_NEEDS_WAKEUP;
7915 				}
7916 			}
7917 
7918 			clear_in_transition_end = 0;
7919 		}
7920 
7921 
7922 		/*
7923 		 * Step 2: Perform various policy checks
7924 		 *         before we do _anything_ to this entry.
7925 		 */
7926 
7927 		if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
7928 			if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
7929 				/*
7930 				 * Either we found a gap already,
7931 				 * or we are tearing down a map,
7932 				 * keep going.
7933 				 */
7934 			} else if (state & VMDS_KERNEL_PMAP) {
7935 				__vm_map_delete_gap_panic(map, s, start, end);
7936 			} else if (vm_map_round_page(s, VM_MAP_PAGE_MASK(map)) < end) {
7937 				/*
7938 				 * The vm_map_round_page() is needed since an entry
7939 				 * can be less than VM_MAP_PAGE_MASK() sized.
7940 				 *
7941 				 * For example, devices which have h/w 4K pages,
7942 				 * but entry sizes are all now 16K.
7943 				 */
7944 				state |= VMDS_FOUND_GAP;
7945 				gap_start = s;
7946 			}
7947 
7948 			if (entry == vm_map_to_entry(map) ||
7949 			    end <= entry->vme_start) {
7950 				break;
7951 			}
7952 
7953 			s = entry->vme_start;
7954 		}
7955 
7956 		if (state & VMDS_KERNEL_PMAP) {
7957 			/*
7958 			 * In the kernel map and its submaps,
7959 			 * permanent entries never die, even
7960 			 * if VM_MAP_REMOVE_IMMUTABLE is passed.
7961 			 */
7962 			if (entry->permanent) {
7963 				__vm_map_delete_permanent_panic(map, start, end, entry);
7964 			}
7965 
7966 			if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
7967 				end = entry->vme_end;
7968 				flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
7969 			}
7970 
7971 			/*
7972 			 * In the kernel map and its submaps,
7973 			 * the removal of an atomic/guarded entry is strict.
7974 			 *
7975 			 * An atomic entry is processed only if it was
7976 			 * specifically targeted.
7977 			 *
7978 			 * We might have deleted non-atomic entries before
7979 			 * we reach this point, however...
7980 			 */
7981 			kmem_entry_validate_guard(map, entry,
7982 			    start, end - start, guard);
7983 		}
7984 
7985 
7986 		/*
7987 		 * Step 3: Perform any clipping needed.
7988 		 *
7989 		 *         After this, "entry" starts at "s", ends before "end"
7990 		 */
7991 
7992 		if (entry->vme_start < s) {
7993 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
7994 			    entry->map_aligned &&
7995 			    !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
7996 				/*
7997 				 * The entry will no longer be map-aligned
7998 				 * after clipping and the caller said it's OK.
7999 				 */
8000 				entry->map_aligned = FALSE;
8001 			}
8002 			vm_map_clip_start(map, entry, s);
8003 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8004 		}
8005 
8006 		if (end < entry->vme_end) {
8007 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8008 			    entry->map_aligned &&
8009 			    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8010 				/*
8011 				 * The entry will no longer be map-aligned
8012 				 * after clipping and the caller said it's OK.
8013 				 */
8014 				entry->map_aligned = FALSE;
8015 			}
8016 			vm_map_clip_end(map, entry, end);
8017 		}
8018 
8019 		assert(s == entry->vme_start);
8020 		assert(entry->vme_end <= end);
8021 
8022 
8023 		/*
8024 		 * Step 4: If the entry is in flux, wait for this to resolve.
8025 		 */
8026 
8027 		if (entry->in_transition) {
8028 			wait_result_t wait_result;
8029 
8030 			/*
8031 			 * Another thread is wiring/unwiring this entry.
8032 			 * Let the other thread know we are waiting.
8033 			 */
8034 
8035 			entry->needs_wakeup = TRUE;
8036 
8037 			/*
8038 			 * wake up anybody waiting on entries that we have
8039 			 * already unwired/deleted.
8040 			 */
8041 			if (state & VMDS_NEEDS_WAKEUP) {
8042 				vm_map_entry_wakeup(map);
8043 				state &= ~VMDS_NEEDS_WAKEUP;
8044 			}
8045 
8046 			wait_result = vm_map_entry_wait(map, interruptible);
8047 
8048 			if (interruptible &&
8049 			    wait_result == THREAD_INTERRUPTED) {
8050 				/*
8051 				 * We do not clear the needs_wakeup flag,
8052 				 * since we cannot tell if we were the only one.
8053 				 */
8054 				ret.kmr_return = KERN_ABORTED;
8055 				return ret;
8056 			}
8057 
8058 			/*
8059 			 * The entry could have been clipped or it
8060 			 * may not exist anymore.  Look it up again.
8061 			 */
8062 			state |= VMDS_NEEDS_LOOKUP;
8063 			continue;
8064 		}
8065 
8066 
8067 		/*
8068 		 * Step 5: Handle wiring
8069 		 */
8070 
8071 		if (entry->wired_count) {
8072 			struct vm_map_entry tmp_entry;
8073 			boolean_t           user_wire;
8074 			unsigned int        last_timestamp;
8075 
8076 			user_wire = entry->user_wired_count > 0;
8077 
8078 			/*
8079 			 *      Remove a kernel wiring if requested
8080 			 */
8081 			if (flags & VM_MAP_REMOVE_KUNWIRE) {
8082 				entry->wired_count--;
8083 			}
8084 
8085 			/*
8086 			 *	Remove all user wirings for proper accounting
8087 			 */
8088 			while (entry->user_wired_count) {
8089 				subtract_wire_counts(map, entry, user_wire);
8090 			}
8091 
8092 			/*
8093 			 * All our DMA I/O operations in IOKit are currently
8094 			 * done by wiring through the map entries of the task
8095 			 * requesting the I/O.
8096 			 *
8097 			 * Because of this, we must always wait for kernel wirings
8098 			 * to go away on the entries before deleting them.
8099 			 *
8100 			 * Any caller who wants to actually remove a kernel wiring
8101 			 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8102 			 * properly remove one wiring instead of blasting through
8103 			 * them all.
8104 			 */
8105 			if (entry->wired_count != 0) {
8106 				assert(map != kernel_map);
8107 				/*
8108 				 * Cannot continue.  Typical case is when
8109 				 * a user thread has physical I/O pending
8110 				 * on this page.  Either wait for the
8111 				 * kernel wiring to go away or return an
8112 				 * error.
8113 				 */
8114 				wait_result_t wait_result;
8115 
8116 				entry->needs_wakeup = TRUE;
8117 				wait_result = vm_map_entry_wait(map,
8118 				    interruptible);
8119 
8120 				if (interruptible &&
8121 				    wait_result == THREAD_INTERRUPTED) {
8122 					/*
8123 					 * We do not clear the
8124 					 * needs_wakeup flag, since we
8125 					 * cannot tell if we were the
8126 					 * only one.
8127 					 */
8128 					ret.kmr_return = KERN_ABORTED;
8129 					return ret;
8130 				}
8131 
8132 
8133 				/*
8134 				 * The entry could have been clipped or
8135 				 * it may not exist anymore.  Look it
8136 				 * up again.
8137 				 */
8138 				state |= VMDS_NEEDS_LOOKUP;
8139 				continue;
8140 			}
8141 
8142 			/*
8143 			 * We can unlock the map now.
8144 			 *
8145 			 * The entry might be split once we unlock the map,
8146 			 * but we need the range as defined by this entry
8147 			 * to be stable. So we must make a local copy.
8148 			 *
8149 			 * The underlying objects do not change during clips,
8150 			 * and the in_transition state guarantees existence
8151 			 * of the entry.
8152 			 */
8153 			last_timestamp = map->timestamp;
8154 			entry->in_transition = TRUE;
8155 			tmp_entry = *entry;
8156 			vm_map_unlock(map);
8157 
8158 			if (tmp_entry.is_sub_map) {
8159 				vm_map_t sub_map;
8160 				vm_map_offset_t sub_start, sub_end;
8161 				pmap_t pmap;
8162 				vm_map_offset_t pmap_addr;
8163 
8164 
8165 				sub_map = VME_SUBMAP(&tmp_entry);
8166 				sub_start = VME_OFFSET(&tmp_entry);
8167 				sub_end = sub_start + (tmp_entry.vme_end -
8168 				    tmp_entry.vme_start);
8169 				if (tmp_entry.use_pmap) {
8170 					pmap = sub_map->pmap;
8171 					pmap_addr = tmp_entry.vme_start;
8172 				} else {
8173 					pmap = map->pmap;
8174 					pmap_addr = tmp_entry.vme_start;
8175 				}
8176 				(void) vm_map_unwire_nested(sub_map,
8177 				    sub_start, sub_end,
8178 				    user_wire,
8179 				    pmap, pmap_addr);
8180 			} else {
8181 				if (tmp_entry.vme_kernel_object) {
8182 					pmap_protect_options(
8183 						map->pmap,
8184 						tmp_entry.vme_start,
8185 						tmp_entry.vme_end,
8186 						VM_PROT_NONE,
8187 						PMAP_OPTIONS_REMOVE,
8188 						NULL);
8189 				}
8190 				vm_fault_unwire(map, &tmp_entry,
8191 				    tmp_entry.vme_kernel_object,
8192 				    map->pmap, tmp_entry.vme_start);
8193 			}
8194 
8195 			vm_map_lock(map);
8196 
8197 			/*
8198 			 * Unwiring happened, we can now go back to deleting
8199 			 * them (after we clear the in_transition bit for the range).
8200 			 */
8201 			if (last_timestamp + 1 != map->timestamp) {
8202 				state |= VMDS_NEEDS_LOOKUP;
8203 			}
8204 			clear_in_transition_end = tmp_entry.vme_end;
8205 			continue;
8206 		}
8207 
8208 		assert(entry->wired_count == 0);
8209 		assert(entry->user_wired_count == 0);
8210 
8211 
8212 		/*
8213 		 * Step 6: Entry is unwired and ready for us to delete !
8214 		 */
8215 
8216 		if (!entry->permanent) {
8217 			/*
8218 			 * Typical case: the entry really shouldn't be permanent
8219 			 */
8220 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8221 #if 0
8222 			printf("FBDP %d[%s] removing permanent entry "
8223 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8224 			    proc_selfpid(),
8225 			    (current_task()->bsd_info
8226 			    ? proc_name_address(current_task()->bsd_info)
8227 			    : "?"), entry,
8228 			    (uint64_t)entry->vme_start,
8229 			    (uint64_t)entry->vme_end,
8230 			    entry->protection,
8231 			    entry->max_protection);
8232 #endif
8233 			entry->permanent = FALSE;
8234 		} else {
8235 			/*
8236 			 * dtrace -n 'vm_map_delete_permanent {
8237 			 *     printf("start=0x%llx end=0x%llx prot=0x%x/0x%x\n", arg0, arg1, arg2, arg3);
8238 			 *     stack();
8239 			 *     ustack();
8240 			 * }'
8241 			 */
8242 			DTRACE_VM5(vm_map_delete_permanent,
8243 			    vm_map_offset_t, entry->vme_start,
8244 			    vm_map_offset_t, entry->vme_end,
8245 			    vm_prot_t, entry->protection,
8246 			    vm_prot_t, entry->max_protection,
8247 			    int, VME_ALIAS(entry));
8248 		}
8249 
8250 		if (entry->is_sub_map) {
8251 			assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8252 			    "map %p (%d) entry %p submap %p (%d)\n",
8253 			    map, VM_MAP_PAGE_SHIFT(map), entry,
8254 			    VME_SUBMAP(entry),
8255 			    VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8256 			if (entry->use_pmap) {
8257 #ifndef NO_NESTED_PMAP
8258 				int pmap_flags;
8259 
8260 				if (map->terminated) {
8261 					/*
8262 					 * This is the final cleanup of the
8263 					 * address space being terminated.
8264 					 * No new mappings are expected and
8265 					 * we don't really need to unnest the
8266 					 * shared region (and lose the "global"
8267 					 * pmap mappings, if applicable).
8268 					 *
8269 					 * Tell the pmap layer that we're
8270 					 * "clean" wrt nesting.
8271 					 */
8272 					pmap_flags = PMAP_UNNEST_CLEAN;
8273 				} else {
8274 					/*
8275 					 * We're unmapping part of the nested
8276 					 * shared region, so we can't keep the
8277 					 * nested pmap.
8278 					 */
8279 					pmap_flags = 0;
8280 				}
8281 				pmap_unnest_options(
8282 					map->pmap,
8283 					(addr64_t)entry->vme_start,
8284 					entry->vme_end - entry->vme_start,
8285 					pmap_flags);
8286 #endif  /* NO_NESTED_PMAP */
8287 				if (map->mapped_in_other_pmaps &&
8288 				    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8289 					/* clean up parent map/maps */
8290 					vm_map_submap_pmap_clean(
8291 						map, entry->vme_start,
8292 						entry->vme_end,
8293 						VME_SUBMAP(entry),
8294 						VME_OFFSET(entry));
8295 				}
8296 			} else {
8297 				vm_map_submap_pmap_clean(
8298 					map, entry->vme_start, entry->vme_end,
8299 					VME_SUBMAP(entry),
8300 					VME_OFFSET(entry));
8301 			}
8302 		} else if (entry->vme_kernel_object ||
8303 		    VME_OBJECT(entry) == compressor_object) {
8304 			/*
8305 			 * nothing to do
8306 			 */
8307 		} else if (map->mapped_in_other_pmaps &&
8308 		    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8309 			vm_object_pmap_protect_options(
8310 				VME_OBJECT(entry), VME_OFFSET(entry),
8311 				entry->vme_end - entry->vme_start,
8312 				PMAP_NULL,
8313 				PAGE_SIZE,
8314 				entry->vme_start,
8315 				VM_PROT_NONE,
8316 				PMAP_OPTIONS_REMOVE);
8317 		} else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8318 		    (state & VMDS_KERNEL_PMAP)) {
8319 			/* Remove translations associated
8320 			 * with this range, unless the entry
8321 			 * has no object and this is neither
8322 			 * the kernel map nor a descendant.
8323 			 * For the kernel map, remove them even
8324 			 * without an object: the platform could
8325 			 * potentially create "backdoor" mappings
8326 			 * invisible to the VM. It is expected
8327 			 * that objectless, non-kernel ranges do
8328 			 * not have such VM-invisible translations.
8329 			 */
8330 			pmap_remove_options(map->pmap,
8331 			    (addr64_t)entry->vme_start,
8332 			    (addr64_t)entry->vme_end,
8333 			    PMAP_OPTIONS_REMOVE);
8334 		}
8335 
8336 #if DEBUG
8337 		/*
8338 		 * All pmap mappings for this map entry must have been
8339 		 * cleared by now.
8340 		 */
8341 		assert(pmap_is_empty(map->pmap,
8342 		    entry->vme_start,
8343 		    entry->vme_end));
8344 #endif /* DEBUG */
8345 
8346 		if (entry->iokit_acct) {
8347 			/* alternate accounting */
8348 			DTRACE_VM4(vm_map_iokit_unmapped_region,
8349 			    vm_map_t, map,
8350 			    vm_map_offset_t, entry->vme_start,
8351 			    vm_map_offset_t, entry->vme_end,
8352 			    int, VME_ALIAS(entry));
8353 			vm_map_iokit_unmapped_region(map,
8354 			    (entry->vme_end -
8355 			    entry->vme_start));
8356 			entry->iokit_acct = FALSE;
8357 			entry->use_pmap = FALSE;
8358 		}
8359 
8360 		s = entry->vme_end;
8361 		next = entry->vme_next;
8362 		ret.kmr_size += entry->vme_end - entry->vme_start;
8363 
8364 		if (entry->permanent) {
8365 			/*
8366 			 * A permanent entry can not be removed, so leave it
8367 			 * in place but remove all access permissions.
8368 			 */
8369 			entry->protection = VM_PROT_NONE;
8370 			entry->max_protection = VM_PROT_NONE;
8371 		} else {
8372 			vm_map_entry_zap(map, entry, zap_list);
8373 		}
8374 
8375 		entry = next;
8376 
8377 		if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8378 			unsigned int last_timestamp = map->timestamp++;
8379 
8380 			if (lck_rw_lock_yield_exclusive(&map->lock,
8381 			    LCK_RW_YIELD_ANY_WAITER)) {
8382 				if (last_timestamp != map->timestamp + 1) {
8383 					state |= VMDS_NEEDS_LOOKUP;
8384 				}
8385 			} else {
8386 				/* we didn't yield, undo our change */
8387 				map->timestamp--;
8388 			}
8389 		}
8390 	}
8391 
8392 	if (map->wait_for_space) {
8393 		thread_wakeup((event_t) map);
8394 	}
8395 
8396 	if (state & VMDS_NEEDS_WAKEUP) {
8397 		vm_map_entry_wakeup(map);
8398 	}
8399 
8400 out:
8401 	if ((flags & VM_MAP_REMOVE_RETURN_ERRORS) == 0 && ret.kmr_return) {
8402 		__vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8403 	}
8404 
8405 	if (state & VMDS_FOUND_GAP) {
8406 		DTRACE_VM3(kern_vm_deallocate_gap,
8407 		    vm_map_offset_t, gap_start,
8408 		    vm_map_offset_t, save_start,
8409 		    vm_map_offset_t, save_end);
8410 		if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8411 			ret.kmr_return = KERN_INVALID_VALUE;
8412 		} else {
8413 			vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8414 		}
8415 	}
8416 
8417 	return ret;
8418 }
8419 
8420 kmem_return_t
8421 vm_map_remove_and_unlock(
8422 	vm_map_t        map,
8423 	vm_map_offset_t start,
8424 	vm_map_offset_t end,
8425 	vmr_flags_t     flags,
8426 	kmem_guard_t    guard)
8427 {
8428 	kmem_return_t ret;
8429 	VM_MAP_ZAP_DECLARE(zap);
8430 
8431 	ret = vm_map_delete(map, start, end, flags, guard, &zap);
8432 	vm_map_unlock(map);
8433 
8434 	vm_map_zap_dispose(&zap);
8435 
8436 	return ret;
8437 }
8438 
8439 /*
8440  *	vm_map_remove_guard:
8441  *
8442  *	Remove the given address range from the target map.
8443  *	This is the exported form of vm_map_delete.
8444  */
8445 kmem_return_t
8446 vm_map_remove_guard(
8447 	vm_map_t        map,
8448 	vm_map_offset_t start,
8449 	vm_map_offset_t end,
8450 	vmr_flags_t     flags,
8451 	kmem_guard_t    guard)
8452 {
8453 	vm_map_lock(map);
8454 	return vm_map_remove_and_unlock(map, start, end, flags, guard);
8455 }
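
/*
 * Minimal caller sketch (assumes no guard semantics are required):
 *
 *	kmem_return_t kmr;
 *
 *	kmr = vm_map_remove_guard(map, start, end,
 *	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
 *
 * With VM_MAP_REMOVE_NO_FLAGS, removal errors panic inside vm_map_delete()
 * rather than being returned, so kmr.kmr_return is normally KERN_SUCCESS
 * here; kmr.kmr_size reports how much address space was removed.
 */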
8456 
8457 /*
8458  *	vm_map_terminate:
8459  *
8460  *	Clean out a task's map.
8461  */
8462 kern_return_t
8463 vm_map_terminate(
8464 	vm_map_t        map)
8465 {
8466 	vm_map_lock(map);
8467 	map->terminated = TRUE;
8468 	vm_map_disable_hole_optimization(map);
8469 	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
8470 	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
8471 	return KERN_SUCCESS;
8472 }
8473 
8474 
8475 /*
8476  *	Routine:	vm_map_copy_allocate
8477  *
8478  *	Description:
8479  *		Allocates and initializes a map copy object.
8480  */
8481 static vm_map_copy_t
8482 vm_map_copy_allocate(void)
8483 {
8484 	vm_map_copy_t new_copy;
8485 
8486 	new_copy = zalloc_flags(vm_map_copy_zone, Z_WAITOK | Z_ZERO);
8487 	new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
8488 	vm_map_copy_first_entry(new_copy) = vm_map_copy_to_entry(new_copy);
8489 	vm_map_copy_last_entry(new_copy) = vm_map_copy_to_entry(new_copy);
8490 	return new_copy;
8491 }
8492 
8493 /*
8494  *	Routine:	vm_map_copy_discard
8495  *
8496  *	Description:
8497  *		Dispose of a map copy object (returned by
8498  *		vm_map_copyin).
8499  */
8500 void
8501 vm_map_copy_discard(
8502 	vm_map_copy_t   copy)
8503 {
8504 	if (copy == VM_MAP_COPY_NULL) {
8505 		return;
8506 	}
8507 
8508 	/*
8509 	 * Assert that the vm_map_copy is coming from the right
8510 	 * zone and hasn't been forged
8511 	 */
8512 	vm_map_copy_require(copy);
8513 
8514 	switch (copy->type) {
8515 	case VM_MAP_COPY_ENTRY_LIST:
8516 		while (vm_map_copy_first_entry(copy) !=
8517 		    vm_map_copy_to_entry(copy)) {
8518 			vm_map_entry_t  entry = vm_map_copy_first_entry(copy);
8519 
8520 			vm_map_copy_entry_unlink(copy, entry);
8521 			if (entry->is_sub_map) {
8522 				vm_map_deallocate(VME_SUBMAP(entry));
8523 			} else {
8524 				vm_object_deallocate(VME_OBJECT(entry));
8525 			}
8526 			vm_map_copy_entry_dispose(entry);
8527 		}
8528 		break;
8529 	case VM_MAP_COPY_OBJECT:
8530 		vm_object_deallocate(copy->cpy_object);
8531 		break;
8532 	case VM_MAP_COPY_KERNEL_BUFFER:
8533 
8534 		/*
8535 		 * The vm_map_copy_t and possibly the data buffer were
8536 		 * allocated by a single call to kalloc_data(), i.e. the
8537 		 * vm_map_copy_t was not allocated out of the zone.
8538 		 */
8539 		if (copy->size > msg_ool_size_small || copy->offset) {
8540 			panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
8541 			    (long long)copy->size, (long long)copy->offset);
8542 		}
8543 		kfree_data(copy->cpy_kdata, copy->size);
8544 	}
8545 	zfree(vm_map_copy_zone, copy);
8546 }
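
/*
 * Lifecycle sketch (hypothetical caller): a copy object produced by
 * vm_map_copyin() must either be consumed by a routine that takes
 * ownership of it (e.g. vm_map_copyout()) or be discarded explicitly:
 *
 *	kr = vm_map_copyin(src_map, src_addr, len, FALSE, &copy);
 *	if (kr == KERN_SUCCESS && !deliver(copy)) {
 *		vm_map_copy_discard(copy);
 *	}
 *
 * "deliver" is a stand-in for whatever consumes the copy on success.
 */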
8547 
8548 /*
8549  *	Routine:	vm_map_copy_copy
8550  *
8551  *	Description:
8552  *			Move the information in a map copy object to
8553  *			a new map copy object, leaving the old one
8554  *			empty.
8555  *
8556  *			This is used by kernel routines that need
8557  *			to look at out-of-line data (in copyin form)
8558  *			before deciding whether to return SUCCESS.
8559  *			If the routine returns FAILURE, the original
8560  *			copy object will be deallocated; therefore,
8561  *			these routines must make a copy of the copy
8562  *			object and leave the original empty so that
8563  *			deallocation will not fail.
8564  */
8565 vm_map_copy_t
8566 vm_map_copy_copy(
8567 	vm_map_copy_t   copy)
8568 {
8569 	vm_map_copy_t   new_copy;
8570 
8571 	if (copy == VM_MAP_COPY_NULL) {
8572 		return VM_MAP_COPY_NULL;
8573 	}
8574 
8575 	/*
8576 	 * Assert that the vm_map_copy is coming from the right
8577 	 * zone and hasn't been forged
8578 	 */
8579 	vm_map_copy_require(copy);
8580 
8581 	/*
8582 	 * Allocate a new copy object, and copy the information
8583 	 * from the old one into it.
8584 	 */
8585 
8586 	new_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
8587 	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
8588 #if __has_feature(ptrauth_calls)
8589 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
8590 		new_copy->cpy_kdata = copy->cpy_kdata;
8591 	}
8592 #endif
8593 
8594 	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
8595 		/*
8596 		 * The links in the entry chain must be
8597 		 * changed to point to the new copy object.
8598 		 */
8599 		vm_map_copy_first_entry(copy)->vme_prev
8600 		        = vm_map_copy_to_entry(new_copy);
8601 		vm_map_copy_last_entry(copy)->vme_next
8602 		        = vm_map_copy_to_entry(new_copy);
8603 	}
8604 
8605 	/*
8606 	 * Change the old copy object into one that contains
8607 	 * nothing to be deallocated.
8608 	 */
8609 	copy->type = VM_MAP_COPY_OBJECT;
8610 	copy->cpy_object = VM_OBJECT_NULL;
8611 
8612 	/*
8613 	 * Return the new object.
8614 	 */
8615 	return new_copy;
8616 }
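
/*
 * Illustrative sketch (not part of the build) of the pattern described
 * above: a kernel routine that must inspect out-of-line data before
 * deciding whether to succeed can duplicate the copy object first, so
 * that the original (now empty) object can still be deallocated safely
 * on failure.  "examine_data" is a hypothetical helper, not an XNU
 * routine.
 *
 *	vm_map_copy_t   saved;
 *	kern_return_t   kr;
 *
 *	saved = vm_map_copy_copy(orig);		// "orig" is now empty
 *	kr = examine_data(saved);
 *	if (kr != KERN_SUCCESS) {
 *		vm_map_copy_discard(saved);	// we own the duplicate
 *		return kr;			// caller can still discard "orig" harmlessly
 *	}
 *	// ... proceed, eventually consuming or discarding "saved" ...
 */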
8617 
8618 static boolean_t
8619 vm_map_entry_is_overwritable(
8620 	vm_map_t        dst_map __unused,
8621 	vm_map_entry_t  entry)
8622 {
8623 	if (!(entry->protection & VM_PROT_WRITE)) {
8624 		/* can't overwrite if not writable */
8625 		return FALSE;
8626 	}
8627 #if !__x86_64__
8628 	if (entry->used_for_jit &&
8629 	    vm_map_cs_enforcement(dst_map) &&
8630 	    !dst_map->cs_debugged) {
8631 		/*
8632 		 * Can't overwrite a JIT region while cs_enforced
8633 		 * and not cs_debugged.
8634 		 */
8635 		return FALSE;
8636 	}
8637 #endif /* !__x86_64__ */
8638 	return TRUE;
8639 }
8640 
8641 static kern_return_t
8642 vm_map_overwrite_submap_recurse(
8643 	vm_map_t        dst_map,
8644 	vm_map_offset_t dst_addr,
8645 	vm_map_size_t   dst_size)
8646 {
8647 	vm_map_offset_t dst_end;
8648 	vm_map_entry_t  tmp_entry;
8649 	vm_map_entry_t  entry;
8650 	kern_return_t   result;
8651 	boolean_t       encountered_sub_map = FALSE;
8652 
8653 
8654 
8655 	/*
8656 	 *	Verify that the destination is all writeable
8657 	 *	initially.  We have to trunc the destination
8658 	 *	address and round the copy size or we'll end up
8659 	 *	splitting entries in strange ways.
8660 	 */
8661 
8662 	dst_end = vm_map_round_page(dst_addr + dst_size,
8663 	    VM_MAP_PAGE_MASK(dst_map));
8664 	vm_map_lock(dst_map);
8665 
8666 start_pass_1:
8667 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
8668 		vm_map_unlock(dst_map);
8669 		return KERN_INVALID_ADDRESS;
8670 	}
8671 
8672 	vm_map_clip_start(dst_map,
8673 	    tmp_entry,
8674 	    vm_map_trunc_page(dst_addr,
8675 	    VM_MAP_PAGE_MASK(dst_map)));
8676 	if (tmp_entry->is_sub_map) {
8677 		/* clipping did unnest if needed */
8678 		assert(!tmp_entry->use_pmap);
8679 	}
8680 
8681 	for (entry = tmp_entry;;) {
8682 		vm_map_entry_t  next;
8683 
8684 		next = entry->vme_next;
8685 		while (entry->is_sub_map) {
8686 			vm_map_offset_t sub_start;
8687 			vm_map_offset_t sub_end;
8688 			vm_map_offset_t local_end;
8689 
8690 			if (entry->in_transition) {
8691 				/*
8692 				 * Say that we are waiting, and wait for entry.
8693 				 */
8694 				entry->needs_wakeup = TRUE;
8695 				vm_map_entry_wait(dst_map, THREAD_UNINT);
8696 
8697 				goto start_pass_1;
8698 			}
8699 
8700 			encountered_sub_map = TRUE;
8701 			sub_start = VME_OFFSET(entry);
8702 
8703 			if (entry->vme_end < dst_end) {
8704 				sub_end = entry->vme_end;
8705 			} else {
8706 				sub_end = dst_end;
8707 			}
8708 			sub_end -= entry->vme_start;
8709 			sub_end += VME_OFFSET(entry);
8710 			local_end = entry->vme_end;
8711 			vm_map_unlock(dst_map);
8712 
8713 			result = vm_map_overwrite_submap_recurse(
8714 				VME_SUBMAP(entry),
8715 				sub_start,
8716 				sub_end - sub_start);
8717 
8718 			if (result != KERN_SUCCESS) {
8719 				return result;
8720 			}
8721 			if (dst_end <= entry->vme_end) {
8722 				return KERN_SUCCESS;
8723 			}
8724 			vm_map_lock(dst_map);
8725 			if (!vm_map_lookup_entry(dst_map, local_end,
8726 			    &tmp_entry)) {
8727 				vm_map_unlock(dst_map);
8728 				return KERN_INVALID_ADDRESS;
8729 			}
8730 			entry = tmp_entry;
8731 			next = entry->vme_next;
8732 		}
8733 
8734 		if (!(entry->protection & VM_PROT_WRITE)) {
8735 			vm_map_unlock(dst_map);
8736 			return KERN_PROTECTION_FAILURE;
8737 		}
8738 
8739 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
8740 			vm_map_unlock(dst_map);
8741 			return KERN_PROTECTION_FAILURE;
8742 		}
8743 
8744 		/*
8745 		 *	If the entry is in transition, we must wait
8746 		 *	for it to exit that state.  Anything could happen
8747 		 *	when we unlock the map, so start over.
8748 		 */
8749 		if (entry->in_transition) {
8750 			/*
8751 			 * Say that we are waiting, and wait for entry.
8752 			 */
8753 			entry->needs_wakeup = TRUE;
8754 			vm_map_entry_wait(dst_map, THREAD_UNINT);
8755 
8756 			goto start_pass_1;
8757 		}
8758 
8759 /*
8760  *		our range is contained completely within this map entry
8761  */
8762 		if (dst_end <= entry->vme_end) {
8763 			vm_map_unlock(dst_map);
8764 			return KERN_SUCCESS;
8765 		}
8766 /*
8767  *		check that range specified is contiguous region
8768  */
8769 		if ((next == vm_map_to_entry(dst_map)) ||
8770 		    (next->vme_start != entry->vme_end)) {
8771 			vm_map_unlock(dst_map);
8772 			return KERN_INVALID_ADDRESS;
8773 		}
8774 
8775 		/*
8776 		 *	Check for permanent objects in the destination.
8777 		 */
8778 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
8779 		    ((!VME_OBJECT(entry)->internal) ||
8780 		    (VME_OBJECT(entry)->true_share))) {
8781 			if (encountered_sub_map) {
8782 				vm_map_unlock(dst_map);
8783 				return KERN_FAILURE;
8784 			}
8785 		}
8786 
8787 
8788 		entry = next;
8789 	}/* for */
8790 	vm_map_unlock(dst_map);
8791 	return KERN_SUCCESS;
8792 }
8793 
8794 /*
8795  *	Routine:	vm_map_copy_overwrite
8796  *
8797  *	Description:
8798  *		Copy the memory described by the map copy
8799  *		object (copy; returned by vm_map_copyin) onto
8800  *		the specified destination region (dst_map, dst_addr).
8801  *		The destination must be writeable.
8802  *
8803  *		Unlike vm_map_copyout, this routine actually
8804  *		writes over previously-mapped memory.  If the
8805  *		previous mapping was to a permanent (user-supplied)
8806  *		memory object, it is preserved.
8807  *
8808  *		The attributes (protection and inheritance) of the
8809  *		destination region are preserved.
8810  *
8811  *		If successful, consumes the copy object.
8812  *		Otherwise, the caller is responsible for it.
8813  *
8814  *	Implementation notes:
8815  *		To overwrite aligned temporary virtual memory, it is
8816  *		sufficient to remove the previous mapping and insert
8817  *		the new copy.  This replacement is done either on
8818  *		the whole region (if no permanent virtual memory
8819  *		objects are embedded in the destination region) or
8820  *		in individual map entries.
8821  *
8822  *		To overwrite permanent virtual memory, it is necessary
8823  *		to copy each page, as the external memory management
8824  *		interface currently does not provide any optimizations.
8825  *
8826  *		Unaligned memory also has to be copied.  It is possible
8827  *		to use 'vm_trickery' to copy the aligned data.  This is
8828  *		not done but not hard to implement.
8829  *
8830  *		Once a page of permanent memory has been overwritten,
8831  *		it is impossible to interrupt this function; otherwise,
8832  *		the call would be neither atomic nor location-independent.
8833  *		The kernel-state portion of a user thread must be
8834  *		interruptible.
8835  *
8836  *		It may be expensive to forward all requests that might
8837  *		overwrite permanent memory (vm_write, vm_copy) to
8838  *		uninterruptible kernel threads.  This routine may be
8839  *		called by interruptible threads; however, success is
8840  *		not guaranteed -- if the request cannot be performed
8841  *		atomically and interruptibly, an error indication is
8842  *		returned.
8843  *
8844  *		Callers of this function must call vm_map_copy_require on
8845  *		previously created vm_map_copy_t or pass a newly created
8846  *		one to ensure that it hasn't been forged.
8847  */
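
/*
 * Illustrative sketch (not part of the build): overwriting an existing,
 * writable destination range with data copied in from a source range,
 * using the exported vm_map_copy_overwrite() defined further below.  On
 * success the copy object is consumed; on failure the caller is still
 * responsible for it, as noted above.
 *
 *	vm_map_copy_t   copy;
 *	kern_return_t   kr;
 *
 *	kr = vm_map_copyin(src_map, src_addr, size, FALSE, &copy);
 *	if (kr != KERN_SUCCESS) {
 *		return kr;
 *	}
 *	kr = vm_map_copy_overwrite(dst_map, dst_addr, copy, size, FALSE);
 *	if (kr != KERN_SUCCESS) {
 *		vm_map_copy_discard(copy);	// not consumed on failure
 *	}
 *	return kr;
 */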
8848 
8849 static kern_return_t
8850 vm_map_copy_overwrite_nested(
8851 	vm_map_t                dst_map,
8852 	vm_map_address_t        dst_addr,
8853 	vm_map_copy_t           copy,
8854 	boolean_t               interruptible,
8855 	pmap_t                  pmap,
8856 	boolean_t               discard_on_success)
8857 {
8858 	vm_map_offset_t         dst_end;
8859 	vm_map_entry_t          tmp_entry;
8860 	vm_map_entry_t          entry;
8861 	kern_return_t           kr;
8862 	boolean_t               aligned = TRUE;
8863 	boolean_t               contains_permanent_objects = FALSE;
8864 	boolean_t               encountered_sub_map = FALSE;
8865 	vm_map_offset_t         base_addr;
8866 	vm_map_size_t           copy_size;
8867 	vm_map_size_t           total_size;
8868 	uint16_t                copy_page_shift;
8869 
8870 	/*
8871 	 *	Check for special kernel buffer allocated
8872 	 *	by new_ipc_kmsg_copyin.
8873 	 */
8874 
8875 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
8876 		return vm_map_copyout_kernel_buffer(
8877 			dst_map, &dst_addr,
8878 			copy, copy->size, TRUE, discard_on_success);
8879 	}
8880 
8881 	/*
8882 	 *      Only works for entry lists at the moment.  Will
8883 	 *	support page lists later.
8884 	 */
8885 
8886 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
8887 
8888 	if (copy->size == 0) {
8889 		if (discard_on_success) {
8890 			vm_map_copy_discard(copy);
8891 		}
8892 		return KERN_SUCCESS;
8893 	}
8894 
8895 	copy_page_shift = copy->cpy_hdr.page_shift;
8896 
8897 	/*
8898 	 *	Verify that the destination is all writeable
8899 	 *	initially.  We have to trunc the destination
8900 	 *	address and round the copy size or we'll end up
8901 	 *	splitting entries in strange ways.
8902 	 */
8903 
8904 	if (!VM_MAP_PAGE_ALIGNED(copy->size,
8905 	    VM_MAP_PAGE_MASK(dst_map)) ||
8906 	    !VM_MAP_PAGE_ALIGNED(copy->offset,
8907 	    VM_MAP_PAGE_MASK(dst_map)) ||
8908 	    !VM_MAP_PAGE_ALIGNED(dst_addr,
8909 	    VM_MAP_PAGE_MASK(dst_map)) ||
8910 	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
8911 		aligned = FALSE;
8912 		dst_end = vm_map_round_page(dst_addr + copy->size,
8913 		    VM_MAP_PAGE_MASK(dst_map));
8914 	} else {
8915 		dst_end = dst_addr + copy->size;
8916 	}
8917 
8918 	vm_map_lock(dst_map);
8919 
8920 	/* LP64todo - remove this check when vm_map_commpage64()
8921 	 * no longer has to stuff in a map_entry for the commpage
8922 	 * above the map's max_offset.
8923 	 */
8924 	if (dst_addr >= dst_map->max_offset) {
8925 		vm_map_unlock(dst_map);
8926 		return KERN_INVALID_ADDRESS;
8927 	}
8928 
8929 start_pass_1:
8930 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
8931 		vm_map_unlock(dst_map);
8932 		return KERN_INVALID_ADDRESS;
8933 	}
8934 	vm_map_clip_start(dst_map,
8935 	    tmp_entry,
8936 	    vm_map_trunc_page(dst_addr,
8937 	    VM_MAP_PAGE_MASK(dst_map)));
8938 	for (entry = tmp_entry;;) {
8939 		vm_map_entry_t  next = entry->vme_next;
8940 
8941 		while (entry->is_sub_map) {
8942 			vm_map_offset_t sub_start;
8943 			vm_map_offset_t sub_end;
8944 			vm_map_offset_t local_end;
8945 
8946 			if (entry->in_transition) {
8947 				/*
8948 				 * Say that we are waiting, and wait for entry.
8949 				 */
8950 				entry->needs_wakeup = TRUE;
8951 				vm_map_entry_wait(dst_map, THREAD_UNINT);
8952 
8953 				goto start_pass_1;
8954 			}
8955 
8956 			local_end = entry->vme_end;
8957 			if (!(entry->needs_copy)) {
8958 				/* if needs_copy we are a COW submap */
8959 				/* in such a case we just replace so */
8960 				/* there is no need for the follow-  */
8961 				/* ing check.                        */
8962 				encountered_sub_map = TRUE;
8963 				sub_start = VME_OFFSET(entry);
8964 
8965 				if (entry->vme_end < dst_end) {
8966 					sub_end = entry->vme_end;
8967 				} else {
8968 					sub_end = dst_end;
8969 				}
8970 				sub_end -= entry->vme_start;
8971 				sub_end += VME_OFFSET(entry);
8972 				vm_map_unlock(dst_map);
8973 
8974 				kr = vm_map_overwrite_submap_recurse(
8975 					VME_SUBMAP(entry),
8976 					sub_start,
8977 					sub_end - sub_start);
8978 				if (kr != KERN_SUCCESS) {
8979 					return kr;
8980 				}
8981 				vm_map_lock(dst_map);
8982 			}
8983 
8984 			if (dst_end <= entry->vme_end) {
8985 				goto start_overwrite;
8986 			}
8987 			if (!vm_map_lookup_entry(dst_map, local_end,
8988 			    &entry)) {
8989 				vm_map_unlock(dst_map);
8990 				return KERN_INVALID_ADDRESS;
8991 			}
8992 			next = entry->vme_next;
8993 		}
8994 
8995 		if (!(entry->protection & VM_PROT_WRITE)) {
8996 			vm_map_unlock(dst_map);
8997 			return KERN_PROTECTION_FAILURE;
8998 		}
8999 
9000 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9001 			vm_map_unlock(dst_map);
9002 			return KERN_PROTECTION_FAILURE;
9003 		}
9004 
9005 		/*
9006 		 *	If the entry is in transition, we must wait
9007 		 *	for it to exit that state.  Anything could happen
9008 		 *	when we unlock the map, so start over.
9009 		 */
9010 		if (entry->in_transition) {
9011 			/*
9012 			 * Say that we are waiting, and wait for entry.
9013 			 */
9014 			entry->needs_wakeup = TRUE;
9015 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9016 
9017 			goto start_pass_1;
9018 		}
9019 
9020 /*
9021  *		our range is contained completely within this map entry
9022  */
9023 		if (dst_end <= entry->vme_end) {
9024 			break;
9025 		}
9026 /*
9027  *		check that range specified is contiguous region
9028  */
9029 		if ((next == vm_map_to_entry(dst_map)) ||
9030 		    (next->vme_start != entry->vme_end)) {
9031 			vm_map_unlock(dst_map);
9032 			return KERN_INVALID_ADDRESS;
9033 		}
9034 
9035 
9036 		/*
9037 		 *	Check for permanent objects in the destination.
9038 		 */
9039 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9040 		    ((!VME_OBJECT(entry)->internal) ||
9041 		    (VME_OBJECT(entry)->true_share))) {
9042 			contains_permanent_objects = TRUE;
9043 		}
9044 
9045 		entry = next;
9046 	}/* for */
9047 
9048 start_overwrite:
9049 	/*
9050 	 *	If there are permanent objects in the destination, then
9051 	 *	the copy cannot be interrupted.
9052 	 */
9053 
9054 	if (interruptible && contains_permanent_objects) {
9055 		vm_map_unlock(dst_map);
9056 		return KERN_FAILURE;   /* XXX */
9057 	}
9058 
9059 	/*
9060 	 *
9061 	 *	Make a second pass, overwriting the data
9062 	 *	At the beginning of each loop iteration,
9063 	 *	the next entry to be overwritten is "tmp_entry"
9064 	 *	(initially, the value returned from the lookup above),
9065 	 *	and the starting address expected in that entry
9066 	 *	is "start".
9067 	 */
9068 
9069 	total_size = copy->size;
9070 	if (encountered_sub_map) {
9071 		copy_size = 0;
9072 		/* re-calculate tmp_entry since we've had the map */
9073 		/* unlocked */
9074 		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9075 			vm_map_unlock(dst_map);
9076 			return KERN_INVALID_ADDRESS;
9077 		}
9078 	} else {
9079 		copy_size = copy->size;
9080 	}
9081 
9082 	base_addr = dst_addr;
9083 	while (TRUE) {
9084 		/* deconstruct the copy object and do it in parts, */
9085 		/* but only in the sub_map, interruptible case */
9086 		vm_map_entry_t  copy_entry;
9087 		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
9088 		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
9089 		int             nentries;
9090 		int             remaining_entries = 0;
9091 		vm_map_offset_t new_offset = 0;
9092 
9093 		for (entry = tmp_entry; copy_size == 0;) {
9094 			vm_map_entry_t  next;
9095 
9096 			next = entry->vme_next;
9097 
9098 			/* tmp_entry and base address are moved along */
9099 			/* each time we encounter a sub-map.  Otherwise */
9100 			/* entry can outpace tmp_entry, and the copy_size */
9101 			/* may reflect the distance between them. */
9102 			/* If the current entry is found to be in transition, */
9103 			/* we will start over at the beginning or at the last */
9104 			/* encounter of a submap, as dictated by base_addr, */
9105 			/* and we will zero copy_size accordingly. */
9106 			if (entry->in_transition) {
9107 				/*
9108 				 * Say that we are waiting, and wait for entry.
9109 				 */
9110 				entry->needs_wakeup = TRUE;
9111 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9112 
9113 				if (!vm_map_lookup_entry(dst_map, base_addr,
9114 				    &tmp_entry)) {
9115 					vm_map_unlock(dst_map);
9116 					return KERN_INVALID_ADDRESS;
9117 				}
9118 				copy_size = 0;
9119 				entry = tmp_entry;
9120 				continue;
9121 			}
9122 			if (entry->is_sub_map) {
9123 				vm_map_offset_t sub_start;
9124 				vm_map_offset_t sub_end;
9125 				vm_map_offset_t local_end;
9126 
9127 				if (entry->needs_copy) {
9128 					/* if this is a COW submap */
9129 					/* just back the range with an */
9130 					/* anonymous entry */
9131 					if (entry->vme_end < dst_end) {
9132 						sub_end = entry->vme_end;
9133 					} else {
9134 						sub_end = dst_end;
9135 					}
9136 					if (entry->vme_start < base_addr) {
9137 						sub_start = base_addr;
9138 					} else {
9139 						sub_start = entry->vme_start;
9140 					}
9141 					vm_map_clip_end(
9142 						dst_map, entry, sub_end);
9143 					vm_map_clip_start(
9144 						dst_map, entry, sub_start);
9145 					assert(!entry->use_pmap);
9146 					assert(!entry->iokit_acct);
9147 					entry->use_pmap = TRUE;
9148 					vm_map_deallocate(VME_SUBMAP(entry));
9149 					VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9150 					VME_OFFSET_SET(entry, 0);
9151 					entry->is_shared = FALSE;
9152 					entry->needs_copy = FALSE;
9153 					entry->protection = VM_PROT_DEFAULT;
9154 					entry->max_protection = VM_PROT_ALL;
9155 					entry->wired_count = 0;
9156 					entry->user_wired_count = 0;
9157 					if (entry->inheritance
9158 					    == VM_INHERIT_SHARE) {
9159 						entry->inheritance = VM_INHERIT_COPY;
9160 					}
9161 					continue;
9162 				}
9163 				/* first take care of any non-sub_map */
9164 				/* entries to send */
9165 				if (base_addr < entry->vme_start) {
9166 					/* stuff to send */
9167 					copy_size =
9168 					    entry->vme_start - base_addr;
9169 					break;
9170 				}
9171 				sub_start = VME_OFFSET(entry);
9172 
9173 				if (entry->vme_end < dst_end) {
9174 					sub_end = entry->vme_end;
9175 				} else {
9176 					sub_end = dst_end;
9177 				}
9178 				sub_end -= entry->vme_start;
9179 				sub_end += VME_OFFSET(entry);
9180 				local_end = entry->vme_end;
9181 				vm_map_unlock(dst_map);
9182 				copy_size = sub_end - sub_start;
9183 
9184 				/* adjust the copy object */
9185 				if (total_size > copy_size) {
9186 					vm_map_size_t   local_size = 0;
9187 					vm_map_size_t   entry_size;
9188 
9189 					nentries = 1;
9190 					new_offset = copy->offset;
9191 					copy_entry = vm_map_copy_first_entry(copy);
9192 					while (copy_entry !=
9193 					    vm_map_copy_to_entry(copy)) {
9194 						entry_size = copy_entry->vme_end -
9195 						    copy_entry->vme_start;
9196 						if ((local_size < copy_size) &&
9197 						    ((local_size + entry_size)
9198 						    >= copy_size)) {
9199 							vm_map_copy_clip_end(copy,
9200 							    copy_entry,
9201 							    copy_entry->vme_start +
9202 							    (copy_size - local_size));
9203 							entry_size = copy_entry->vme_end -
9204 							    copy_entry->vme_start;
9205 							local_size += entry_size;
9206 							new_offset += entry_size;
9207 						}
9208 						if (local_size >= copy_size) {
9209 							next_copy = copy_entry->vme_next;
9210 							copy_entry->vme_next =
9211 							    vm_map_copy_to_entry(copy);
9212 							previous_prev =
9213 							    copy->cpy_hdr.links.prev;
9214 							copy->cpy_hdr.links.prev = copy_entry;
9215 							copy->size = copy_size;
9216 							remaining_entries =
9217 							    copy->cpy_hdr.nentries;
9218 							remaining_entries -= nentries;
9219 							copy->cpy_hdr.nentries = nentries;
9220 							break;
9221 						} else {
9222 							local_size += entry_size;
9223 							new_offset += entry_size;
9224 							nentries++;
9225 						}
9226 						copy_entry = copy_entry->vme_next;
9227 					}
9228 				}
9229 
9230 				if ((entry->use_pmap) && (pmap == NULL)) {
9231 					kr = vm_map_copy_overwrite_nested(
9232 						VME_SUBMAP(entry),
9233 						sub_start,
9234 						copy,
9235 						interruptible,
9236 						VME_SUBMAP(entry)->pmap,
9237 						TRUE);
9238 				} else if (pmap != NULL) {
9239 					kr = vm_map_copy_overwrite_nested(
9240 						VME_SUBMAP(entry),
9241 						sub_start,
9242 						copy,
9243 						interruptible, pmap,
9244 						TRUE);
9245 				} else {
9246 					kr = vm_map_copy_overwrite_nested(
9247 						VME_SUBMAP(entry),
9248 						sub_start,
9249 						copy,
9250 						interruptible,
9251 						dst_map->pmap,
9252 						TRUE);
9253 				}
9254 				if (kr != KERN_SUCCESS) {
9255 					if (next_copy != NULL) {
9256 						copy->cpy_hdr.nentries +=
9257 						    remaining_entries;
9258 						copy->cpy_hdr.links.prev->vme_next =
9259 						    next_copy;
9260 						copy->cpy_hdr.links.prev
9261 						        = previous_prev;
9262 						copy->size = total_size;
9263 					}
9264 					return kr;
9265 				}
9266 				if (dst_end <= local_end) {
9267 					return KERN_SUCCESS;
9268 				}
9269 				/* otherwise copy no longer exists, it was */
9270 				/* destroyed after successful copy_overwrite */
9271 				copy = vm_map_copy_allocate();
9272 				copy->type = VM_MAP_COPY_ENTRY_LIST;
9273 				copy->offset = new_offset;
9274 				copy->cpy_hdr.page_shift = copy_page_shift;
9275 
9276 				/*
9277 				 * XXX FBDP
9278 				 * this does not seem to deal with
9279 				 * the VM map store (R&B tree)
9280 				 */
9281 
9282 				total_size -= copy_size;
9283 				copy_size = 0;
9284 				/* put back remainder of copy in container */
9285 				if (next_copy != NULL) {
9286 					copy->cpy_hdr.nentries = remaining_entries;
9287 					copy->cpy_hdr.links.next = next_copy;
9288 					copy->cpy_hdr.links.prev = previous_prev;
9289 					copy->size = total_size;
9290 					next_copy->vme_prev =
9291 					    vm_map_copy_to_entry(copy);
9292 					next_copy = NULL;
9293 				}
9294 				base_addr = local_end;
9295 				vm_map_lock(dst_map);
9296 				if (!vm_map_lookup_entry(dst_map,
9297 				    local_end, &tmp_entry)) {
9298 					vm_map_unlock(dst_map);
9299 					return KERN_INVALID_ADDRESS;
9300 				}
9301 				entry = tmp_entry;
9302 				continue;
9303 			}
9304 			if (dst_end <= entry->vme_end) {
9305 				copy_size = dst_end - base_addr;
9306 				break;
9307 			}
9308 
9309 			if ((next == vm_map_to_entry(dst_map)) ||
9310 			    (next->vme_start != entry->vme_end)) {
9311 				vm_map_unlock(dst_map);
9312 				return KERN_INVALID_ADDRESS;
9313 			}
9314 
9315 			entry = next;
9316 		}/* for */
9317 
9318 		next_copy = NULL;
9319 		nentries = 1;
9320 
9321 		/* adjust the copy object */
9322 		if (total_size > copy_size) {
9323 			vm_map_size_t   local_size = 0;
9324 			vm_map_size_t   entry_size;
9325 
9326 			new_offset = copy->offset;
9327 			copy_entry = vm_map_copy_first_entry(copy);
9328 			while (copy_entry != vm_map_copy_to_entry(copy)) {
9329 				entry_size = copy_entry->vme_end -
9330 				    copy_entry->vme_start;
9331 				if ((local_size < copy_size) &&
9332 				    ((local_size + entry_size)
9333 				    >= copy_size)) {
9334 					vm_map_copy_clip_end(copy, copy_entry,
9335 					    copy_entry->vme_start +
9336 					    (copy_size - local_size));
9337 					entry_size = copy_entry->vme_end -
9338 					    copy_entry->vme_start;
9339 					local_size += entry_size;
9340 					new_offset += entry_size;
9341 				}
9342 				if (local_size >= copy_size) {
9343 					next_copy = copy_entry->vme_next;
9344 					copy_entry->vme_next =
9345 					    vm_map_copy_to_entry(copy);
9346 					previous_prev =
9347 					    copy->cpy_hdr.links.prev;
9348 					copy->cpy_hdr.links.prev = copy_entry;
9349 					copy->size = copy_size;
9350 					remaining_entries =
9351 					    copy->cpy_hdr.nentries;
9352 					remaining_entries -= nentries;
9353 					copy->cpy_hdr.nentries = nentries;
9354 					break;
9355 				} else {
9356 					local_size += entry_size;
9357 					new_offset += entry_size;
9358 					nentries++;
9359 				}
9360 				copy_entry = copy_entry->vme_next;
9361 			}
9362 		}
9363 
9364 		if (aligned) {
9365 			pmap_t  local_pmap;
9366 
9367 			if (pmap) {
9368 				local_pmap = pmap;
9369 			} else {
9370 				local_pmap = dst_map->pmap;
9371 			}
9372 
9373 			if ((kr =  vm_map_copy_overwrite_aligned(
9374 				    dst_map, tmp_entry, copy,
9375 				    base_addr, local_pmap)) != KERN_SUCCESS) {
9376 				if (next_copy != NULL) {
9377 					copy->cpy_hdr.nentries +=
9378 					    remaining_entries;
9379 					copy->cpy_hdr.links.prev->vme_next =
9380 					    next_copy;
9381 					copy->cpy_hdr.links.prev =
9382 					    previous_prev;
9383 					copy->size += copy_size;
9384 				}
9385 				return kr;
9386 			}
9387 			vm_map_unlock(dst_map);
9388 		} else {
9389 			/*
9390 			 * Performance gain:
9391 			 *
9392 			 * if the copy and dst address are misaligned but have the same
9393 			 * offset within the page, we can copy the misaligned parts
9394 			 * unaligned and copy the rest aligned.  If they are aligned
9395 			 * but the length is unaligned, we simply need to copy the
9396 			 * end bit unaligned.  We'll need to split the misaligned
9397 			 * bits of the region in this case!
9398 			 */
9399 			/* ALWAYS UNLOCKS THE dst_map MAP */
9400 			kr = vm_map_copy_overwrite_unaligned(
9401 				dst_map,
9402 				tmp_entry,
9403 				copy,
9404 				base_addr,
9405 				discard_on_success);
9406 			if (kr != KERN_SUCCESS) {
9407 				if (next_copy != NULL) {
9408 					copy->cpy_hdr.nentries +=
9409 					    remaining_entries;
9410 					copy->cpy_hdr.links.prev->vme_next =
9411 					    next_copy;
9412 					copy->cpy_hdr.links.prev =
9413 					    previous_prev;
9414 					copy->size += copy_size;
9415 				}
9416 				return kr;
9417 			}
9418 		}
9419 		total_size -= copy_size;
9420 		if (total_size == 0) {
9421 			break;
9422 		}
9423 		base_addr += copy_size;
9424 		copy_size = 0;
9425 		copy->offset = new_offset;
9426 		if (next_copy != NULL) {
9427 			copy->cpy_hdr.nentries = remaining_entries;
9428 			copy->cpy_hdr.links.next = next_copy;
9429 			copy->cpy_hdr.links.prev = previous_prev;
9430 			next_copy->vme_prev = vm_map_copy_to_entry(copy);
9431 			copy->size = total_size;
9432 		}
9433 		vm_map_lock(dst_map);
9434 		while (TRUE) {
9435 			if (!vm_map_lookup_entry(dst_map,
9436 			    base_addr, &tmp_entry)) {
9437 				vm_map_unlock(dst_map);
9438 				return KERN_INVALID_ADDRESS;
9439 			}
9440 			if (tmp_entry->in_transition) {
9441 				entry->needs_wakeup = TRUE;
9442 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9443 			} else {
9444 				break;
9445 			}
9446 		}
9447 		vm_map_clip_start(dst_map,
9448 		    tmp_entry,
9449 		    vm_map_trunc_page(base_addr,
9450 		    VM_MAP_PAGE_MASK(dst_map)));
9451 
9452 		entry = tmp_entry;
9453 	} /* while */
9454 
9455 	/*
9456 	 *	Throw away the vm_map_copy object
9457 	 */
9458 	if (discard_on_success) {
9459 		vm_map_copy_discard(copy);
9460 	}
9461 
9462 	return KERN_SUCCESS;
9463 }/* vm_map_copy_overwrite */
9464 
9465 kern_return_t
9466 vm_map_copy_overwrite(
9467 	vm_map_t        dst_map,
9468 	vm_map_offset_t dst_addr,
9469 	vm_map_copy_t   copy,
9470 	vm_map_size_t   copy_size,
9471 	boolean_t       interruptible)
9472 {
9473 	vm_map_size_t   head_size, tail_size;
9474 	vm_map_copy_t   head_copy, tail_copy;
9475 	vm_map_offset_t head_addr, tail_addr;
9476 	vm_map_entry_t  entry;
9477 	kern_return_t   kr;
9478 	vm_map_offset_t effective_page_mask, effective_page_size;
9479 	uint16_t        copy_page_shift;
9480 
9481 	head_size = 0;
9482 	tail_size = 0;
9483 	head_copy = NULL;
9484 	tail_copy = NULL;
9485 	head_addr = 0;
9486 	tail_addr = 0;
9487 
9488 	/*
9489 	 *	Check for null copy object.
9490 	 */
9491 	if (copy == VM_MAP_COPY_NULL) {
9492 		return KERN_SUCCESS;
9493 	}
9494 
9495 	/*
9496 	 * Assert that the vm_map_copy is coming from the right
9497 	 * zone and hasn't been forged
9498 	 */
9499 	vm_map_copy_require(copy);
9500 
9501 	if (interruptible ||
9502 	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
9503 		/*
9504 		 * We can't split the "copy" map if we're interruptible
9505 		 * or if we don't have a "copy" map...
9506 		 */
9507 blunt_copy:
9508 		return vm_map_copy_overwrite_nested(dst_map,
9509 		           dst_addr,
9510 		           copy,
9511 		           interruptible,
9512 		           (pmap_t) NULL,
9513 		           TRUE);
9514 	}
9515 
9516 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
9517 	if (copy_page_shift < PAGE_SHIFT ||
9518 	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
9519 		goto blunt_copy;
9520 	}
9521 
9522 	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
9523 		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
9524 	} else {
9525 		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
9526 		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
9527 		    effective_page_mask);
9528 	}
9529 	effective_page_size = effective_page_mask + 1;
9530 
9531 	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
9532 		/*
9533 		 * Too small to bother with optimizing...
9534 		 */
9535 		goto blunt_copy;
9536 	}
9537 
9538 	if ((dst_addr & effective_page_mask) !=
9539 	    (copy->offset & effective_page_mask)) {
9540 		/*
9541 		 * Incompatible mis-alignment of source and destination...
9542 		 */
9543 		goto blunt_copy;
9544 	}
9545 
9546 	/*
9547 	 * Proper alignment or identical mis-alignment at the beginning.
9548 	 * Let's try and do a small unaligned copy first (if needed)
9549 	 * and then an aligned copy for the rest.
9550 	 */
9551 	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
9552 		head_addr = dst_addr;
9553 		head_size = (effective_page_size -
9554 		    (copy->offset & effective_page_mask));
9555 		head_size = MIN(head_size, copy_size);
9556 	}
9557 	if (!vm_map_page_aligned(copy->offset + copy_size,
9558 	    effective_page_mask)) {
9559 		/*
9560 		 * Mis-alignment at the end.
9561 		 * Do an aligned copy up to the last page and
9562 		 * then an unaligned copy for the remaining bytes.
9563 		 */
9564 		tail_size = ((copy->offset + copy_size) &
9565 		    effective_page_mask);
9566 		tail_size = MIN(tail_size, copy_size);
9567 		tail_addr = dst_addr + copy_size - tail_size;
9568 		assert(tail_addr >= head_addr + head_size);
9569 	}
9570 	assert(head_size + tail_size <= copy_size);
9571 
9572 	if (head_size + tail_size == copy_size) {
9573 		/*
9574 		 * It's all unaligned, no optimization possible...
9575 		 */
9576 		goto blunt_copy;
9577 	}
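
	/*
	 * Worked example (illustrative, assuming 16K effective pages and a
	 * copy_size above the optimization threshold):
	 *
	 *	effective_page_size = 0x4000, effective_page_mask = 0x3fff
	 *	dst_addr     = 0x10003000	(same misalignment as copy->offset)
	 *	copy->offset = 0x3000
	 *	copy_size    = 0x20000
	 *
	 *	head_addr = 0x10003000, head_size = 0x4000 - 0x3000 = 0x1000
	 *	tail_size = (0x3000 + 0x20000) & 0x3fff = 0x3000
	 *	tail_addr = 0x10003000 + 0x20000 - 0x3000 = 0x10020000
	 *
	 * leaving 0x1c000 bytes (7 aligned pages) in the middle to be handled
	 * by the aligned path of vm_map_copy_overwrite_nested().
	 */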
9578 
9579 	/*
9580 	 * Can't optimize if there are any submaps in the
9581 	 * destination due to the way we free the "copy" map
9582 	 * progressively in vm_map_copy_overwrite_nested()
9583 	 * in that case.
9584 	 */
9585 	vm_map_lock_read(dst_map);
9586 	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
9587 		vm_map_unlock_read(dst_map);
9588 		goto blunt_copy;
9589 	}
9590 	for (;
9591 	    (entry != vm_map_copy_to_entry(copy) &&
9592 	    entry->vme_start < dst_addr + copy_size);
9593 	    entry = entry->vme_next) {
9594 		if (entry->is_sub_map) {
9595 			vm_map_unlock_read(dst_map);
9596 			goto blunt_copy;
9597 		}
9598 	}
9599 	vm_map_unlock_read(dst_map);
9600 
9601 	if (head_size) {
9602 		/*
9603 		 * Unaligned copy of the first "head_size" bytes, to reach
9604 		 * a page boundary.
9605 		 */
9606 
9607 		/*
9608 		 * Extract "head_copy" out of "copy".
9609 		 */
9610 		head_copy = vm_map_copy_allocate();
9611 		head_copy->type = VM_MAP_COPY_ENTRY_LIST;
9612 		head_copy->cpy_hdr.entries_pageable =
9613 		    copy->cpy_hdr.entries_pageable;
9614 		vm_map_store_init(&head_copy->cpy_hdr);
9615 		head_copy->cpy_hdr.page_shift = copy_page_shift;
9616 
9617 		entry = vm_map_copy_first_entry(copy);
9618 		if (entry->vme_end < copy->offset + head_size) {
9619 			head_size = entry->vme_end - copy->offset;
9620 		}
9621 
9622 		head_copy->offset = copy->offset;
9623 		head_copy->size = head_size;
9624 		copy->offset += head_size;
9625 		copy->size -= head_size;
9626 		copy_size -= head_size;
9627 		assert(copy_size > 0);
9628 
9629 		vm_map_copy_clip_end(copy, entry, copy->offset);
9630 		vm_map_copy_entry_unlink(copy, entry);
9631 		vm_map_copy_entry_link(head_copy,
9632 		    vm_map_copy_to_entry(head_copy),
9633 		    entry);
9634 
9635 		/*
9636 		 * Do the unaligned copy.
9637 		 */
9638 		kr = vm_map_copy_overwrite_nested(dst_map,
9639 		    head_addr,
9640 		    head_copy,
9641 		    interruptible,
9642 		    (pmap_t) NULL,
9643 		    FALSE);
9644 		if (kr != KERN_SUCCESS) {
9645 			goto done;
9646 		}
9647 	}
9648 
9649 	if (tail_size) {
9650 		/*
9651 		 * Extract "tail_copy" out of "copy".
9652 		 */
9653 		tail_copy = vm_map_copy_allocate();
9654 		tail_copy->type = VM_MAP_COPY_ENTRY_LIST;
9655 		tail_copy->cpy_hdr.entries_pageable =
9656 		    copy->cpy_hdr.entries_pageable;
9657 		vm_map_store_init(&tail_copy->cpy_hdr);
9658 		tail_copy->cpy_hdr.page_shift = copy_page_shift;
9659 
9660 		tail_copy->offset = copy->offset + copy_size - tail_size;
9661 		tail_copy->size = tail_size;
9662 
9663 		copy->size -= tail_size;
9664 		copy_size -= tail_size;
9665 		assert(copy_size > 0);
9666 
9667 		entry = vm_map_copy_last_entry(copy);
9668 		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
9669 		entry = vm_map_copy_last_entry(copy);
9670 		vm_map_copy_entry_unlink(copy, entry);
9671 		vm_map_copy_entry_link(tail_copy,
9672 		    vm_map_copy_last_entry(tail_copy),
9673 		    entry);
9674 	}
9675 
9676 	/*
9677 	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
9678 	 * we want to avoid TOCTOU issues w.r.t copy->size but
9679 	 * we don't need to change vm_map_copy_overwrite_nested()
9680 	 * and all other vm_map_copy_overwrite variants.
9681 	 *
9682 	 * So we assign the original copy_size that was passed into
9683 	 * this routine back to copy.
9684 	 *
9685 	 * This use of local 'copy_size' passed into this routine is
9686 	 * to try and protect against TOCTOU attacks where the kernel
9687 	 * has been exploited. We don't expect this to be an issue
9688 	 * during normal system operation.
9689 	 */
9690 	assertf(copy->size == copy_size,
9691 	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
9692 	copy->size = copy_size;
9693 
9694 	/*
9695 	 * Copy most (or possibly all) of the data.
9696 	 */
9697 	kr = vm_map_copy_overwrite_nested(dst_map,
9698 	    dst_addr + head_size,
9699 	    copy,
9700 	    interruptible,
9701 	    (pmap_t) NULL,
9702 	    FALSE);
9703 	if (kr != KERN_SUCCESS) {
9704 		goto done;
9705 	}
9706 
9707 	if (tail_size) {
9708 		kr = vm_map_copy_overwrite_nested(dst_map,
9709 		    tail_addr,
9710 		    tail_copy,
9711 		    interruptible,
9712 		    (pmap_t) NULL,
9713 		    FALSE);
9714 	}
9715 
9716 done:
9717 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9718 	if (kr == KERN_SUCCESS) {
9719 		/*
9720 		 * Discard all the copy maps.
9721 		 */
9722 		if (head_copy) {
9723 			vm_map_copy_discard(head_copy);
9724 			head_copy = NULL;
9725 		}
9726 		vm_map_copy_discard(copy);
9727 		if (tail_copy) {
9728 			vm_map_copy_discard(tail_copy);
9729 			tail_copy = NULL;
9730 		}
9731 	} else {
9732 		/*
9733 		 * Re-assemble the original copy map.
9734 		 */
9735 		if (head_copy) {
9736 			entry = vm_map_copy_first_entry(head_copy);
9737 			vm_map_copy_entry_unlink(head_copy, entry);
9738 			vm_map_copy_entry_link(copy,
9739 			    vm_map_copy_to_entry(copy),
9740 			    entry);
9741 			copy->offset -= head_size;
9742 			copy->size += head_size;
9743 			vm_map_copy_discard(head_copy);
9744 			head_copy = NULL;
9745 		}
9746 		if (tail_copy) {
9747 			entry = vm_map_copy_last_entry(tail_copy);
9748 			vm_map_copy_entry_unlink(tail_copy, entry);
9749 			vm_map_copy_entry_link(copy,
9750 			    vm_map_copy_last_entry(copy),
9751 			    entry);
9752 			copy->size += tail_size;
9753 			vm_map_copy_discard(tail_copy);
9754 			tail_copy = NULL;
9755 		}
9756 	}
9757 	return kr;
9758 }
9759 
9760 
9761 /*
9762  *	Routine: vm_map_copy_overwrite_unaligned	[internal use only]
9763  *
9764  *	Description:
9765  *	Physically copy unaligned data
9766  *
9767  *	Implementation:
9768  *	Unaligned parts of pages have to be physically copied.  We use
9769  *	a modified form of vm_fault_copy (which understands none-aligned
9770  *	a modified form of vm_fault_copy (which understands non-aligned
9771  *	page offsets and sizes) to do the copy.  We attempt to copy as
9772  *	much memory in one go as possible; however, vm_fault_copy copies
9773  *	within one memory object, so we have to find the smallest of "amount
9774  *	left", "source object data size" and "target object data size".  With
9775  *	unaligned data we don't need to split regions; therefore the source
9776  *	(copy) object should be one map entry, while the target range may be
9777  *	split over multiple map entries.  In any event we are pessimistic
9778  *
9779  *	Callers of this function must call vm_map_copy_require on
9780  *	previously created vm_map_copy_t or pass a newly created
9781  *	one to ensure that it hasn't been forged.
9782  *
9783  *	Assumptions:
9784  *	dst_map is locked on entry and is return locked on success,
9785  *	unlocked on error.
9786  */
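
/*
 * Illustrative example of the per-iteration sizing below: if the current
 * destination entry has 0x1800 bytes left from "start" to its vme_end,
 * the current source copy entry has 0x3000 bytes left past src_offset,
 * and amount_left is 0x1000, then only min(0x1800, 0x3000, 0x1000) =
 * 0x1000 bytes are handed to vm_fault_copy() on this pass.
 */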
9787 
9788 static kern_return_t
9789 vm_map_copy_overwrite_unaligned(
9790 	vm_map_t        dst_map,
9791 	vm_map_entry_t  entry,
9792 	vm_map_copy_t   copy,
9793 	vm_map_offset_t start,
9794 	boolean_t       discard_on_success)
9795 {
9796 	vm_map_entry_t          copy_entry;
9797 	vm_map_entry_t          copy_entry_next;
9798 	vm_map_version_t        version;
9799 	vm_object_t             dst_object;
9800 	vm_object_offset_t      dst_offset;
9801 	vm_object_offset_t      src_offset;
9802 	vm_object_offset_t      entry_offset;
9803 	vm_map_offset_t         entry_end;
9804 	vm_map_size_t           src_size,
9805 	    dst_size,
9806 	    copy_size,
9807 	    amount_left;
9808 	kern_return_t           kr = KERN_SUCCESS;
9809 
9810 
9811 	copy_entry = vm_map_copy_first_entry(copy);
9812 
9813 	vm_map_lock_write_to_read(dst_map);
9814 
9815 	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
9816 	amount_left = copy->size;
9817 /*
9818  *	unaligned, so we never clipped this entry; we need the offset into
9819  *	the vm_object, not just the data.
9820  */
9821 	while (amount_left > 0) {
9822 		if (entry == vm_map_to_entry(dst_map)) {
9823 			vm_map_unlock_read(dst_map);
9824 			return KERN_INVALID_ADDRESS;
9825 		}
9826 
9827 		/* "start" must be within the current map entry */
9828 		assert((start >= entry->vme_start) && (start < entry->vme_end));
9829 
9830 		dst_offset = start - entry->vme_start;
9831 
9832 		dst_size = entry->vme_end - start;
9833 
9834 		src_size = copy_entry->vme_end -
9835 		    (copy_entry->vme_start + src_offset);
9836 
9837 		if (dst_size < src_size) {
9838 /*
9839  *			we can only copy dst_size bytes before
9840  *			we have to get the next destination entry
9841  */
9842 			copy_size = dst_size;
9843 		} else {
9844 /*
9845  *			we can only copy src_size bytes before
9846  *			we have to get the next source copy entry
9847  */
9848 			copy_size = src_size;
9849 		}
9850 
9851 		if (copy_size > amount_left) {
9852 			copy_size = amount_left;
9853 		}
9854 /*
9855  *		Entry needs copy: create a shadow object for the
9856  *		copy-on-write region.
9857  */
9858 		if (entry->needs_copy &&
9859 		    ((entry->protection & VM_PROT_WRITE) != 0)) {
9860 			if (vm_map_lock_read_to_write(dst_map)) {
9861 				vm_map_lock_read(dst_map);
9862 				goto RetryLookup;
9863 			}
9864 			VME_OBJECT_SHADOW(entry,
9865 			    (vm_map_size_t)(entry->vme_end
9866 			    - entry->vme_start));
9867 			entry->needs_copy = FALSE;
9868 			vm_map_lock_write_to_read(dst_map);
9869 		}
9870 		dst_object = VME_OBJECT(entry);
9871 /*
9872  *		unlike with the virtual (aligned) copy, we're going
9873  *		to fault on it; therefore we need a target object.
9874  */
9875 		if (dst_object == VM_OBJECT_NULL) {
9876 			if (vm_map_lock_read_to_write(dst_map)) {
9877 				vm_map_lock_read(dst_map);
9878 				goto RetryLookup;
9879 			}
9880 			dst_object = vm_object_allocate((vm_map_size_t)
9881 			    entry->vme_end - entry->vme_start);
9882 			VME_OBJECT_SET(entry, dst_object, false, 0);
9883 			VME_OFFSET_SET(entry, 0);
9884 			assert(entry->use_pmap);
9885 			vm_map_lock_write_to_read(dst_map);
9886 		}
9887 /*
9888  *		Take an object reference and unlock map. The "entry" may
9889  *		disappear or change when the map is unlocked.
9890  */
9891 		vm_object_reference(dst_object);
9892 		version.main_timestamp = dst_map->timestamp;
9893 		entry_offset = VME_OFFSET(entry);
9894 		entry_end = entry->vme_end;
9895 		vm_map_unlock_read(dst_map);
9896 /*
9897  *		Copy as much as possible in one pass
9898  */
9899 		kr = vm_fault_copy(
9900 			VME_OBJECT(copy_entry),
9901 			VME_OFFSET(copy_entry) + src_offset,
9902 			&copy_size,
9903 			dst_object,
9904 			entry_offset + dst_offset,
9905 			dst_map,
9906 			&version,
9907 			THREAD_UNINT );
9908 
9909 		start += copy_size;
9910 		src_offset += copy_size;
9911 		amount_left -= copy_size;
9912 /*
9913  *		Release the object reference
9914  */
9915 		vm_object_deallocate(dst_object);
9916 /*
9917  *		If a hard error occurred, return it now
9918  */
9919 		if (kr != KERN_SUCCESS) {
9920 			return kr;
9921 		}
9922 
9923 		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
9924 		    || amount_left == 0) {
9925 /*
9926  *			all done with this copy entry, dispose.
9927  */
9928 			copy_entry_next = copy_entry->vme_next;
9929 
9930 			if (discard_on_success) {
9931 				vm_map_copy_entry_unlink(copy, copy_entry);
9932 				assert(!copy_entry->is_sub_map);
9933 				vm_object_deallocate(VME_OBJECT(copy_entry));
9934 				vm_map_copy_entry_dispose(copy_entry);
9935 			}
9936 
9937 			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
9938 			    amount_left) {
9939 /*
9940  *				not finished copying but ran out of source
9941  */
9942 				return KERN_INVALID_ADDRESS;
9943 			}
9944 
9945 			copy_entry = copy_entry_next;
9946 
9947 			src_offset = 0;
9948 		}
9949 
9950 		if (amount_left == 0) {
9951 			return KERN_SUCCESS;
9952 		}
9953 
9954 		vm_map_lock_read(dst_map);
9955 		if (version.main_timestamp == dst_map->timestamp) {
9956 			if (start == entry_end) {
9957 /*
9958  *				destination region is split.  Use the version
9959  *				information to avoid a lookup in the normal
9960  *				case.
9961  */
9962 				entry = entry->vme_next;
9963 /*
9964  *				should be contiguous. Fail if we encounter
9965  *				a hole in the destination.
9966  */
9967 				if (start != entry->vme_start) {
9968 					vm_map_unlock_read(dst_map);
9969 					return KERN_INVALID_ADDRESS;
9970 				}
9971 			}
9972 		} else {
9973 /*
9974  *			Map version check failed.
9975  *			we must lookup the entry because somebody
9976  *			might have changed the map behind our backs.
9977  */
9978 RetryLookup:
9979 			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
9980 				vm_map_unlock_read(dst_map);
9981 				return KERN_INVALID_ADDRESS;
9982 			}
9983 		}
9984 	}/* while */
9985 
9986 	return KERN_SUCCESS;
9987 }/* vm_map_copy_overwrite_unaligned */
9988 
9989 /*
9990  *	Routine: vm_map_copy_overwrite_aligned	[internal use only]
9991  *
9992  *	Description:
9993  *	Does all the vm_trickery possible for whole pages.
9994  *
9995  *	Implementation:
9996  *
9997  *	If there are no permanent objects in the destination,
9998  *	and the source and destination map entry zones match,
9999  *	and the destination map entry is not shared,
10000  *	then the map entries can be deleted and replaced
10001  *	with those from the copy.  The following code is the
10002  *	basic idea of what to do, but there are lots of annoying
10003  *	little details about getting protection and inheritance
10004  *	right.  Should add protection, inheritance, and sharing checks
10005  *	to the above pass and make sure that no wiring is involved.
10006  *
10007  *	Callers of this function must call vm_map_copy_require on
10008  *	previously created vm_map_copy_t or pass a newly created
10009  *	one to ensure that it hasn't been forged.
10010  */
10011 
10012 int vm_map_copy_overwrite_aligned_src_not_internal = 0;
10013 int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
10014 int vm_map_copy_overwrite_aligned_src_large = 0;
10015 
10016 static kern_return_t
10017 vm_map_copy_overwrite_aligned(
10018 	vm_map_t        dst_map,
10019 	vm_map_entry_t  tmp_entry,
10020 	vm_map_copy_t   copy,
10021 	vm_map_offset_t start,
10022 	__unused pmap_t pmap)
10023 {
10024 	vm_object_t     object;
10025 	vm_map_entry_t  copy_entry;
10026 	vm_map_size_t   copy_size;
10027 	vm_map_size_t   size;
10028 	vm_map_entry_t  entry;
10029 
10030 	while ((copy_entry = vm_map_copy_first_entry(copy))
10031 	    != vm_map_copy_to_entry(copy)) {
10032 		copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10033 
10034 		entry = tmp_entry;
10035 		if (entry->is_sub_map) {
10036 			/* unnested when clipped earlier */
10037 			assert(!entry->use_pmap);
10038 		}
10039 		if (entry == vm_map_to_entry(dst_map)) {
10040 			vm_map_unlock(dst_map);
10041 			return KERN_INVALID_ADDRESS;
10042 		}
10043 		size = (entry->vme_end - entry->vme_start);
10044 		/*
10045 		 *	Make sure that no holes popped up in the
10046 		 *	address map, and that the protection is
10047 		 *	still valid, in case the map was unlocked
10048 		 *	earlier.
10049 		 */
10050 
10051 		if ((entry->vme_start != start) || ((entry->is_sub_map)
10052 		    && !entry->needs_copy)) {
10053 			vm_map_unlock(dst_map);
10054 			return KERN_INVALID_ADDRESS;
10055 		}
10056 		assert(entry != vm_map_to_entry(dst_map));
10057 
10058 		/*
10059 		 *	Check protection again
10060 		 */
10061 
10062 		if (!(entry->protection & VM_PROT_WRITE)) {
10063 			vm_map_unlock(dst_map);
10064 			return KERN_PROTECTION_FAILURE;
10065 		}
10066 
10067 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10068 			vm_map_unlock(dst_map);
10069 			return KERN_PROTECTION_FAILURE;
10070 		}
10071 
10072 		/*
10073 		 *	Adjust to source size first
10074 		 */
10075 
10076 		if (copy_size < size) {
10077 			if (entry->map_aligned &&
10078 			    !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10079 			    VM_MAP_PAGE_MASK(dst_map))) {
10080 				/* no longer map-aligned */
10081 				entry->map_aligned = FALSE;
10082 			}
10083 			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10084 			size = copy_size;
10085 		}
10086 
10087 		/*
10088 		 *	Adjust to destination size
10089 		 */
10090 
10091 		if (size < copy_size) {
10092 			vm_map_copy_clip_end(copy, copy_entry,
10093 			    copy_entry->vme_start + size);
10094 			copy_size = size;
10095 		}
10096 
10097 		assert((entry->vme_end - entry->vme_start) == size);
10098 		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10099 		assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10100 
10101 		/*
10102 		 *	If the destination contains temporary unshared memory,
10103 		 *	we can perform the copy by throwing it away and
10104 		 *	installing the source data.
10105 		 */
10106 
10107 		object = VME_OBJECT(entry);
10108 		if ((!entry->is_shared &&
10109 		    ((object == VM_OBJECT_NULL) ||
10110 		    (object->internal && !object->true_share))) ||
10111 		    entry->needs_copy) {
10112 			vm_object_t     old_object = VME_OBJECT(entry);
10113 			vm_object_offset_t      old_offset = VME_OFFSET(entry);
10114 			vm_object_offset_t      offset;
10115 
10116 			/*
10117 			 * Ensure that the source and destination aren't
10118 			 * identical
10119 			 */
10120 			if (old_object == VME_OBJECT(copy_entry) &&
10121 			    old_offset == VME_OFFSET(copy_entry)) {
10122 				vm_map_copy_entry_unlink(copy, copy_entry);
10123 				vm_map_copy_entry_dispose(copy_entry);
10124 
10125 				if (old_object != VM_OBJECT_NULL) {
10126 					vm_object_deallocate(old_object);
10127 				}
10128 
10129 				start = tmp_entry->vme_end;
10130 				tmp_entry = tmp_entry->vme_next;
10131 				continue;
10132 			}
10133 
10134 #if XNU_TARGET_OS_OSX
10135 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10136 #define __TRADEOFF1_COPY_SIZE (128 * 1024)      /* 128 KB */
10137 			if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10138 			    VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10139 			    copy_size <= __TRADEOFF1_COPY_SIZE) {
10140 				/*
10141 				 * Virtual vs. Physical copy tradeoff #1.
10142 				 *
10143 				 * Copying only a few pages out of a large
10144 				 * object:  do a physical copy instead of
10145 				 * a virtual copy, to avoid possibly keeping
10146 				 * the entire large object alive because of
10147 				 * those few copy-on-write pages.
10148 				 */
10149 				vm_map_copy_overwrite_aligned_src_large++;
10150 				goto slow_copy;
10151 			}
10152 #endif /* XNU_TARGET_OS_OSX */
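
			/*
			 * Worked example of tradeoff #1 (illustrative): overwriting
			 * 64KB (copy_size = 0x10000 <= __TRADEOFF1_COPY_SIZE) out of
			 * a copy entry backed by a 100MB object
			 * (vo_size >= __TRADEOFF1_OBJ_SIZE) takes the slow physical
			 * copy path, so the destination does not end up holding a
			 * copy-on-write reference that keeps all 100MB alive.
			 */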
10153 
10154 			if ((dst_map->pmap != kernel_pmap) &&
10155 			    (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10156 			    (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10157 				vm_object_t new_object, new_shadow;
10158 
10159 				/*
10160 				 * We're about to map something over a mapping
10161 				 * established by malloc()...
10162 				 */
10163 				new_object = VME_OBJECT(copy_entry);
10164 				if (new_object != VM_OBJECT_NULL) {
10165 					vm_object_lock_shared(new_object);
10166 				}
10167 				while (new_object != VM_OBJECT_NULL &&
10168 #if XNU_TARGET_OS_OSX
10169 				    !new_object->true_share &&
10170 				    new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10171 #endif /* XNU_TARGET_OS_OSX */
10172 				    new_object->internal) {
10173 					new_shadow = new_object->shadow;
10174 					if (new_shadow == VM_OBJECT_NULL) {
10175 						break;
10176 					}
10177 					vm_object_lock_shared(new_shadow);
10178 					vm_object_unlock(new_object);
10179 					new_object = new_shadow;
10180 				}
10181 				if (new_object != VM_OBJECT_NULL) {
10182 					if (!new_object->internal) {
10183 						/*
10184 						 * The new mapping is backed
10185 						 * by an external object.  We
10186 						 * don't want malloc'ed memory
10187 						 * to be replaced with such a
10188 						 * non-anonymous mapping, so
10189 						 * let's go off the optimized
10190 						 * path...
10191 						 */
10192 						vm_map_copy_overwrite_aligned_src_not_internal++;
10193 						vm_object_unlock(new_object);
10194 						goto slow_copy;
10195 					}
10196 #if XNU_TARGET_OS_OSX
10197 					if (new_object->true_share ||
10198 					    new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10199 						/*
10200 						 * Same if there's a "true_share"
10201 						 * object in the shadow chain, or
10202 						 * an object with a non-default
10203 						 * (SYMMETRIC) copy strategy.
10204 						 */
10205 						vm_map_copy_overwrite_aligned_src_not_symmetric++;
10206 						vm_object_unlock(new_object);
10207 						goto slow_copy;
10208 					}
10209 #endif /* XNU_TARGET_OS_OSX */
10210 					vm_object_unlock(new_object);
10211 				}
10212 				/*
10213 				 * The new mapping is still backed by
10214 				 * anonymous (internal) memory, so it's
10215 				 * OK to substitute it for the original
10216 				 * malloc() mapping.
10217 				 */
10218 			}
10219 
10220 			if (old_object != VM_OBJECT_NULL) {
10221 				if (entry->is_sub_map) {
10222 					if (entry->use_pmap) {
10223 #ifndef NO_NESTED_PMAP
10224 						pmap_unnest(dst_map->pmap,
10225 						    (addr64_t)entry->vme_start,
10226 						    entry->vme_end - entry->vme_start);
10227 #endif  /* NO_NESTED_PMAP */
10228 						if (dst_map->mapped_in_other_pmaps) {
10229 							/* clean up parent */
10230 							/* map/maps */
10231 							vm_map_submap_pmap_clean(
10232 								dst_map, entry->vme_start,
10233 								entry->vme_end,
10234 								VME_SUBMAP(entry),
10235 								VME_OFFSET(entry));
10236 						}
10237 					} else {
10238 						vm_map_submap_pmap_clean(
10239 							dst_map, entry->vme_start,
10240 							entry->vme_end,
10241 							VME_SUBMAP(entry),
10242 							VME_OFFSET(entry));
10243 					}
10244 					vm_map_deallocate(VME_SUBMAP(entry));
10245 				} else {
10246 					if (dst_map->mapped_in_other_pmaps) {
10247 						vm_object_pmap_protect_options(
10248 							VME_OBJECT(entry),
10249 							VME_OFFSET(entry),
10250 							entry->vme_end
10251 							- entry->vme_start,
10252 							PMAP_NULL,
10253 							PAGE_SIZE,
10254 							entry->vme_start,
10255 							VM_PROT_NONE,
10256 							PMAP_OPTIONS_REMOVE);
10257 					} else {
10258 						pmap_remove_options(
10259 							dst_map->pmap,
10260 							(addr64_t)(entry->vme_start),
10261 							(addr64_t)(entry->vme_end),
10262 							PMAP_OPTIONS_REMOVE);
10263 					}
10264 					vm_object_deallocate(old_object);
10265 				}
10266 			}
10267 
10268 			if (entry->iokit_acct) {
10269 				/* keep using iokit accounting */
10270 				entry->use_pmap = FALSE;
10271 			} else {
10272 				/* use pmap accounting */
10273 				entry->use_pmap = TRUE;
10274 			}
10275 			VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
10276 			object = VME_OBJECT(entry);
10277 			entry->needs_copy = copy_entry->needs_copy;
10278 			entry->wired_count = 0;
10279 			entry->user_wired_count = 0;
10280 			offset = VME_OFFSET(copy_entry);
10281 			VME_OFFSET_SET(entry, offset);
10282 
10283 			vm_map_copy_entry_unlink(copy, copy_entry);
10284 			vm_map_copy_entry_dispose(copy_entry);
10285 
10286 			/*
10287 			 * we could try to push pages into the pmap at this point, BUT
10288 			 * this optimization only saved on average 2 us per page if ALL
10289 			 * the pages in the source were currently mapped
10290 			 * and ALL the pages in the dest were touched; if fewer
10291 			 * than 2/3 of the pages were touched, this optimization actually cost more cycles.
10292 			 * It also puts a lot of pressure on the pmap layer w.r.t. mapping structures.
10293 			 */
10294 
10295 			/*
10296 			 *	Set up for the next iteration.  The map
10297 			 *	has not been unlocked, so the next
10298 			 *	address should be at the end of this
10299 			 *	entry, and the next map entry should be
10300 			 *	the one following it.
10301 			 */
10302 
10303 			start = tmp_entry->vme_end;
10304 			tmp_entry = tmp_entry->vme_next;
10305 		} else {
10306 			vm_map_version_t        version;
10307 			vm_object_t             dst_object;
10308 			vm_object_offset_t      dst_offset;
10309 			kern_return_t           r;
10310 
10311 slow_copy:
10312 			if (entry->needs_copy) {
10313 				VME_OBJECT_SHADOW(entry,
10314 				    (entry->vme_end -
10315 				    entry->vme_start));
10316 				entry->needs_copy = FALSE;
10317 			}
10318 
10319 			dst_object = VME_OBJECT(entry);
10320 			dst_offset = VME_OFFSET(entry);
10321 
10322 			/*
10323 			 *	Take an object reference, and record
10324 			 *	the map version information so that the
10325 			 *	map can be safely unlocked.
10326 			 */
10327 
10328 			if (dst_object == VM_OBJECT_NULL) {
10329 				/*
10330 				 * We would usually have just taken the
10331 				 * optimized path above if the destination
10332 				 * object has not been allocated yet.  But we
10333 				 * now disable that optimization if the copy
10334 				 * entry's object is not backed by anonymous
10335 				 * memory to avoid replacing malloc'ed
10336 				 * (i.e. re-usable) anonymous memory with a
10337 				 * not-so-anonymous mapping.
10338 				 * So we have to handle this case here and
10339 				 * allocate a new VM object for this map entry.
10340 				 */
10341 				dst_object = vm_object_allocate(
10342 					entry->vme_end - entry->vme_start);
10343 				dst_offset = 0;
10344 				VME_OBJECT_SET(entry, dst_object, false, 0);
10345 				VME_OFFSET_SET(entry, dst_offset);
10346 				assert(entry->use_pmap);
10347 			}
10348 
10349 			vm_object_reference(dst_object);
10350 
10351 			/* account for unlock bumping up timestamp */
10352 			version.main_timestamp = dst_map->timestamp + 1;
10353 
10354 			vm_map_unlock(dst_map);
10355 
10356 			/*
10357 			 *	Copy as much as possible in one pass
10358 			 */
10359 
10360 			copy_size = size;
10361 			r = vm_fault_copy(
10362 				VME_OBJECT(copy_entry),
10363 				VME_OFFSET(copy_entry),
10364 				&copy_size,
10365 				dst_object,
10366 				dst_offset,
10367 				dst_map,
10368 				&version,
10369 				THREAD_UNINT );
10370 
10371 			/*
10372 			 *	Release the object reference
10373 			 */
10374 
10375 			vm_object_deallocate(dst_object);
10376 
10377 			/*
10378 			 *	If a hard error occurred, return it now
10379 			 */
10380 
10381 			if (r != KERN_SUCCESS) {
10382 				return r;
10383 			}
10384 
10385 			if (copy_size != 0) {
10386 				/*
10387 				 *	Dispose of the copied region
10388 				 */
10389 
10390 				vm_map_copy_clip_end(copy, copy_entry,
10391 				    copy_entry->vme_start + copy_size);
10392 				vm_map_copy_entry_unlink(copy, copy_entry);
10393 				vm_object_deallocate(VME_OBJECT(copy_entry));
10394 				vm_map_copy_entry_dispose(copy_entry);
10395 			}
10396 
10397 			/*
10398 			 *	Pick up in the destination map where we left off.
10399 			 *
10400 			 *	Use the version information to avoid a lookup
10401 			 *	in the normal case.
10402 			 */
10403 
10404 			start += copy_size;
10405 			vm_map_lock(dst_map);
10406 			if (version.main_timestamp == dst_map->timestamp &&
10407 			    copy_size != 0) {
10408 				/* We can safely use saved tmp_entry value */
10409 
10410 				if (tmp_entry->map_aligned &&
10411 				    !VM_MAP_PAGE_ALIGNED(
10412 					    start,
10413 					    VM_MAP_PAGE_MASK(dst_map))) {
10414 					/* no longer map-aligned */
10415 					tmp_entry->map_aligned = FALSE;
10416 				}
10417 				vm_map_clip_end(dst_map, tmp_entry, start);
10418 				tmp_entry = tmp_entry->vme_next;
10419 			} else {
10420 				/* Must do lookup of tmp_entry */
10421 
10422 				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
10423 					vm_map_unlock(dst_map);
10424 					return KERN_INVALID_ADDRESS;
10425 				}
10426 				if (tmp_entry->map_aligned &&
10427 				    !VM_MAP_PAGE_ALIGNED(
10428 					    start,
10429 					    VM_MAP_PAGE_MASK(dst_map))) {
10430 					/* no longer map-aligned */
10431 					tmp_entry->map_aligned = FALSE;
10432 				}
10433 				vm_map_clip_start(dst_map, tmp_entry, start);
10434 			}
10435 		}
10436 	}/* while */
10437 
10438 	return KERN_SUCCESS;
10439 }/* vm_map_copy_overwrite_aligned */
10440 
10441 /*
10442  *	Routine: vm_map_copyin_kernel_buffer [internal use only]
10443  *
10444  *	Description:
10445  *		Copy in data to a kernel buffer from space in the
10446  *		source map. The original space may be optionally
10447  *		deallocated.
10448  *
10449  *		If successful, returns a new copy object.
10450  */
10451 static kern_return_t
10452 vm_map_copyin_kernel_buffer(
10453 	vm_map_t        src_map,
10454 	vm_map_offset_t src_addr,
10455 	vm_map_size_t   len,
10456 	boolean_t       src_destroy,
10457 	vm_map_copy_t   *copy_result)
10458 {
10459 	kern_return_t kr;
10460 	vm_map_copy_t copy;
10461 
10462 	if (len > msg_ool_size_small) {
10463 		return KERN_INVALID_ARGUMENT;
10464 	}
10465 
10466 	copy = zalloc_flags(vm_map_copy_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
10467 	copy->cpy_kdata = kalloc_data(len, Z_WAITOK);
10468 	if (copy->cpy_kdata == NULL) {
10469 		zfree(vm_map_copy_zone, copy);
10470 		return KERN_RESOURCE_SHORTAGE;
10471 	}
10472 
10473 	copy->type = VM_MAP_COPY_KERNEL_BUFFER;
10474 	copy->size = len;
10475 	copy->offset = 0;
10476 
10477 	kr = copyinmap(src_map, src_addr, copy->cpy_kdata, (vm_size_t)len);
10478 	if (kr != KERN_SUCCESS) {
10479 		kfree_data(copy->cpy_kdata, len);
10480 		zfree(vm_map_copy_zone, copy);
10481 		return kr;
10482 	}
10483 
10484 	if (src_destroy) {
10485 		vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
10486 
10487 		if (src_map == kernel_map) {
10488 			flags |= VM_MAP_REMOVE_KUNWIRE;
10489 		}
10490 
10491 		(void)vm_map_remove_guard(src_map,
10492 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
10493 		    vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
10494 		    flags, KMEM_GUARD_NONE);
10495 	}
10496 
10497 	*copy_result = copy;
10498 	return KERN_SUCCESS;
10499 }
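
/*
 * Illustrative sketch (not part of the build): how a caller might use the
 * small-copy path above.  "user_map", "user_addr" and "nbytes" are
 * hypothetical names; this mirrors what vm_map_copyin_common() does once
 * "len" fits under msg_ool_size_small.
 *
 *	vm_map_copy_t copy;
 *	kern_return_t kr;
 *
 *	kr = vm_map_copyin_kernel_buffer(user_map, user_addr, nbytes,
 *	    FALSE,		// src_destroy: keep the source mapping
 *	    &copy);
 *	if (kr == KERN_SUCCESS) {
 *		// copy->cpy_kdata now holds "nbytes" of the source data;
 *		// hand "copy" to vm_map_copyout*() or release it with
 *		// vm_map_copy_discard().
 *	}
 */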
10500 
10501 /*
10502  *	Routine: vm_map_copyout_kernel_buffer	[internal use only]
10503  *
10504  *	Description:
10505  *		Copy out data from a kernel buffer into space in the
10506  *		destination map. The space may optionally be dynamically
10507  *		allocated.
10508  *
10509  *		If successful, consumes the copy object.
10510  *		Otherwise, the caller is responsible for it.
10511  *
10512  *		Callers of this function must call vm_map_copy_require on
10513  *		previously created vm_map_copy_t or pass a newly created
10514  *		one to ensure that it hasn't been forged.
10515  */
10516 static int vm_map_copyout_kernel_buffer_failures = 0;
10517 static kern_return_t
10518 vm_map_copyout_kernel_buffer(
10519 	vm_map_t                map,
10520 	vm_map_address_t        *addr,  /* IN/OUT */
10521 	vm_map_copy_t           copy,
10522 	vm_map_size_t           copy_size,
10523 	boolean_t               overwrite,
10524 	boolean_t               consume_on_success)
10525 {
10526 	kern_return_t kr = KERN_SUCCESS;
10527 	thread_t thread = current_thread();
10528 
10529 	assert(copy->size == copy_size);
10530 
10531 	/*
10532 	 * check for corrupted vm_map_copy structure
10533 	 */
10534 	if (copy_size > msg_ool_size_small || copy->offset) {
10535 		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
10536 		    (long long)copy->size, (long long)copy->offset);
10537 	}
10538 
10539 	if (!overwrite) {
10540 		/*
10541 		 * Allocate space in the target map for the data
10542 		 */
10543 		*addr = 0;
10544 		kr = vm_map_enter(map,
10545 		    addr,
10546 		    vm_map_round_page(copy_size,
10547 		    VM_MAP_PAGE_MASK(map)),
10548 		    (vm_map_offset_t) 0,
10549 		    VM_FLAGS_ANYWHERE,
10550 		    VM_MAP_KERNEL_FLAGS_NONE,
10551 		    VM_KERN_MEMORY_NONE,
10552 		    VM_OBJECT_NULL,
10553 		    (vm_object_offset_t) 0,
10554 		    FALSE,
10555 		    VM_PROT_DEFAULT,
10556 		    VM_PROT_ALL,
10557 		    VM_INHERIT_DEFAULT);
10558 		if (kr != KERN_SUCCESS) {
10559 			return kr;
10560 		}
10561 #if KASAN
10562 		if (map->pmap == kernel_pmap) {
10563 			kasan_notify_address(*addr, copy->size);
10564 		}
10565 #endif
10566 	}
10567 
10568 	/*
10569 	 * Copyout the data from the kernel buffer to the target map.
10570 	 */
10571 	if (thread->map == map) {
10572 		/*
10573 		 * If the target map is the current map, just do
10574 		 * the copy.
10575 		 */
10576 		assert((vm_size_t)copy_size == copy_size);
10577 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
10578 			kr = KERN_INVALID_ADDRESS;
10579 		}
10580 	} else {
10581 		vm_map_t oldmap;
10582 
10583 		/*
10584 		 * If the target map is another map, assume the
10585 		 * target's address space identity for the duration
10586 		 * of the copy.
10587 		 */
10588 		vm_map_reference(map);
10589 		oldmap = vm_map_switch(map);
10590 
10591 		assert((vm_size_t)copy_size == copy_size);
10592 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
10593 			vm_map_copyout_kernel_buffer_failures++;
10594 			kr = KERN_INVALID_ADDRESS;
10595 		}
10596 
10597 		(void) vm_map_switch(oldmap);
10598 		vm_map_deallocate(map);
10599 	}
10600 
10601 	if (kr != KERN_SUCCESS) {
10602 		/* the copy failed, clean up */
10603 		if (!overwrite) {
10604 			/*
10605 			 * Deallocate the space we allocated in the target map.
10606 			 */
10607 			vm_map_remove(map,
10608 			    vm_map_trunc_page(*addr,
10609 			    VM_MAP_PAGE_MASK(map)),
10610 			    vm_map_round_page((*addr +
10611 			    vm_map_round_page(copy_size,
10612 			    VM_MAP_PAGE_MASK(map))),
10613 			    VM_MAP_PAGE_MASK(map)));
10614 			*addr = 0;
10615 		}
10616 	} else {
10617 		/* copy was successful, discard the copy structure */
10618 		if (consume_on_success) {
10619 			kfree_data(copy->cpy_kdata, copy_size);
10620 			zfree(vm_map_copy_zone, copy);
10621 		}
10622 	}
10623 
10624 	return kr;
10625 }
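
/*
 * Illustrative sketch (not part of the build): the two modes of the routine
 * above.  "dst_map", "dst_addr" and "kbuf_copy" are hypothetical names, and
 * "kbuf_copy" is assumed to be a VM_MAP_COPY_KERNEL_BUFFER copy object.
 *
 *	// allocate fresh space in dst_map, then copy the data out into it
 *	kr = vm_map_copyout_kernel_buffer(dst_map, &dst_addr, kbuf_copy,
 *	    kbuf_copy->size,
 *	    FALSE,		// overwrite
 *	    TRUE);		// consume_on_success
 *
 *	// overwrite an existing, already-validated range at *dst_addr
 *	kr = vm_map_copyout_kernel_buffer(dst_map, &dst_addr, kbuf_copy,
 *	    kbuf_copy->size,
 *	    TRUE,		// overwrite
 *	    TRUE);		// consume_on_success
 */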
10626 
10627 /*
10628  *	Routine:	vm_map_copy_insert      [internal use only]
10629  *
10630  *	Description:
10631  *		Link a copy chain ("copy") into a map at the
10632  *		specified location (after "where").
10633  *
10634  *		Callers of this function must call vm_map_copy_require on
10635  *		previously created vm_map_copy_t or pass a newly created
10636  *		one to ensure that it hasn't been forged.
10637  *	Side effects:
10638  *		The copy chain is destroyed.
10639  */
10640 static void
10641 vm_map_copy_insert(
10642 	vm_map_t        map,
10643 	vm_map_entry_t  after_where,
10644 	vm_map_copy_t   copy)
10645 {
10646 	vm_map_entry_t  entry;
10647 
10648 	while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
10649 		entry = vm_map_copy_first_entry(copy);
10650 		vm_map_copy_entry_unlink(copy, entry);
10651 		vm_map_store_entry_link(map, after_where, entry,
10652 		    VM_MAP_KERNEL_FLAGS_NONE);
10653 		after_where = entry;
10654 	}
10655 	zfree(vm_map_copy_zone, copy);
10656 }
10657 
10658 /*
10659  * Callers of this function must call vm_map_copy_require on
10660  * previously created vm_map_copy_t or pass a newly created
10661  * one to ensure that it hasn't been forged.
10662  */
10663 void
10664 vm_map_copy_remap(
10665 	vm_map_t        map,
10666 	vm_map_entry_t  where,
10667 	vm_map_copy_t   copy,
10668 	vm_map_offset_t adjustment,
10669 	vm_prot_t       cur_prot,
10670 	vm_prot_t       max_prot,
10671 	vm_inherit_t    inheritance)
10672 {
10673 	vm_map_entry_t  copy_entry, new_entry;
10674 
10675 	for (copy_entry = vm_map_copy_first_entry(copy);
10676 	    copy_entry != vm_map_copy_to_entry(copy);
10677 	    copy_entry = copy_entry->vme_next) {
10678 		/* get a new VM map entry for the map */
10679 		new_entry = vm_map_entry_create(map);
10680 		/* copy the "copy entry" to the new entry */
10681 		vm_map_entry_copy(map, new_entry, copy_entry);
10682 		/* adjust "start" and "end" */
10683 		new_entry->vme_start += adjustment;
10684 		new_entry->vme_end += adjustment;
10685 		/* clear some attributes */
10686 		new_entry->inheritance = inheritance;
10687 		new_entry->protection = cur_prot;
10688 		new_entry->max_protection = max_prot;
10689 		new_entry->behavior = VM_BEHAVIOR_DEFAULT;
10690 		/* take an extra reference on the entry's "object" */
10691 		if (new_entry->is_sub_map) {
10692 			assert(!new_entry->use_pmap); /* not nested */
10693 			vm_map_reference(VME_SUBMAP(new_entry));
10694 		} else {
10695 			vm_object_reference(VME_OBJECT(new_entry));
10696 		}
10697 		/* insert the new entry in the map */
10698 		vm_map_store_entry_link(map, where, new_entry,
10699 		    VM_MAP_KERNEL_FLAGS_NONE);
10700 		/* continue inserting the "copy entries" after the new entry */
10701 		where = new_entry;
10702 	}
10703 }
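
/*
 * Illustrative sketch (not part of the build): vm_map_copy_insert() vs.
 * vm_map_copy_remap(), as used by vm_map_copyout_internal() below.
 * "dst_map", "last", "copy" and "adjustment" are hypothetical names.
 *
 *	if (consume_on_success) {
 *		// steal the copy's entries; "copy" is destroyed
 *		vm_map_copy_insert(dst_map, last, copy);
 *	} else {
 *		// duplicate the entries; "copy" remains usable by the caller
 *		vm_map_copy_remap(dst_map, last, copy, adjustment,
 *		    cur_protection, max_protection, inheritance);
 *	}
 */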
10704 
10705 
10706 /*
10707  * Returns true if *size matches (or is in the range of) copy->size.
10708  * Upon returning true, the *size field is updated with the actual size of the
10709  * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
10710  */
10711 boolean_t
10712 vm_map_copy_validate_size(
10713 	vm_map_t                dst_map,
10714 	vm_map_copy_t           copy,
10715 	vm_map_size_t           *size)
10716 {
10717 	if (copy == VM_MAP_COPY_NULL) {
10718 		return FALSE;
10719 	}
10720 
10721 	/*
10722 	 * Assert that the vm_map_copy is coming from the right
10723 	 * zone and hasn't been forged
10724 	 */
10725 	vm_map_copy_require(copy);
10726 
10727 	vm_map_size_t copy_sz = copy->size;
10728 	vm_map_size_t sz = *size;
10729 	switch (copy->type) {
10730 	case VM_MAP_COPY_OBJECT:
10731 	case VM_MAP_COPY_KERNEL_BUFFER:
10732 		if (sz == copy_sz) {
10733 			return TRUE;
10734 		}
10735 		break;
10736 	case VM_MAP_COPY_ENTRY_LIST:
10737 		/*
10738 		 * potential page-size rounding prevents us from exactly
10739 		 * validating this flavor of vm_map_copy, but we can at least
10740 		 * assert that it's within a range.
10741 		 */
10742 		if (copy_sz >= sz &&
10743 		    copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
10744 			*size = copy_sz;
10745 			return TRUE;
10746 		}
10747 		break;
10748 	default:
10749 		break;
10750 	}
10751 	return FALSE;
10752 }
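
/*
 * Illustrative sketch (not part of the build): pairing
 * vm_map_copy_validate_size() with vm_map_copyout_size().  "dst_map",
 * "dst_addr", "copy" and "expected_size" are hypothetical caller names.
 *
 *	vm_map_size_t size = expected_size;
 *
 *	if (!vm_map_copy_validate_size(dst_map, copy, &size)) {
 *		return KERN_FAILURE;	// size does not match the copy object
 *	}
 *	// "size" may have been rounded up for entry-list copies
 *	kr = vm_map_copyout_size(dst_map, &dst_addr, copy, size);
 */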
10753 
10754 /*
10755  *	Routine:	vm_map_copyout_size
10756  *
10757  *	Description:
10758  *		Copy out a copy chain ("copy") into newly-allocated
10759  *		space in the destination map. Uses a prevalidated
10760  *		size for the copy object (vm_map_copy_validate_size).
10761  *
10762  *		If successful, consumes the copy object.
10763  *		Otherwise, the caller is responsible for it.
10764  */
10765 kern_return_t
10766 vm_map_copyout_size(
10767 	vm_map_t                dst_map,
10768 	vm_map_address_t        *dst_addr,      /* OUT */
10769 	vm_map_copy_t           copy,
10770 	vm_map_size_t           copy_size)
10771 {
10772 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
10773 	           TRUE,                     /* consume_on_success */
10774 	           VM_PROT_DEFAULT,
10775 	           VM_PROT_ALL,
10776 	           VM_INHERIT_DEFAULT);
10777 }
10778 
10779 /*
10780  *	Routine:	vm_map_copyout
10781  *
10782  *	Description:
10783  *		Copy out a copy chain ("copy") into newly-allocated
10784  *		space in the destination map.
10785  *
10786  *		If successful, consumes the copy object.
10787  *		Otherwise, the caller is responsible for it.
10788  */
10789 kern_return_t
10790 vm_map_copyout(
10791 	vm_map_t                dst_map,
10792 	vm_map_address_t        *dst_addr,      /* OUT */
10793 	vm_map_copy_t           copy)
10794 {
10795 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
10796 	           TRUE,                     /* consume_on_success */
10797 	           VM_PROT_DEFAULT,
10798 	           VM_PROT_ALL,
10799 	           VM_INHERIT_DEFAULT);
10800 }
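
/*
 * Illustrative sketch (not part of the build): ownership of "copy" across
 * vm_map_copyout().  "dst_map" and "copy" are hypothetical caller names.
 *
 *	vm_map_address_t dst_addr;
 *
 *	kr = vm_map_copyout(dst_map, &dst_addr, copy);
 *	if (kr != KERN_SUCCESS) {
 *		// the copy object was not consumed; the caller still owns it
 *		vm_map_copy_discard(copy);
 *	}
 *	// on success the copy object has been consumed and dst_addr is valid
 */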
10801 
10802 kern_return_t
10803 vm_map_copyout_internal(
10804 	vm_map_t                dst_map,
10805 	vm_map_address_t        *dst_addr,      /* OUT */
10806 	vm_map_copy_t           copy,
10807 	vm_map_size_t           copy_size,
10808 	boolean_t               consume_on_success,
10809 	vm_prot_t               cur_protection,
10810 	vm_prot_t               max_protection,
10811 	vm_inherit_t            inheritance)
10812 {
10813 	vm_map_size_t           size;
10814 	vm_map_size_t           adjustment;
10815 	vm_map_offset_t         start;
10816 	vm_object_offset_t      vm_copy_start;
10817 	vm_map_entry_t          last;
10818 	vm_map_entry_t          entry;
10819 	vm_map_copy_t           original_copy;
10820 	kern_return_t           kr;
10821 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
10822 
10823 	/*
10824 	 *	Check for null copy object.
10825 	 */
10826 
10827 	if (copy == VM_MAP_COPY_NULL) {
10828 		*dst_addr = 0;
10829 		return KERN_SUCCESS;
10830 	}
10831 
10832 	/*
10833 	 * Assert that the vm_map_copy is coming from the right
10834 	 * zone and hasn't been forged
10835 	 */
10836 	vm_map_copy_require(copy);
10837 
10838 	if (copy->size != copy_size) {
10839 		*dst_addr = 0;
10840 		return KERN_FAILURE;
10841 	}
10842 
10843 	/*
10844 	 *	Check for special copy object, created
10845 	 *	by vm_map_copyin_object.
10846 	 */
10847 
10848 	if (copy->type == VM_MAP_COPY_OBJECT) {
10849 		vm_object_t             object = copy->cpy_object;
10850 		vm_object_offset_t      offset;
10851 
10852 		offset = vm_object_trunc_page(copy->offset);
10853 		size = vm_map_round_page((copy_size +
10854 		    (vm_map_size_t)(copy->offset -
10855 		    offset)),
10856 		    VM_MAP_PAGE_MASK(dst_map));
10857 		*dst_addr = 0;
10858 		kr = vm_map_enter(dst_map, dst_addr, size,
10859 		    (vm_map_offset_t) 0, VM_FLAGS_ANYWHERE,
10860 		    VM_MAP_KERNEL_FLAGS_NONE,
10861 		    VM_KERN_MEMORY_NONE,
10862 		    object, offset, FALSE,
10863 		    VM_PROT_DEFAULT, VM_PROT_ALL,
10864 		    VM_INHERIT_DEFAULT);
10865 		if (kr != KERN_SUCCESS) {
10866 			return kr;
10867 		}
10868 		/* Account for non-page-aligned copy object */
10869 		*dst_addr += (vm_map_offset_t)(copy->offset - offset);
10870 		if (consume_on_success) {
10871 			zfree(vm_map_copy_zone, copy);
10872 		}
10873 		return KERN_SUCCESS;
10874 	}
10875 
10876 	/*
10877 	 *	Check for special kernel buffer allocated
10878 	 *	by new_ipc_kmsg_copyin.
10879 	 */
10880 
10881 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
10882 		return vm_map_copyout_kernel_buffer(dst_map, dst_addr,
10883 		           copy, copy_size, FALSE,
10884 		           consume_on_success);
10885 	}
10886 
10887 	original_copy = copy;
10888 	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
10889 		vm_map_copy_t target_copy;
10890 		vm_map_offset_t overmap_start, overmap_end, trimmed_start;
10891 
10892 		target_copy = VM_MAP_COPY_NULL;
10893 		DEBUG4K_ADJUST("adjusting...\n");
10894 		kr = vm_map_copy_adjust_to_target(
10895 			copy,
10896 			0, /* offset */
10897 			copy->size, /* size */
10898 			dst_map,
10899 			TRUE, /* copy */
10900 			&target_copy,
10901 			&overmap_start,
10902 			&overmap_end,
10903 			&trimmed_start);
10904 		if (kr != KERN_SUCCESS) {
10905 			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
10906 			return kr;
10907 		}
10908 		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
10909 		if (target_copy != copy) {
10910 			copy = target_copy;
10911 		}
10912 		copy_size = copy->size;
10913 	}
10914 
10915 	/*
10916 	 *	Find space for the data
10917 	 */
10918 
10919 	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
10920 	    VM_MAP_COPY_PAGE_MASK(copy));
10921 	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
10922 	    VM_MAP_COPY_PAGE_MASK(copy))
10923 	    - vm_copy_start;
10924 
10925 
10926 	if (dst_map == kernel_map) {
10927 		vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
10928 	}
10929 
10930 	vm_map_lock(dst_map);
10931 	kr = vm_map_locate_space(dst_map, size, 0, vmk_flags,
10932 	    &start, &last);
10933 	if (kr != KERN_SUCCESS) {
10934 		vm_map_unlock(dst_map);
10935 		return kr;
10936 	}
10937 
10938 	adjustment = start - vm_copy_start;
10939 	if (!consume_on_success) {
10940 		/*
10941 		 * We're not allowed to consume "copy", so we'll have to
10942 		 * copy its map entries into the destination map below.
10943 		 * No need to re-allocate map entries from the correct
10944 		 * (pageable or not) zone, since we'll get new map entries
10945 		 * during the transfer.
10946 		 * We'll also adjust the map entries' "start" and "end"
10947 		 * during the transfer, to keep "copy"'s entries consistent
10948 		 * with its "offset".
10949 		 */
10950 		goto after_adjustments;
10951 	}
10952 
10953 	/*
10954 	 *	Since we're going to just drop the map
10955 	 *	entries from the copy into the destination
10956 	 *	map, they must come from the same pool.
10957 	 */
10958 
10959 	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
10960 		/*
10961 		 * Mismatches occur when dealing with the default
10962 		 * pager.
10963 		 */
10964 		vm_map_entry_t  next, new;
10965 
10966 		/*
10967 		 * Find the zone that the copies were allocated from
10968 		 */
10969 
10970 		entry = vm_map_copy_first_entry(copy);
10971 
10972 		/*
10973 		 * Reinitialize the copy so that vm_map_copy_entry_link
10974 		 * will work.
10975 		 */
10976 		vm_map_store_copy_reset(copy, entry);
10977 		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
10978 
10979 		/*
10980 		 * Copy each entry.
10981 		 */
10982 		while (entry != vm_map_copy_to_entry(copy)) {
10983 			new = vm_map_copy_entry_create(copy);
10984 			vm_map_entry_copy_full(new, entry);
10985 			new->vme_no_copy_on_read = FALSE;
10986 			assert(!new->iokit_acct);
10987 			if (new->is_sub_map) {
10988 				/* clr address space specifics */
10989 				new->use_pmap = FALSE;
10990 			}
10991 			vm_map_copy_entry_link(copy,
10992 			    vm_map_copy_last_entry(copy),
10993 			    new);
10994 			next = entry->vme_next;
10995 			vm_map_entry_dispose(entry);
10996 			entry = next;
10997 		}
10998 	}
10999 
11000 	/*
11001 	 *	Adjust the addresses in the copy chain, and
11002 	 *	reset the region attributes.
11003 	 */
11004 
11005 	for (entry = vm_map_copy_first_entry(copy);
11006 	    entry != vm_map_copy_to_entry(copy);
11007 	    entry = entry->vme_next) {
11008 		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11009 			/*
11010 			 * We're injecting this copy entry into a map that
11011 			 * has the standard page alignment, so clear
11012 			 * "map_aligned" (which might have been inherited
11013 			 * from the original map entry).
11014 			 */
11015 			entry->map_aligned = FALSE;
11016 		}
11017 
11018 		entry->vme_start += adjustment;
11019 		entry->vme_end += adjustment;
11020 
11021 		if (entry->map_aligned) {
11022 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11023 			    VM_MAP_PAGE_MASK(dst_map)));
11024 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11025 			    VM_MAP_PAGE_MASK(dst_map)));
11026 		}
11027 
11028 		entry->inheritance = VM_INHERIT_DEFAULT;
11029 		entry->protection = VM_PROT_DEFAULT;
11030 		entry->max_protection = VM_PROT_ALL;
11031 		entry->behavior = VM_BEHAVIOR_DEFAULT;
11032 
11033 		/*
11034 		 * If the entry is now wired,
11035 		 * map the pages into the destination map.
11036 		 */
11037 		if (entry->wired_count != 0) {
11038 			vm_map_offset_t va;
11039 			vm_object_offset_t       offset;
11040 			vm_object_t object;
11041 			vm_prot_t prot;
11042 			int     type_of_fault;
11043 
11044 			/* TODO4K would need to use actual page size */
11045 			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11046 
11047 			object = VME_OBJECT(entry);
11048 			offset = VME_OFFSET(entry);
11049 			va = entry->vme_start;
11050 
11051 			pmap_pageable(dst_map->pmap,
11052 			    entry->vme_start,
11053 			    entry->vme_end,
11054 			    TRUE);
11055 
11056 			while (va < entry->vme_end) {
11057 				vm_page_t       m;
11058 				struct vm_object_fault_info fault_info = {};
11059 
11060 				/*
11061 				 * Look up the page in the object.
11062 				 * Assert that the page will be found in the
11063 				 * top object:
11064 				 * either
11065 				 *	the object was newly created by
11066 				 *	vm_object_copy_slowly, and has
11067 				 *	copies of all of the pages from
11068 				 *	the source object
11069 				 * or
11070 				 *	the object was moved from the old
11071 				 *	map entry; because the old map
11072 				 *	entry was wired, all of the pages
11073 				 *	were in the top-level object.
11074 				 *	(XXX not true if we wire pages for
11075 				 *	 reading)
11076 				 */
11077 				vm_object_lock(object);
11078 
11079 				m = vm_page_lookup(object, offset);
11080 				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11081 				    m->vmp_absent) {
11082 					panic("vm_map_copyout: wiring %p", m);
11083 				}
11084 
11085 				prot = entry->protection;
11086 
11087 				if (override_nx(dst_map, VME_ALIAS(entry)) &&
11088 				    prot) {
11089 					prot |= VM_PROT_EXECUTE;
11090 				}
11091 
11092 				type_of_fault = DBG_CACHE_HIT_FAULT;
11093 
11094 				fault_info.user_tag = VME_ALIAS(entry);
11095 				fault_info.pmap_options = 0;
11096 				if (entry->iokit_acct ||
11097 				    (!entry->is_sub_map && !entry->use_pmap)) {
11098 					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11099 				}
11100 
11101 				vm_fault_enter(m,
11102 				    dst_map->pmap,
11103 				    va,
11104 				    PAGE_SIZE, 0,
11105 				    prot,
11106 				    prot,
11107 				    VM_PAGE_WIRED(m),
11108 				    FALSE,            /* change_wiring */
11109 				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
11110 				    &fault_info,
11111 				    NULL,             /* need_retry */
11112 				    &type_of_fault);
11113 
11114 				vm_object_unlock(object);
11115 
11116 				offset += PAGE_SIZE_64;
11117 				va += PAGE_SIZE;
11118 			}
11119 		}
11120 	}
11121 
11122 after_adjustments:
11123 
11124 	/*
11125 	 *	Correct the page alignment for the result
11126 	 */
11127 
11128 	*dst_addr = start + (copy->offset - vm_copy_start);
11129 
11130 #if KASAN
11131 	kasan_notify_address(*dst_addr, size);
11132 #endif
11133 
11134 	/*
11135 	 *	Update the hints and the map size
11136 	 */
11137 
11138 	if (consume_on_success) {
11139 		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11140 	} else {
11141 		SAVE_HINT_MAP_WRITE(dst_map, last);
11142 	}
11143 
11144 	dst_map->size += size;
11145 
11146 	/*
11147 	 *	Link in the copy
11148 	 */
11149 
11150 	if (consume_on_success) {
11151 		vm_map_copy_insert(dst_map, last, copy);
11152 		if (copy != original_copy) {
11153 			vm_map_copy_discard(original_copy);
11154 			original_copy = VM_MAP_COPY_NULL;
11155 		}
11156 	} else {
11157 		vm_map_copy_remap(dst_map, last, copy, adjustment,
11158 		    cur_protection, max_protection,
11159 		    inheritance);
11160 		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11161 			vm_map_copy_discard(copy);
11162 			copy = original_copy;
11163 		}
11164 	}
11165 
11166 
11167 	vm_map_unlock(dst_map);
11168 
11169 	/*
11170 	 * XXX	If wiring_required, call vm_map_pageable
11171 	 */
11172 
11173 	return KERN_SUCCESS;
11174 }
11175 
11176 /*
11177  *	Routine:	vm_map_copyin
11178  *
11179  *	Description:
11180  *		see vm_map_copyin_common.  Exported via Unsupported.exports.
11181  *
11182  */
11183 
11184 #undef vm_map_copyin
11185 
11186 kern_return_t
11187 vm_map_copyin(
11188 	vm_map_t                        src_map,
11189 	vm_map_address_t        src_addr,
11190 	vm_map_size_t           len,
11191 	boolean_t                       src_destroy,
11192 	vm_map_copy_t           *copy_result)   /* OUT */
11193 {
11194 	return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11195 	           FALSE, copy_result, FALSE);
11196 }
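
/*
 * Illustrative sketch (not part of the build): a full copyin/copyout round
 * trip moving a range between two maps.  "src_map", "dst_map", "addr" and
 * "len" are hypothetical caller names.
 *
 *	vm_map_copy_t    copy;
 *	vm_map_address_t dst_addr;
 *
 *	kr = vm_map_copyin(src_map, addr, len,
 *	    FALSE,		// src_destroy: leave the source mapping in place
 *	    &copy);
 *	if (kr != KERN_SUCCESS) {
 *		return kr;
 *	}
 *	kr = vm_map_copyout(dst_map, &dst_addr, copy);
 *	if (kr != KERN_SUCCESS) {
 *		vm_map_copy_discard(copy);	// copyout did not consume it
 *	}
 */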
11197 
11198 /*
11199  *	Routine:	vm_map_copyin_common
11200  *
11201  *	Description:
11202  *		Copy the specified region (src_addr, len) from the
11203  *		source address space (src_map), possibly removing
11204  *		the region from the source address space (src_destroy).
11205  *
11206  *	Returns:
11207  *		A vm_map_copy_t object (copy_result), suitable for
11208  *		insertion into another address space (using vm_map_copyout),
11209  *		copying over another address space region (using
11210  *		vm_map_copy_overwrite).  If the copy is unused, it
11211  *		should be destroyed (using vm_map_copy_discard).
11212  *
11213  *	In/out conditions:
11214  *		The source map should not be locked on entry.
11215  */
11216 
11217 typedef struct submap_map {
11218 	vm_map_t        parent_map;
11219 	vm_map_offset_t base_start;
11220 	vm_map_offset_t base_end;
11221 	vm_map_size_t   base_len;
11222 	struct submap_map *next;
11223 } submap_map_t;
11224 
11225 kern_return_t
11226 vm_map_copyin_common(
11227 	vm_map_t        src_map,
11228 	vm_map_address_t src_addr,
11229 	vm_map_size_t   len,
11230 	boolean_t       src_destroy,
11231 	__unused boolean_t      src_volatile,
11232 	vm_map_copy_t   *copy_result,   /* OUT */
11233 	boolean_t       use_maxprot)
11234 {
11235 	int flags;
11236 
11237 	flags = 0;
11238 	if (src_destroy) {
11239 		flags |= VM_MAP_COPYIN_SRC_DESTROY;
11240 	}
11241 	if (use_maxprot) {
11242 		flags |= VM_MAP_COPYIN_USE_MAXPROT;
11243 	}
11244 	return vm_map_copyin_internal(src_map,
11245 	           src_addr,
11246 	           len,
11247 	           flags,
11248 	           copy_result);
11249 }
11250 kern_return_t
11251 vm_map_copyin_internal(
11252 	vm_map_t        src_map,
11253 	vm_map_address_t src_addr,
11254 	vm_map_size_t   len,
11255 	int             flags,
11256 	vm_map_copy_t   *copy_result)   /* OUT */
11257 {
11258 	vm_map_entry_t  tmp_entry;      /* Result of last map lookup --
11259 	                                 * in multi-level lookup, this
11260 	                                 * entry contains the actual
11261 	                                 * vm_object/offset.
11262 	                                 */
11263 	vm_map_entry_t  new_entry = VM_MAP_ENTRY_NULL;  /* Map entry for copy */
11264 
11265 	vm_map_offset_t src_start;      /* Start of current entry --
11266 	                                 * where copy is taking place now
11267 	                                 */
11268 	vm_map_offset_t src_end;        /* End of entire region to be
11269 	                                 * copied */
11270 	vm_map_offset_t src_base;
11271 	vm_map_t        base_map = src_map;
11272 	boolean_t       map_share = FALSE;
11273 	submap_map_t    *parent_maps = NULL;
11274 
11275 	vm_map_copy_t   copy;           /* Resulting copy */
11276 	vm_map_address_t copy_addr;
11277 	vm_map_size_t   copy_size;
11278 	boolean_t       src_destroy;
11279 	boolean_t       use_maxprot;
11280 	boolean_t       preserve_purgeable;
11281 	boolean_t       entry_was_shared;
11282 	vm_map_entry_t  saved_src_entry;
11283 
11284 	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11285 		return KERN_INVALID_ARGUMENT;
11286 	}
11287 
11288 #if CONFIG_KERNEL_TBI
11289 	if (src_map->pmap == kernel_pmap) {
11290 		src_addr = VM_KERNEL_TBI_FILL(src_addr);
11291 	}
11292 #endif /* CONFIG_KERNEL_TBI */
11293 
11294 	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
11295 	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
11296 	preserve_purgeable =
11297 	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
11298 
11299 	/*
11300 	 *	Check for copies of zero bytes.
11301 	 */
11302 
11303 	if (len == 0) {
11304 		*copy_result = VM_MAP_COPY_NULL;
11305 		return KERN_SUCCESS;
11306 	}
11307 
11308 	/*
11309 	 *	Check that the end address doesn't overflow
11310 	 */
11311 	src_end = src_addr + len;
11312 	if (src_end < src_addr) {
11313 		return KERN_INVALID_ADDRESS;
11314 	}
11315 
11316 	/*
11317 	 *	Compute (page aligned) start and end of region
11318 	 */
11319 	src_start = vm_map_trunc_page(src_addr,
11320 	    VM_MAP_PAGE_MASK(src_map));
11321 	src_end = vm_map_round_page(src_end,
11322 	    VM_MAP_PAGE_MASK(src_map));
11323 
11324 	/*
11325 	 * If the copy is sufficiently small, use a kernel buffer instead
11326 	 * of making a virtual copy.  The theory being that the cost of
11327 	 * setting up VM (and taking C-O-W faults) dominates the copy costs
11328 	 * for small regions.
11329 	 */
11330 	if ((len <= msg_ool_size_small) &&
11331 	    !use_maxprot &&
11332 	    !preserve_purgeable &&
11333 	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
11334 	    /*
11335 	     * Since the "msg_ool_size_small" threshold was increased and
11336 	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
11337 	     * address space limits, we revert to doing a virtual copy if the
11338 	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
11339 	     * of the commpage would now fail when it used to work.
11340 	     */
11341 	    (src_start >= vm_map_min(src_map) &&
11342 	    src_start < vm_map_max(src_map) &&
11343 	    src_end >= vm_map_min(src_map) &&
11344 	    src_end < vm_map_max(src_map))) {
11345 		return vm_map_copyin_kernel_buffer(src_map, src_addr, len,
11346 		           src_destroy, copy_result);
11347 	}
11348 
11349 	/*
11350 	 *	Allocate a header element for the list.
11351 	 *
11352 	 *	Use the start and end in the header to
11353 	 *	remember the endpoints prior to rounding.
11354 	 */
11355 
11356 	copy = vm_map_copy_allocate();
11357 	copy->type = VM_MAP_COPY_ENTRY_LIST;
11358 	copy->cpy_hdr.entries_pageable = TRUE;
11359 	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
11360 
11361 	vm_map_store_init( &(copy->cpy_hdr));
11362 
11363 	copy->offset = src_addr;
11364 	copy->size = len;
11365 
11366 	new_entry = vm_map_copy_entry_create(copy);
11367 
11368 #define RETURN(x)                                               \
11369 	MACRO_BEGIN                                             \
11370 	vm_map_unlock(src_map);                                 \
11371 	if(src_map != base_map)                                 \
11372 	        vm_map_deallocate(src_map);                     \
11373 	if (new_entry != VM_MAP_ENTRY_NULL)                     \
11374 	        vm_map_copy_entry_dispose(new_entry);           \
11375 	vm_map_copy_discard(copy);                              \
11376 	{                                                       \
11377 	        submap_map_t	*_ptr;                          \
11378                                                                 \
11379 	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
11380 	                parent_maps=parent_maps->next;          \
11381 	                if (_ptr->parent_map != base_map)       \
11382 	                        vm_map_deallocate(_ptr->parent_map);    \
11383 	                kfree_type(submap_map_t, _ptr);         \
11384 	        }                                               \
11385 	}                                                       \
11386 	MACRO_RETURN(x);                                        \
11387 	MACRO_END
11388 
11389 	/*
11390 	 *	Find the beginning of the region.
11391 	 */
11392 
11393 	vm_map_lock(src_map);
11394 
11395 	/*
11396 	 * Lookup the original "src_addr" rather than the truncated
11397 	 * "src_start", in case "src_start" falls in a non-map-aligned
11398 	 * map entry *before* the map entry that contains "src_addr"...
11399 	 */
11400 	if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) {
11401 		RETURN(KERN_INVALID_ADDRESS);
11402 	}
11403 	if (!tmp_entry->is_sub_map) {
11404 		/*
11405 		 * ... but clip to the map-rounded "src_start" rather than
11406 		 * "src_addr" to preserve map-alignment.  We'll adjust the
11407 		 * first copy entry at the end, if needed.
11408 		 */
11409 		vm_map_clip_start(src_map, tmp_entry, src_start);
11410 	}
11411 	if (src_start < tmp_entry->vme_start) {
11412 		/*
11413 		 * Move "src_start" up to the start of the
11414 		 * first map entry to copy.
11415 		 */
11416 		src_start = tmp_entry->vme_start;
11417 	}
11418 	/* set for later submap fix-up */
11419 	copy_addr = src_start;
11420 
11421 	/*
11422 	 *	Go through entries until we get to the end.
11423 	 */
11424 
11425 	while (TRUE) {
11426 		vm_map_entry_t  src_entry = tmp_entry;  /* Top-level entry */
11427 		vm_map_size_t   src_size;               /* Size of source
11428 		                                         * map entry (in both
11429 		                                         * maps)
11430 		                                         */
11431 
11432 		vm_object_t             src_object;     /* Object to copy */
11433 		vm_object_offset_t      src_offset;
11434 
11435 		vm_object_t             new_copy_object; /* vm_object_copy_* result */
11436 
11437 		boolean_t       src_needs_copy;         /* Should source map
11438 		                                         * be made read-only
11439 		                                         * for copy-on-write?
11440 		                                         */
11441 
11442 		boolean_t       new_entry_needs_copy;   /* Will new entry be COW? */
11443 
11444 		boolean_t       was_wired;              /* Was source wired? */
11445 		boolean_t       saved_used_for_jit;     /* Saved used_for_jit. */
11446 		vm_map_version_t version;               /* Version before locks
11447 		                                         * dropped to make copy
11448 		                                         */
11449 		kern_return_t   result;                 /* Return value from
11450 		                                         * copy_strategically.
11451 		                                         */
11452 		while (tmp_entry->is_sub_map) {
11453 			vm_map_size_t submap_len;
11454 			submap_map_t *ptr;
11455 
11456 			ptr = kalloc_type(submap_map_t, Z_WAITOK);
11457 			ptr->next = parent_maps;
11458 			parent_maps = ptr;
11459 			ptr->parent_map = src_map;
11460 			ptr->base_start = src_start;
11461 			ptr->base_end = src_end;
11462 			submap_len = tmp_entry->vme_end - src_start;
11463 			if (submap_len > (src_end - src_start)) {
11464 				submap_len = src_end - src_start;
11465 			}
11466 			ptr->base_len = submap_len;
11467 
11468 			src_start -= tmp_entry->vme_start;
11469 			src_start += VME_OFFSET(tmp_entry);
11470 			src_end = src_start + submap_len;
11471 			src_map = VME_SUBMAP(tmp_entry);
11472 			vm_map_lock(src_map);
11473 			/* keep an outstanding reference for all maps in */
11474 			/* the parent map chain, except the base map */
11475 			vm_map_reference(src_map);
11476 			vm_map_unlock(ptr->parent_map);
11477 			if (!vm_map_lookup_entry(
11478 				    src_map, src_start, &tmp_entry)) {
11479 				RETURN(KERN_INVALID_ADDRESS);
11480 			}
11481 			map_share = TRUE;
11482 			if (!tmp_entry->is_sub_map) {
11483 				vm_map_clip_start(src_map, tmp_entry, src_start);
11484 			}
11485 			src_entry = tmp_entry;
11486 		}
11487 		/* we are now in the lowest level submap... */
11488 
11489 		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
11490 		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
11491 			/* This is not supported for now. In the future  */
11492 			/* we will need to detect the phys_contig   */
11493 			/* condition and then upgrade copy_slowly   */
11494 			/* to do physical copy from the device mem  */
11495 			/* based object. We can piggy-back off of   */
11496 			/* the was wired boolean to set-up the      */
11497 			/* proper handling */
11498 			RETURN(KERN_PROTECTION_FAILURE);
11499 		}
11500 		/*
11501 		 *	Create a new address map entry to hold the result.
11502 		 *	Fill in the fields from the appropriate source entries.
11503 		 *	We must unlock the source map to do this if we need
11504 		 *	to allocate a map entry.
11505 		 */
11506 		if (new_entry == VM_MAP_ENTRY_NULL) {
11507 			version.main_timestamp = src_map->timestamp;
11508 			vm_map_unlock(src_map);
11509 
11510 			new_entry = vm_map_copy_entry_create(copy);
11511 
11512 			vm_map_lock(src_map);
11513 			if ((version.main_timestamp + 1) != src_map->timestamp) {
11514 				if (!vm_map_lookup_entry(src_map, src_start,
11515 				    &tmp_entry)) {
11516 					RETURN(KERN_INVALID_ADDRESS);
11517 				}
11518 				if (!tmp_entry->is_sub_map) {
11519 					vm_map_clip_start(src_map, tmp_entry, src_start);
11520 				}
11521 				continue; /* restart w/ new tmp_entry */
11522 			}
11523 		}
11524 
11525 		/*
11526 		 *	Verify that the region can be read.
11527 		 */
11528 		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
11529 		    !use_maxprot) ||
11530 		    (src_entry->max_protection & VM_PROT_READ) == 0) {
11531 			RETURN(KERN_PROTECTION_FAILURE);
11532 		}
11533 
11534 		/*
11535 		 *	Clip against the endpoints of the entire region.
11536 		 */
11537 
11538 		vm_map_clip_end(src_map, src_entry, src_end);
11539 
11540 		src_size = src_entry->vme_end - src_start;
11541 		src_object = VME_OBJECT(src_entry);
11542 		src_offset = VME_OFFSET(src_entry);
11543 		was_wired = (src_entry->wired_count != 0);
11544 
11545 		vm_map_entry_copy(src_map, new_entry, src_entry);
11546 		if (new_entry->is_sub_map) {
11547 			/* clr address space specifics */
11548 			new_entry->use_pmap = FALSE;
11549 		} else {
11550 			/*
11551 			 * We're dealing with a copy-on-write operation,
11552 			 * so the resulting mapping should not inherit the
11553 			 * original mapping's accounting settings.
11554 			 * "iokit_acct" should have been cleared in
11555 			 * vm_map_entry_copy().
11556 			 * "use_pmap" should be reset to its default (TRUE)
11557 			 * so that the new mapping gets accounted for in
11558 			 * the task's memory footprint.
11559 			 */
11560 			assert(!new_entry->iokit_acct);
11561 			new_entry->use_pmap = TRUE;
11562 		}
11563 
11564 		/*
11565 		 *	Attempt non-blocking copy-on-write optimizations.
11566 		 */
11567 
11568 		/*
11569 		 * If we are destroying the source, and the object
11570 		 * is internal, we could move the object reference
11571 		 * from the source to the copy.  The copy is
11572 		 * copy-on-write only if the source is.
11573 		 * We make another reference to the object, because
11574 		 * destroying the source entry will deallocate it.
11575 		 *
11576 		 * This memory transfer has to be atomic, (to prevent
11577 		 * the VM object from being shared or copied while
11578 		 * it's being moved here), so we could only do this
11579 		 * if we won't have to unlock the VM map until the
11580 		 * original mapping has been fully removed.
11581 		 */
11582 
11583 RestartCopy:
11584 		if ((src_object == VM_OBJECT_NULL ||
11585 		    (!was_wired && !map_share && !tmp_entry->is_shared
11586 		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
11587 		    vm_object_copy_quickly(
11588 			    VME_OBJECT(new_entry),
11589 			    src_offset,
11590 			    src_size,
11591 			    &src_needs_copy,
11592 			    &new_entry_needs_copy)) {
11593 			new_entry->needs_copy = new_entry_needs_copy;
11594 
11595 			/*
11596 			 *	Handle copy-on-write obligations
11597 			 */
11598 
11599 			if (src_needs_copy && !tmp_entry->needs_copy) {
11600 				vm_prot_t prot;
11601 
11602 				prot = src_entry->protection & ~VM_PROT_WRITE;
11603 
11604 				if (override_nx(src_map, VME_ALIAS(src_entry))
11605 				    && prot) {
11606 					prot |= VM_PROT_EXECUTE;
11607 				}
11608 
11609 				vm_object_pmap_protect(
11610 					src_object,
11611 					src_offset,
11612 					src_size,
11613 					(src_entry->is_shared ?
11614 					PMAP_NULL
11615 					: src_map->pmap),
11616 					VM_MAP_PAGE_SIZE(src_map),
11617 					src_entry->vme_start,
11618 					prot);
11619 
11620 				assert(tmp_entry->wired_count == 0);
11621 				tmp_entry->needs_copy = TRUE;
11622 			}
11623 
11624 			/*
11625 			 *	The map has never been unlocked, so it's safe
11626 			 *	to move to the next entry rather than doing
11627 			 *	another lookup.
11628 			 */
11629 
11630 			goto CopySuccessful;
11631 		}
11632 
11633 		entry_was_shared = tmp_entry->is_shared;
11634 
11635 		/*
11636 		 *	Take an object reference, so that we may
11637 		 *	release the map lock(s).
11638 		 */
11639 
11640 		assert(src_object != VM_OBJECT_NULL);
11641 		vm_object_reference(src_object);
11642 
11643 		/*
11644 		 *	Record the timestamp for later verification.
11645 		 *	Unlock the map.
11646 		 */
11647 
11648 		version.main_timestamp = src_map->timestamp;
11649 		vm_map_unlock(src_map); /* Increments timestamp once! */
11650 		saved_src_entry = src_entry;
11651 		tmp_entry = VM_MAP_ENTRY_NULL;
11652 		src_entry = VM_MAP_ENTRY_NULL;
11653 
11654 		/*
11655 		 *	Perform the copy
11656 		 */
11657 
11658 		if (was_wired ||
11659 		    (debug4k_no_cow_copyin &&
11660 		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
11661 CopySlowly:
11662 			vm_object_lock(src_object);
11663 			result = vm_object_copy_slowly(
11664 				src_object,
11665 				src_offset,
11666 				src_size,
11667 				THREAD_UNINT,
11668 				&new_copy_object);
11669 			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
11670 			saved_used_for_jit = new_entry->used_for_jit;
11671 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
11672 			new_entry->used_for_jit = saved_used_for_jit;
11673 			VME_OFFSET_SET(new_entry,
11674 			    src_offset - vm_object_trunc_page(src_offset));
11675 			new_entry->needs_copy = FALSE;
11676 		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
11677 		    (entry_was_shared || map_share)) {
11678 			vm_object_t new_object;
11679 
11680 			vm_object_lock_shared(src_object);
11681 			new_object = vm_object_copy_delayed(
11682 				src_object,
11683 				src_offset,
11684 				src_size,
11685 				TRUE);
11686 			if (new_object == VM_OBJECT_NULL) {
11687 				goto CopySlowly;
11688 			}
11689 
11690 			VME_OBJECT_SET(new_entry, new_object, false, 0);
11691 			assert(new_entry->wired_count == 0);
11692 			new_entry->needs_copy = TRUE;
11693 			assert(!new_entry->iokit_acct);
11694 			assert(new_object->purgable == VM_PURGABLE_DENY);
11695 			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
11696 			result = KERN_SUCCESS;
11697 		} else {
11698 			vm_object_offset_t new_offset;
11699 			new_offset = VME_OFFSET(new_entry);
11700 			result = vm_object_copy_strategically(src_object,
11701 			    src_offset,
11702 			    src_size,
11703 			    &new_copy_object,
11704 			    &new_offset,
11705 			    &new_entry_needs_copy);
11706 			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
11707 			saved_used_for_jit = new_entry->used_for_jit;
11708 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
11709 			new_entry->used_for_jit = saved_used_for_jit;
11710 			if (new_offset != VME_OFFSET(new_entry)) {
11711 				VME_OFFSET_SET(new_entry, new_offset);
11712 			}
11713 
11714 			new_entry->needs_copy = new_entry_needs_copy;
11715 		}
11716 
11717 		if (result == KERN_SUCCESS &&
11718 		    ((preserve_purgeable &&
11719 		    src_object->purgable != VM_PURGABLE_DENY) ||
11720 		    new_entry->used_for_jit)) {
11721 			/*
11722 			 * Purgeable objects should be COPY_NONE, true share;
11723 			 * this should be propagated to the copy.
11724 			 *
11725 			 * Also force mappings the pmap specially protects to
11726 			 * be COPY_NONE; trying to COW these mappings would
11727 			 * change the effective protections, which could have
11728 			 * side effects if the pmap layer relies on the
11729 			 * specified protections.
11730 			 */
11731 
11732 			vm_object_t     new_object;
11733 
11734 			new_object = VME_OBJECT(new_entry);
11735 			assert(new_object != src_object);
11736 			vm_object_lock(new_object);
11737 			assert(new_object->ref_count == 1);
11738 			assert(new_object->shadow == VM_OBJECT_NULL);
11739 			assert(new_object->copy == VM_OBJECT_NULL);
11740 			assert(new_object->vo_owner == NULL);
11741 
11742 			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
11743 
11744 			if (preserve_purgeable &&
11745 			    src_object->purgable != VM_PURGABLE_DENY) {
11746 				new_object->true_share = TRUE;
11747 
11748 				/* start as non-volatile with no owner... */
11749 				new_object->purgable = VM_PURGABLE_NONVOLATILE;
11750 				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
11751 				/* ... and move to src_object's purgeable state */
11752 				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
11753 					int state;
11754 					state = src_object->purgable;
11755 					vm_object_purgable_control(
11756 						new_object,
11757 						VM_PURGABLE_SET_STATE_FROM_KERNEL,
11758 						&state);
11759 				}
11760 				/* no pmap accounting for purgeable objects */
11761 				new_entry->use_pmap = FALSE;
11762 			}
11763 
11764 			vm_object_unlock(new_object);
11765 			new_object = VM_OBJECT_NULL;
11766 		}
11767 
11768 		if (result != KERN_SUCCESS &&
11769 		    result != KERN_MEMORY_RESTART_COPY) {
11770 			vm_map_lock(src_map);
11771 			RETURN(result);
11772 		}
11773 
11774 		/*
11775 		 *	Throw away the extra reference
11776 		 */
11777 
11778 		vm_object_deallocate(src_object);
11779 
11780 		/*
11781 		 *	Verify that the map has not substantially
11782 		 *	changed while the copy was being made.
11783 		 */
11784 
11785 		vm_map_lock(src_map);
11786 
11787 		if ((version.main_timestamp + 1) == src_map->timestamp) {
11788 			/* src_map hasn't changed: src_entry is still valid */
11789 			src_entry = saved_src_entry;
11790 			goto VerificationSuccessful;
11791 		}
11792 
11793 		/*
11794 		 *	Simple version comparison failed.
11795 		 *
11796 		 *	Retry the lookup and verify that the
11797 		 *	same object/offset are still present.
11798 		 *
11799 		 *	[Note: a memory manager that colludes with
11800 		 *	the calling task can detect that we have
11801 		 *	cheated.  While the map was unlocked, the
11802 		 *	mapping could have been changed and restored.]
11803 		 */
11804 
11805 		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
11806 			if (result != KERN_MEMORY_RESTART_COPY) {
11807 				vm_object_deallocate(VME_OBJECT(new_entry));
11808 				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
11809 				/* reset accounting state */
11810 				new_entry->iokit_acct = FALSE;
11811 				new_entry->use_pmap = TRUE;
11812 			}
11813 			RETURN(KERN_INVALID_ADDRESS);
11814 		}
11815 
11816 		src_entry = tmp_entry;
11817 		vm_map_clip_start(src_map, src_entry, src_start);
11818 
11819 		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
11820 		    !use_maxprot) ||
11821 		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
11822 			goto VerificationFailed;
11823 		}
11824 
11825 		if (src_entry->vme_end < new_entry->vme_end) {
11826 			/*
11827 			 * This entry might have been shortened
11828 			 * (vm_map_clip_end) or been replaced with
11829 			 * an entry that ends closer to "src_start"
11830 			 * than before.
11831 			 * Adjust "new_entry" accordingly; copying
11832 			 * less memory would be correct but we also
11833 			 * redo the copy (see below) if the new entry
11834 			 * no longer points at the same object/offset.
11835 			 */
11836 			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
11837 			    VM_MAP_COPY_PAGE_MASK(copy)));
11838 			new_entry->vme_end = src_entry->vme_end;
11839 			src_size = new_entry->vme_end - src_start;
11840 		} else if (src_entry->vme_end > new_entry->vme_end) {
11841 			/*
11842 			 * This entry might have been extended
11843 			 * (vm_map_entry_simplify() or coalesce)
11844 			 * or been replaced with an entry that ends farther
11845 			 * from "src_start" than before.
11846 			 *
11847 			 * We've called vm_object_copy_*() only on
11848 			 * the previous <start:end> range, so we can't
11849 			 * just extend new_entry.  We have to re-do
11850 			 * the copy based on the new entry as if it was
11851 			 * pointing at a different object/offset (see
11852 			 * "Verification failed" below).
11853 			 */
11854 		}
11855 
11856 		if ((VME_OBJECT(src_entry) != src_object) ||
11857 		    (VME_OFFSET(src_entry) != src_offset) ||
11858 		    (src_entry->vme_end > new_entry->vme_end)) {
11859 			/*
11860 			 *	Verification failed.
11861 			 *
11862 			 *	Start over with this top-level entry.
11863 			 */
11864 
11865 VerificationFailed:     ;
11866 
11867 			vm_object_deallocate(VME_OBJECT(new_entry));
11868 			tmp_entry = src_entry;
11869 			continue;
11870 		}
11871 
11872 		/*
11873 		 *	Verification succeeded.
11874 		 */
11875 
11876 VerificationSuccessful:;
11877 
11878 		if (result == KERN_MEMORY_RESTART_COPY) {
11879 			goto RestartCopy;
11880 		}
11881 
11882 		/*
11883 		 *	Copy succeeded.
11884 		 */
11885 
11886 CopySuccessful: ;
11887 
11888 		/*
11889 		 *	Link in the new copy entry.
11890 		 */
11891 
11892 		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
11893 		    new_entry);
11894 
11895 		/*
11896 		 *	Determine whether the entire region
11897 		 *	has been copied.
11898 		 */
11899 		src_base = src_start;
11900 		src_start = new_entry->vme_end;
11901 		new_entry = VM_MAP_ENTRY_NULL;
11902 		while ((src_start >= src_end) && (src_end != 0)) {
11903 			submap_map_t    *ptr;
11904 
11905 			if (src_map == base_map) {
11906 				/* back to the top */
11907 				break;
11908 			}
11909 
11910 			ptr = parent_maps;
11911 			assert(ptr != NULL);
11912 			parent_maps = parent_maps->next;
11913 
11914 			/* fix up the damage we did in that submap */
11915 			vm_map_simplify_range(src_map,
11916 			    src_base,
11917 			    src_end);
11918 
11919 			vm_map_unlock(src_map);
11920 			vm_map_deallocate(src_map);
11921 			vm_map_lock(ptr->parent_map);
11922 			src_map = ptr->parent_map;
11923 			src_base = ptr->base_start;
11924 			src_start = ptr->base_start + ptr->base_len;
11925 			src_end = ptr->base_end;
11926 			if (!vm_map_lookup_entry(src_map,
11927 			    src_start,
11928 			    &tmp_entry) &&
11929 			    (src_end > src_start)) {
11930 				RETURN(KERN_INVALID_ADDRESS);
11931 			}
11932 			kfree_type(submap_map_t, ptr);
11933 			if (parent_maps == NULL) {
11934 				map_share = FALSE;
11935 			}
11936 			src_entry = tmp_entry->vme_prev;
11937 		}
11938 
11939 		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
11940 		    (src_start >= src_addr + len) &&
11941 		    (src_addr + len != 0)) {
11942 			/*
11943 			 * Stop copying now, even though we haven't reached
11944 			 * "src_end".  We'll adjust the end of the last copy
11945 			 * entry at the end, if needed.
11946 			 *
11947 			 * If src_map's alignment is different from the
11948 			 * system's page-alignment, there could be
11949 			 * extra non-map-aligned map entries between
11950 			 * the original (non-rounded) "src_addr + len"
11951 			 * and the rounded "src_end".
11952 			 * We do not want to copy those map entries since
11953 			 * they're not part of the copied range.
11954 			 */
11955 			break;
11956 		}
11957 
11958 		if ((src_start >= src_end) && (src_end != 0)) {
11959 			break;
11960 		}
11961 
11962 		/*
11963 		 *	Verify that there are no gaps in the region
11964 		 */
11965 
11966 		tmp_entry = src_entry->vme_next;
11967 		if ((tmp_entry->vme_start != src_start) ||
11968 		    (tmp_entry == vm_map_to_entry(src_map))) {
11969 			RETURN(KERN_INVALID_ADDRESS);
11970 		}
11971 	}
11972 
11973 	/*
11974 	 * If the source should be destroyed, do it now, since the
11975 	 * copy was successful.
11976 	 */
11977 	if (src_destroy) {
11978 		(void)vm_map_remove_and_unlock(src_map,
11979 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11980 		    src_end,
11981 		    ((src_map == kernel_map) ?
11982 		    VM_MAP_REMOVE_KUNWIRE :
11983 		    VM_MAP_REMOVE_NO_FLAGS),
11984 		    KMEM_GUARD_NONE);
11985 	} else {
11986 		/* fix up the damage we did in the base map */
11987 		vm_map_simplify_range(
11988 			src_map,
11989 			vm_map_trunc_page(src_addr,
11990 			VM_MAP_PAGE_MASK(src_map)),
11991 			vm_map_round_page(src_end,
11992 			VM_MAP_PAGE_MASK(src_map)));
11993 		vm_map_unlock(src_map);
11994 	}
11995 
11996 	tmp_entry = VM_MAP_ENTRY_NULL;
11997 
11998 	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
11999 	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12000 		vm_map_offset_t original_start, original_offset, original_end;
12001 
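		/*
		 * The source map uses a larger page size than the copy:
		 * re-align the first and last copy entries to the system
		 * page boundaries of the requested range instead of the
		 * source map's larger page boundaries.
		 */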
12002 		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12003 
12004 		/* adjust alignment of first copy_entry's "vme_start" */
12005 		tmp_entry = vm_map_copy_first_entry(copy);
12006 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12007 			vm_map_offset_t adjustment;
12008 
12009 			original_start = tmp_entry->vme_start;
12010 			original_offset = VME_OFFSET(tmp_entry);
12011 
12012 			/* map-align the start of the first copy entry... */
12013 			adjustment = (tmp_entry->vme_start -
12014 			    vm_map_trunc_page(
12015 				    tmp_entry->vme_start,
12016 				    VM_MAP_PAGE_MASK(src_map)));
12017 			tmp_entry->vme_start -= adjustment;
12018 			VME_OFFSET_SET(tmp_entry,
12019 			    VME_OFFSET(tmp_entry) - adjustment);
12020 			copy_addr -= adjustment;
12021 			assert(tmp_entry->vme_start < tmp_entry->vme_end);
12022 			/* ... adjust for mis-aligned start of copy range */
12023 			adjustment =
12024 			    (vm_map_trunc_page(copy->offset,
12025 			    PAGE_MASK) -
12026 			    vm_map_trunc_page(copy->offset,
12027 			    VM_MAP_PAGE_MASK(src_map)));
12028 			if (adjustment) {
12029 				assert(page_aligned(adjustment));
12030 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12031 				tmp_entry->vme_start += adjustment;
12032 				VME_OFFSET_SET(tmp_entry,
12033 				    (VME_OFFSET(tmp_entry) +
12034 				    adjustment));
12035 				copy_addr += adjustment;
12036 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12037 			}
12038 
12039 			/*
12040 			 * Assert that the adjustments haven't exposed
12041 			 * more than was originally copied...
12042 			 */
12043 			assert(tmp_entry->vme_start >= original_start);
12044 			assert(VME_OFFSET(tmp_entry) >= original_offset);
12045 			/*
12046 			 * ... and that it did not adjust outside of
12047 			 * a single 16K page.
12048 			 */
12049 			assert(vm_map_trunc_page(tmp_entry->vme_start,
12050 			    VM_MAP_PAGE_MASK(src_map)) ==
12051 			    vm_map_trunc_page(original_start,
12052 			    VM_MAP_PAGE_MASK(src_map)));
12053 		}
12054 
12055 		/* adjust alignment of last copy_entry's "vme_end" */
12056 		tmp_entry = vm_map_copy_last_entry(copy);
12057 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12058 			vm_map_offset_t adjustment;
12059 
12060 			original_end = tmp_entry->vme_end;
12061 
12062 			/* map-align the end of the last copy entry... */
12063 			tmp_entry->vme_end =
12064 			    vm_map_round_page(tmp_entry->vme_end,
12065 			    VM_MAP_PAGE_MASK(src_map));
12066 			/* ... adjust for mis-aligned end of copy range */
12067 			adjustment =
12068 			    (vm_map_round_page((copy->offset +
12069 			    copy->size),
12070 			    VM_MAP_PAGE_MASK(src_map)) -
12071 			    vm_map_round_page((copy->offset +
12072 			    copy->size),
12073 			    PAGE_MASK));
12074 			if (adjustment) {
12075 				assert(page_aligned(adjustment));
12076 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12077 				tmp_entry->vme_end -= adjustment;
12078 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12079 			}
12080 
12081 			/*
12082 			 * Assert that the adjustments haven't exposed
12083 			 * more than was originally copied...
12084 			 */
12085 			assert(tmp_entry->vme_end <= original_end);
12086 			/*
12087 			 * ... and that it did not adjust outside of
12088 			 * a single 16K page.
12089 			 */
12090 			assert(vm_map_round_page(tmp_entry->vme_end,
12091 			    VM_MAP_PAGE_MASK(src_map)) ==
12092 			    vm_map_round_page(original_end,
12093 			    VM_MAP_PAGE_MASK(src_map)));
12094 		}
12095 	}
12096 
12097 	/* Fix-up start and end points in copy.  This is necessary */
12098 	/* when the various entries in the copy object were picked */
12099 	/* up from different sub-maps */
12100 
12101 	tmp_entry = vm_map_copy_first_entry(copy);
12102 	copy_size = 0; /* compute actual size */
12103 	while (tmp_entry != vm_map_copy_to_entry(copy)) {
12104 		assert(VM_MAP_PAGE_ALIGNED(
12105 			    copy_addr + (tmp_entry->vme_end -
12106 			    tmp_entry->vme_start),
12107 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12108 		assert(VM_MAP_PAGE_ALIGNED(
12109 			    copy_addr,
12110 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12111 
12112 		/*
12113 		 * The copy_entries will be injected directly into the
12114 		 * destination map and might not be "map aligned" there...
12115 		 */
12116 		tmp_entry->map_aligned = FALSE;
12117 
12118 		tmp_entry->vme_end = copy_addr +
12119 		    (tmp_entry->vme_end - tmp_entry->vme_start);
12120 		tmp_entry->vme_start = copy_addr;
12121 		assert(tmp_entry->vme_start < tmp_entry->vme_end);
12122 		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12123 		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12124 		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12125 	}
12126 
12127 	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12128 	    copy_size < copy->size) {
12129 		/*
12130 		 * The actual size of the VM map copy is smaller than what
12131 		 * was requested by the caller.  This must be because some
12132 		 * PAGE_SIZE-sized pages are missing at the end of the last
12133 		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12134 		 * The caller might not have been aware of those missing
12135 		 * pages and might not want to be aware of it, which is
12136 		 * fine as long as they don't try to access (and crash on)
12137 		 * those missing pages.
12138 		 * Let's adjust the size of the "copy", to avoid failing
12139 		 * in vm_map_copyout() or vm_map_copy_overwrite().
12140 		 */
12141 		assert(vm_map_round_page(copy_size,
12142 		    VM_MAP_PAGE_MASK(src_map)) ==
12143 		    vm_map_round_page(copy->size,
12144 		    VM_MAP_PAGE_MASK(src_map)));
12145 		copy->size = copy_size;
12146 	}
12147 
12148 	*copy_result = copy;
12149 	return KERN_SUCCESS;
12150 
12151 #undef  RETURN
12152 }
12153 
12154 kern_return_t
12155 vm_map_copy_extract(
12156 	vm_map_t                src_map,
12157 	vm_map_address_t        src_addr,
12158 	vm_map_size_t           len,
12159 	boolean_t               do_copy,
12160 	vm_map_copy_t           *copy_result,   /* OUT */
12161 	vm_prot_t               *cur_prot,      /* IN/OUT */
12162 	vm_prot_t               *max_prot,      /* IN/OUT */
12163 	vm_inherit_t            inheritance,
12164 	vm_map_kernel_flags_t   vmk_flags)
12165 {
12166 	vm_map_copy_t   copy;
12167 	kern_return_t   kr;
12168 	vm_prot_t required_cur_prot, required_max_prot;
12169 
12170 	/*
12171 	 *	Check for copies of zero bytes.
12172 	 */
12173 
12174 	if (len == 0) {
12175 		*copy_result = VM_MAP_COPY_NULL;
12176 		return KERN_SUCCESS;
12177 	}
12178 
12179 	/*
12180 	 *	Check that the end address doesn't overflow
12181 	 */
12182 	if (src_addr + len < src_addr) {
12183 		return KERN_INVALID_ADDRESS;
12184 	}
12185 
12186 	if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12187 		DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12188 	}
12189 
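	/*
	 * On entry, "cur_prot" and "max_prot" hold the protections the
	 * caller requires; vm_map_remap_extract() updates them to the
	 * protections actually granted, which are checked against those
	 * requirements below.
	 */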
12190 	required_cur_prot = *cur_prot;
12191 	required_max_prot = *max_prot;
12192 
12193 	/*
12194 	 *	Allocate a header element for the list.
12195 	 *
12196 	 *	Use the start and end in the header to
12197 	 *	remember the endpoints prior to rounding.
12198 	 */
12199 
12200 	copy = vm_map_copy_allocate();
12201 	copy->type = VM_MAP_COPY_ENTRY_LIST;
12202 	copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12203 
12204 	vm_map_store_init(&copy->cpy_hdr);
12205 
12206 	copy->offset = 0;
12207 	copy->size = len;
12208 
12209 	kr = vm_map_remap_extract(src_map,
12210 	    src_addr,
12211 	    len,
12212 	    do_copy,             /* copy */
12213 	    &copy->cpy_hdr,
12214 	    cur_prot,            /* IN/OUT */
12215 	    max_prot,            /* IN/OUT */
12216 	    inheritance,
12217 	    vmk_flags);
12218 	if (kr != KERN_SUCCESS) {
12219 		vm_map_copy_discard(copy);
12220 		return kr;
12221 	}
12222 	if (required_cur_prot != VM_PROT_NONE) {
12223 		assert((*cur_prot & required_cur_prot) == required_cur_prot);
12224 		assert((*max_prot & required_max_prot) == required_max_prot);
12225 	}
12226 
12227 	*copy_result = copy;
12228 	return KERN_SUCCESS;
12229 }
12230 
12231 /*
12232  *	vm_map_copyin_object:
12233  *
12234  *	Create a copy object from an object.
12235  *	Our caller donates an object reference.
12236  */
12237 
12238 kern_return_t
12239 vm_map_copyin_object(
12240 	vm_object_t             object,
12241 	vm_object_offset_t      offset, /* offset of region in object */
12242 	vm_object_size_t        size,   /* size of region in object */
12243 	vm_map_copy_t   *copy_result)   /* OUT */
12244 {
12245 	vm_map_copy_t   copy;           /* Resulting copy */
12246 
12247 	/*
12248 	 *	We drop the object into a special copy object
12249 	 *	that contains the object directly.
12250 	 */
12251 
12252 	copy = vm_map_copy_allocate();
12253 	copy->type = VM_MAP_COPY_OBJECT;
12254 	copy->cpy_object = object;
12255 	copy->offset = offset;
12256 	copy->size = size;
12257 
12258 	*copy_result = copy;
12259 	return KERN_SUCCESS;
12260 }
12261 
12262 static void
12263 vm_map_fork_share(
12264 	vm_map_t        old_map,
12265 	vm_map_entry_t  old_entry,
12266 	vm_map_t        new_map)
12267 {
12268 	vm_object_t     object;
12269 	vm_map_entry_t  new_entry;
12270 
12271 	/*
12272 	 *	New sharing code.  New map entry
12273 	 *	references original object.  Internal
12274 	 *	objects use asynchronous copy algorithm for
12275 	 *	future copies.  First make sure we have
12276 	 *	the right object.  If we need a shadow,
12277 	 *	or someone else already has one, then
12278 	 *	make a new shadow and share it.
12279 	 */
12280 
12281 	if (!old_entry->is_sub_map) {
12282 		object = VME_OBJECT(old_entry);
12283 	}
12284 
12285 	if (old_entry->is_sub_map) {
12286 		assert(old_entry->wired_count == 0);
12287 #ifndef NO_NESTED_PMAP
12288 		if (old_entry->use_pmap) {
12289 			kern_return_t   result;
12290 
12291 			result = pmap_nest(new_map->pmap,
12292 			    (VME_SUBMAP(old_entry))->pmap,
12293 			    (addr64_t)old_entry->vme_start,
12294 			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
12295 			if (result) {
12296 				panic("vm_map_fork_share: pmap_nest failed!");
12297 			}
12298 		}
12299 #endif  /* NO_NESTED_PMAP */
12300 	} else if (object == VM_OBJECT_NULL) {
12301 		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
12302 		    old_entry->vme_start));
12303 		VME_OFFSET_SET(old_entry, 0);
12304 		VME_OBJECT_SET(old_entry, object, false, 0);
12305 		old_entry->use_pmap = TRUE;
12306 //		assert(!old_entry->needs_copy);
12307 	} else if (object->copy_strategy !=
12308 	    MEMORY_OBJECT_COPY_SYMMETRIC) {
12309 		/*
12310 		 *	We are already using an asymmetric
12311 		 *	copy, and therefore we already have
12312 		 *	the right object.
12313 		 */
12314 
12315 		assert(!old_entry->needs_copy);
12316 	} else if (old_entry->needs_copy ||       /* case 1 */
12317 	    object->shadowed ||                 /* case 2 */
12318 	    (!object->true_share &&             /* case 3 */
12319 	    !old_entry->is_shared &&
12320 	    (object->vo_size >
12321 	    (vm_map_size_t)(old_entry->vme_end -
12322 	    old_entry->vme_start)))) {
12323 		/*
12324 		 *	We need to create a shadow.
12325 		 *	There are three cases here.
12326 		 *	In the first case, we need to
12327 		 *	complete a deferred symmetrical
12328 		 *	copy that we participated in.
12329 		 *	In the second and third cases,
12330 		 *	we need to create the shadow so
12331 		 *	that changes that we make to the
12332 		 *	object do not interfere with
12333 		 *	any symmetrical copies which
12334 		 *	have occurred (case 2) or which
12335 		 *	might occur (case 3).
12336 		 *
12337 		 *	The first case is when we had
12338 		 *	deferred shadow object creation
12339 		 *	via the entry->needs_copy mechanism.
12340 		 *	This mechanism only works when
12341 		 *	only one entry points to the source
12342 		 *	object, and we are about to create
12343 		 *	a second entry pointing to the
12344 		 *	same object. The problem is that
12345 		 *	there is no way of mapping from
12346 		 *	an object to the entries pointing
12347 		 *	to it. (Deferred shadow creation
12348 		 *	works with one entry because it occurs
12349 		 *	at fault time, and we walk from the
12350 		 *	entry to the object when handling
12351 		 *	the fault.)
12352 		 *
12353 		 *	The second case is when the object
12354 		 *	to be shared has already been copied
12355 		 *	with a symmetric copy, but we point
12356 		 *	directly to the object without
12357 		 *	needs_copy set in our entry. (This
12358 		 *	can happen because different ranges
12359 		 *	of an object can be pointed to by
12360 		 *	different entries. In particular,
12361 		 *	a single entry pointing to an object
12362 		 *	can be split by a call to vm_inherit,
12363 		 *	which, combined with task_create, can
12364 		 *	result in the different entries
12365 		 *	having different needs_copy values.)
12366 		 *	The shadowed flag in the object allows
12367 		 *	us to detect this case. The problem
12368 		 *	with this case is that if this object
12369 		 *	has or will have shadows, then we
12370 		 *	must not perform an asymmetric copy
12371 		 *	of this object, since such a copy
12372 		 *	allows the object to be changed, which
12373 		 *	will break the previous symmetrical
12374 		 *	copies (which rely upon the object
12375 		 *	not changing). In a sense, the shadowed
12376 		 *	flag says "don't change this object".
12377 		 *	We fix this by creating a shadow
12378 		 *	object for this object, and sharing
12379 		 *	that. This works because we are free
12380 		 *	to change the shadow object (and thus
12381 		 *	to use an asymmetric copy strategy);
12382 		 *	this is also semantically correct,
12383 		 *	since this object is temporary, and
12384 		 *	therefore a copy of the object is
12385 		 *	as good as the object itself. (This
12386 		 *	is not true for permanent objects,
12387 		 *	since the pager needs to see changes,
12388 		 *	which won't happen if the changes
12389 		 *	are made to a copy.)
12390 		 *
12391 		 *	The third case is when the object
12392 		 *	to be shared has parts sticking
12393 		 *	outside of the entry we're working
12394 		 *	with, and thus may in the future
12395 		 *	be subject to a symmetrical copy.
12396 		 *	(This is a preemptive version of
12397 		 *	case 2.)
12398 		 */
12399 		VME_OBJECT_SHADOW(old_entry,
12400 		    (vm_map_size_t) (old_entry->vme_end -
12401 		    old_entry->vme_start));
12402 
12403 		/*
12404 		 *	If we're making a shadow for reasons other
12405 		 *	than copy-on-write, then we have
12406 		 *	to remove write permission.
12407 		 */
12408 
12409 		if (!old_entry->needs_copy &&
12410 		    (old_entry->protection & VM_PROT_WRITE)) {
12411 			vm_prot_t prot;
12412 
12413 			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));
12414 
12415 			prot = old_entry->protection & ~VM_PROT_WRITE;
12416 
12417 			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));
12418 
12419 			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
12420 				prot |= VM_PROT_EXECUTE;
12421 			}
12422 
12423 
12424 			if (old_map->mapped_in_other_pmaps) {
12425 				vm_object_pmap_protect(
12426 					VME_OBJECT(old_entry),
12427 					VME_OFFSET(old_entry),
12428 					(old_entry->vme_end -
12429 					old_entry->vme_start),
12430 					PMAP_NULL,
12431 					PAGE_SIZE,
12432 					old_entry->vme_start,
12433 					prot);
12434 			} else {
12435 				pmap_protect(old_map->pmap,
12436 				    old_entry->vme_start,
12437 				    old_entry->vme_end,
12438 				    prot);
12439 			}
12440 		}
12441 
12442 		old_entry->needs_copy = FALSE;
12443 		object = VME_OBJECT(old_entry);
12444 	}
12445 
12446 
12447 	/*
12448 	 *	If object was using a symmetric copy strategy,
12449 	 *	change its copy strategy to the default
12450 	 *	asymmetric copy strategy, which is copy_delay
12451 	 *	in the non-norma case and copy_call in the
12452 	 *	norma case. Bump the reference count for the
12453 	 *	new entry.
12454 	 */
12455 
12456 	if (old_entry->is_sub_map) {
12457 		vm_map_reference(VME_SUBMAP(old_entry));
12458 	} else {
12459 		vm_object_lock(object);
12460 		vm_object_reference_locked(object);
12461 		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
12462 			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
12463 		}
12464 		vm_object_unlock(object);
12465 	}
12466 
12467 	/*
12468 	 *	Clone the entry, using object ref from above.
12469 	 *	Mark both entries as shared.
12470 	 */
12471 
12472 	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
12473 	vm_map_entry_copy(old_map, new_entry, old_entry);
12474 	old_entry->is_shared = TRUE;
12475 	new_entry->is_shared = TRUE;
12476 
12477 	/*
12478 	 * We're dealing with a shared mapping, so the resulting mapping
12479 	 * should inherit some of the original mapping's accounting settings.
12480 	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
12481 	 * "use_pmap" should stay the same as before (if it hasn't been reset
12482 	 * to TRUE when we cleared "iokit_acct").
12483 	 */
12484 	assert(!new_entry->iokit_acct);
12485 
12486 	/*
12487 	 *	If the old entry's inheritance is VM_INHERIT_NONE,
12488 	 *	the new entry is for a corpse fork; remove the
12489 	 *	write permission from the new entry.
12490 	 */
12491 	if (old_entry->inheritance == VM_INHERIT_NONE) {
12492 		new_entry->protection &= ~VM_PROT_WRITE;
12493 		new_entry->max_protection &= ~VM_PROT_WRITE;
12494 	}
12495 
12496 	/*
12497 	 *	Insert the entry into the new map -- we
12498 	 *	know we're inserting at the end of the new
12499 	 *	map.
12500 	 */
12501 
12502 	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
12503 	    VM_MAP_KERNEL_FLAGS_NONE);
12504 
12505 	/*
12506 	 *	Update the physical map
12507 	 */
12508 
12509 	if (old_entry->is_sub_map) {
12510 		/* Bill Angell pmap support goes here */
12511 	} else {
12512 		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
12513 		    old_entry->vme_end - old_entry->vme_start,
12514 		    old_entry->vme_start);
12515 	}
12516 }
12517 
12518 static boolean_t
12519 vm_map_fork_copy(
12520 	vm_map_t        old_map,
12521 	vm_map_entry_t  *old_entry_p,
12522 	vm_map_t        new_map,
12523 	int             vm_map_copyin_flags)
12524 {
12525 	vm_map_entry_t old_entry = *old_entry_p;
12526 	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
12527 	vm_map_offset_t start = old_entry->vme_start;
12528 	vm_map_copy_t copy;
12529 	vm_map_entry_t last = vm_map_last_entry(new_map);
12530 
12531 	vm_map_unlock(old_map);
12532 	/*
12533 	 *	Use maxprot version of copyin because we
12534 	 *	care about whether this memory can ever
12535 	 *	be accessed, not just whether it's accessible
12536 	 *	right now.
12537 	 */
12538 	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
12539 	if (vm_map_copyin_internal(old_map, start, entry_size,
12540 	    vm_map_copyin_flags, &copy)
12541 	    != KERN_SUCCESS) {
12542 		/*
12543 		 *	The map might have changed while it
12544 		 *	was unlocked; check it again.  Skip
12545 		 *	any blank space or permanently
12546 		 *	unreadable region.
12547 		 */
12548 		vm_map_lock(old_map);
12549 		if (!vm_map_lookup_entry(old_map, start, &last) ||
12550 		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
12551 			last = last->vme_next;
12552 		}
12553 		*old_entry_p = last;
12554 
12555 		/*
12556 		 * XXX	For some error returns, want to
12557 		 * XXX	skip to the next element.  Note
12558 		 *	that INVALID_ADDRESS and
12559 		 *	PROTECTION_FAILURE are handled above.
12560 		 */
12561 
12562 		return FALSE;
12563 	}
12564 
12565 	/*
12566 	 * Assert that the vm_map_copy is coming from the right
12567 	 * zone and hasn't been forged
12568 	 */
12569 	vm_map_copy_require(copy);
12570 
12571 	/*
12572 	 *	Insert the copy into the new map
12573 	 */
12574 	vm_map_copy_insert(new_map, last, copy);
12575 
12576 	/*
12577 	 *	Pick up the traversal at the end of
12578 	 *	the copied region.
12579 	 */
12580 
12581 	vm_map_lock(old_map);
12582 	start += entry_size;
12583 	if (!vm_map_lookup_entry(old_map, start, &last)) {
12584 		last = last->vme_next;
12585 	} else {
12586 		if (last->vme_start == start) {
12587 			/*
12588 			 * No need to clip here and we don't
12589 			 * want to cause any unnecessary
12590 			 * unnesting...
12591 			 */
12592 		} else {
12593 			vm_map_clip_start(old_map, last, start);
12594 		}
12595 	}
12596 	*old_entry_p = last;
12597 
12598 	return TRUE;
12599 }
12600 
12601 /*
12602  *	vm_map_fork:
12603  *
12604  *	Create and return a new map based on the old
12605  *	map, according to the inheritance values on the
12606  *	regions in that map and the options.
12607  *
12608  *	The source map must not be locked.
12609  */
12610 vm_map_t
12611 vm_map_fork(
12612 	ledger_t        ledger,
12613 	vm_map_t        old_map,
12614 	int             options)
12615 {
12616 	pmap_t          new_pmap;
12617 	vm_map_t        new_map;
12618 	vm_map_entry_t  old_entry;
12619 	vm_map_size_t   new_size = 0, entry_size;
12620 	vm_map_entry_t  new_entry;
12621 	boolean_t       src_needs_copy;
12622 	boolean_t       new_entry_needs_copy;
12623 	boolean_t       pmap_is64bit;
12624 	int             vm_map_copyin_flags;
12625 	vm_inherit_t    old_entry_inheritance;
12626 	int             map_create_options;
12627 	kern_return_t   footprint_collect_kr;
12628 
12629 	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
12630 	    VM_MAP_FORK_PRESERVE_PURGEABLE |
12631 	    VM_MAP_FORK_CORPSE_FOOTPRINT)) {
12632 		/* unsupported option */
12633 		return VM_MAP_NULL;
12634 	}
12635 
12636 	pmap_is64bit =
12637 #if defined(__i386__) || defined(__x86_64__)
12638 	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
12639 #elif defined(__arm64__)
12640 	    old_map->pmap->is_64bit;
12641 #elif defined(__arm__)
12642 	    FALSE;
12643 #else
12644 #error Unknown architecture.
12645 #endif
12646 
12647 	unsigned int pmap_flags = 0;
12648 	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
12649 #if defined(HAS_APPLE_PAC)
12650 	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
12651 #endif
12652 #if PMAP_CREATE_FORCE_4K_PAGES
12653 	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
12654 	    PAGE_SIZE != FOURK_PAGE_SIZE) {
12655 		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
12656 	}
12657 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
12658 	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
12659 	if (new_pmap == NULL) {
12660 		return VM_MAP_NULL;
12661 	}
12662 
12663 	vm_map_reference(old_map);
12664 	vm_map_lock(old_map);
12665 
12666 	map_create_options = 0;
12667 	if (old_map->hdr.entries_pageable) {
12668 		map_create_options |= VM_MAP_CREATE_PAGEABLE;
12669 	}
12670 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
12671 		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
12672 		footprint_collect_kr = KERN_SUCCESS;
12673 	}
12674 	new_map = vm_map_create_options(new_pmap,
12675 	    old_map->min_offset,
12676 	    old_map->max_offset,
12677 	    map_create_options);
12678 	/* inherit cs_enforcement */
12679 	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
12680 	vm_map_lock(new_map);
12681 	vm_commit_pagezero_status(new_map);
12682 	/* inherit the parent map's page size */
12683 	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
12684 
12685 	/* ensure PMAP_CS structures are prepared for the fork */
12686 	pmap_cs_fork_prepare(old_map->pmap, new_pmap);
12687 
12688 	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
12689 		/*
12690 		 * Abort any corpse collection if the system is shutting down.
12691 		 */
12692 		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
12693 		    get_system_inshutdown()) {
12694 			vm_map_corpse_footprint_collect_done(new_map);
12695 			vm_map_unlock(new_map);
12696 			vm_map_unlock(old_map);
12697 			vm_map_deallocate(new_map);
12698 			vm_map_deallocate(old_map);
12699 			printf("Aborting corpse map due to system shutdown\n");
12700 			return VM_MAP_NULL;
12701 		}
12702 
12703 		entry_size = old_entry->vme_end - old_entry->vme_start;
12704 
12705 		old_entry_inheritance = old_entry->inheritance;
12706 		/*
12707 		 * If the caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option,
12708 		 * share VM_INHERIT_NONE entries that are not backed by a
12709 		 * device pager.
12710 		 */
12711 		if (old_entry_inheritance == VM_INHERIT_NONE &&
12712 		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
12713 		    (old_entry->protection & VM_PROT_READ) &&
12714 		    !(!old_entry->is_sub_map &&
12715 		    VME_OBJECT(old_entry) != NULL &&
12716 		    VME_OBJECT(old_entry)->pager != NULL &&
12717 		    is_device_pager_ops(
12718 			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
12719 			old_entry_inheritance = VM_INHERIT_SHARE;
12720 		}
12721 
12722 		if (old_entry_inheritance != VM_INHERIT_NONE &&
12723 		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
12724 		    footprint_collect_kr == KERN_SUCCESS) {
12725 			/*
12726 			 * The corpse won't have old_map->pmap to query
12727 			 * footprint information, so collect that data now
12728 			 * and store it in new_map->vmmap_corpse_footprint
12729 			 * for later autopsy.
12730 			 */
12731 			footprint_collect_kr =
12732 			    vm_map_corpse_footprint_collect(old_map,
12733 			    old_entry,
12734 			    new_map);
12735 		}
12736 
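		/*
		 * Handle the entry according to its (possibly overridden)
		 * inheritance: VM_INHERIT_NONE entries are skipped,
		 * VM_INHERIT_SHARE entries are shared via vm_map_fork_share(),
		 * and VM_INHERIT_COPY entries are copied, using the quick
		 * symmetric copy when possible and falling back to
		 * vm_map_fork_copy() otherwise.
		 */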
12737 		switch (old_entry_inheritance) {
12738 		case VM_INHERIT_NONE:
12739 			break;
12740 
12741 		case VM_INHERIT_SHARE:
12742 			vm_map_fork_share(old_map, old_entry, new_map);
12743 			new_size += entry_size;
12744 			break;
12745 
12746 		case VM_INHERIT_COPY:
12747 
12748 			/*
12749 			 *	Inline the copy_quickly case;
12750 			 *	upon failure, fall back on call
12751 			 *	to vm_map_fork_copy.
12752 			 */
12753 
12754 			if (old_entry->is_sub_map) {
12755 				break;
12756 			}
12757 			if ((old_entry->wired_count != 0) ||
12758 			    ((VME_OBJECT(old_entry) != NULL) &&
12759 			    (VME_OBJECT(old_entry)->true_share))) {
12760 				goto slow_vm_map_fork_copy;
12761 			}
12762 
12763 			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
12764 			vm_map_entry_copy(old_map, new_entry, old_entry);
12765 			if (old_entry->permanent) {
12766 				/* inherit "permanent" on fork() */
12767 				new_entry->permanent = TRUE;
12768 			}
12769 
12770 			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
12771 				new_map->jit_entry_exists = TRUE;
12772 			}
12773 
12774 			if (new_entry->is_sub_map) {
12775 				/* clear address space specifics */
12776 				new_entry->use_pmap = FALSE;
12777 			} else {
12778 				/*
12779 				 * We're dealing with a copy-on-write operation,
12780 				 * so the resulting mapping should not inherit
12781 				 * the original mapping's accounting settings.
12782 				 * "iokit_acct" should have been cleared in
12783 				 * vm_map_entry_copy().
12784 				 * "use_pmap" should be reset to its default
12785 				 * (TRUE) so that the new mapping gets
12786 				 * accounted for in the task's memory footprint.
12787 				 */
12788 				assert(!new_entry->iokit_acct);
12789 				new_entry->use_pmap = TRUE;
12790 			}
12791 
12792 			if (!vm_object_copy_quickly(
12793 				    VME_OBJECT(new_entry),
12794 				    VME_OFFSET(old_entry),
12795 				    (old_entry->vme_end -
12796 				    old_entry->vme_start),
12797 				    &src_needs_copy,
12798 				    &new_entry_needs_copy)) {
12799 				vm_map_entry_dispose(new_entry);
12800 				goto slow_vm_map_fork_copy;
12801 			}
12802 
12803 			/*
12804 			 *	Handle copy-on-write obligations
12805 			 */
12806 
12807 			if (src_needs_copy && !old_entry->needs_copy) {
12808 				vm_prot_t prot;
12809 
12810 				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));
12811 
12812 				prot = old_entry->protection & ~VM_PROT_WRITE;
12813 
12814 				if (override_nx(old_map, VME_ALIAS(old_entry))
12815 				    && prot) {
12816 					prot |= VM_PROT_EXECUTE;
12817 				}
12818 
12819 				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));
12820 
12821 				vm_object_pmap_protect(
12822 					VME_OBJECT(old_entry),
12823 					VME_OFFSET(old_entry),
12824 					(old_entry->vme_end -
12825 					old_entry->vme_start),
12826 					((old_entry->is_shared
12827 					|| old_map->mapped_in_other_pmaps)
12828 					? PMAP_NULL :
12829 					old_map->pmap),
12830 					VM_MAP_PAGE_SIZE(old_map),
12831 					old_entry->vme_start,
12832 					prot);
12833 
12834 				assert(old_entry->wired_count == 0);
12835 				old_entry->needs_copy = TRUE;
12836 			}
12837 			new_entry->needs_copy = new_entry_needs_copy;
12838 
12839 			/*
12840 			 *	Insert the entry at the end
12841 			 *	of the map.
12842 			 */
12843 
12844 			vm_map_store_entry_link(new_map,
12845 			    vm_map_last_entry(new_map),
12846 			    new_entry,
12847 			    VM_MAP_KERNEL_FLAGS_NONE);
12848 			new_size += entry_size;
12849 			break;
12850 
12851 slow_vm_map_fork_copy:
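			/*
			 * The quick symmetric copy wasn't possible (wired
			 * entry, "true_share" object, or
			 * vm_object_copy_quickly() failure): fall back to a
			 * full vm_map_copyin() of this entry's range via
			 * vm_map_fork_copy().
			 */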
12852 			vm_map_copyin_flags = 0;
12853 			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
12854 				vm_map_copyin_flags |=
12855 				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
12856 			}
12857 			if (vm_map_fork_copy(old_map,
12858 			    &old_entry,
12859 			    new_map,
12860 			    vm_map_copyin_flags)) {
12861 				new_size += entry_size;
12862 			}
12863 			continue;
12864 		}
12865 		old_entry = old_entry->vme_next;
12866 	}
12867 
12868 #if defined(__arm64__)
12869 	pmap_insert_sharedpage(new_map->pmap);
12870 #endif /* __arm64__ */
12871 
12872 	new_map->size = new_size;
12873 
12874 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
12875 		vm_map_corpse_footprint_collect_done(new_map);
12876 	}
12877 
12878 	/* Propagate JIT entitlement for the pmap layer. */
12879 	if (pmap_get_jit_entitled(old_map->pmap)) {
12880 		/* Tell the pmap that it supports JIT. */
12881 		pmap_set_jit_entitled(new_map->pmap);
12882 	}
12883 
12884 	vm_map_unlock(new_map);
12885 	vm_map_unlock(old_map);
12886 	vm_map_deallocate(old_map);
12887 
12888 	return new_map;
12889 }
12890 
12891 /*
12892  * vm_map_exec:
12893  *
12894  *      Setup the "new_map" with the proper execution environment according
12895  *	to the type of executable (platform, 64bit, chroot environment).
12896  *	Map the comm page and shared region, etc...
12897  */
12898 kern_return_t
12899 vm_map_exec(
12900 	vm_map_t        new_map,
12901 	task_t          task,
12902 	boolean_t       is64bit,
12903 	void            *fsroot,
12904 	cpu_type_t      cpu,
12905 	cpu_subtype_t   cpu_subtype,
12906 	boolean_t       reslide,
12907 	boolean_t       is_driverkit)
12908 {
12909 	SHARED_REGION_TRACE_DEBUG(
12910 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
12911 		(void *)VM_KERNEL_ADDRPERM(current_task()),
12912 		(void *)VM_KERNEL_ADDRPERM(new_map),
12913 		(void *)VM_KERNEL_ADDRPERM(task),
12914 		(void *)VM_KERNEL_ADDRPERM(fsroot),
12915 		cpu,
12916 		cpu_subtype));
12917 	(void) vm_commpage_enter(new_map, task, is64bit);
12918 
12919 	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit);
12920 
12921 	SHARED_REGION_TRACE_DEBUG(
12922 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
12923 		(void *)VM_KERNEL_ADDRPERM(current_task()),
12924 		(void *)VM_KERNEL_ADDRPERM(new_map),
12925 		(void *)VM_KERNEL_ADDRPERM(task),
12926 		(void *)VM_KERNEL_ADDRPERM(fsroot),
12927 		cpu,
12928 		cpu_subtype));
12929 
12930 	/*
12931 	 * Some devices have region(s) of memory that shouldn't get allocated by
12932 	 * user processes. The following code creates dummy vm_map_entry_t's for each
12933 	 * of the regions that needs to be reserved to prevent any allocations in
12934 	 * of the regions that need to be reserved to prevent any allocations in
12935 	 */
12936 	kern_return_t kr = KERN_FAILURE;
12937 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
12938 	vmk_flags.vmkf_permanent = TRUE;
12939 	vmk_flags.vmkf_beyond_max = TRUE;
12940 
12941 	struct vm_reserved_region *regions = NULL;
12942 	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
12943 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
12944 
12945 	for (size_t i = 0; i < num_regions; ++i) {
12946 		kr = vm_map_enter(
12947 			new_map,
12948 			&regions[i].vmrr_addr,
12949 			regions[i].vmrr_size,
12950 			(vm_map_offset_t)0,
12951 			VM_FLAGS_FIXED,
12952 			vmk_flags,
12953 			VM_KERN_MEMORY_NONE,
12954 			VM_OBJECT_NULL,
12955 			(vm_object_offset_t)0,
12956 			FALSE,
12957 			VM_PROT_NONE,
12958 			VM_PROT_NONE,
12959 			VM_INHERIT_COPY);
12960 
12961 		if (kr != KERN_SUCCESS) {
12962 			panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
12963 		}
12964 	}
12965 
12966 	new_map->reserved_regions = (num_regions ? TRUE : FALSE);
12967 
12968 	return KERN_SUCCESS;
12969 }
12970 
12971 uint64_t vm_map_lookup_locked_copy_slowly_count = 0;
12972 uint64_t vm_map_lookup_locked_copy_slowly_size = 0;
12973 uint64_t vm_map_lookup_locked_copy_slowly_max = 0;
12974 uint64_t vm_map_lookup_locked_copy_slowly_restart = 0;
12975 uint64_t vm_map_lookup_locked_copy_slowly_error = 0;
12976 uint64_t vm_map_lookup_locked_copy_strategically_count = 0;
12977 uint64_t vm_map_lookup_locked_copy_strategically_size = 0;
12978 uint64_t vm_map_lookup_locked_copy_strategically_max = 0;
12979 uint64_t vm_map_lookup_locked_copy_strategically_restart = 0;
12980 uint64_t vm_map_lookup_locked_copy_strategically_error = 0;
12981 uint64_t vm_map_lookup_locked_copy_shadow_count = 0;
12982 uint64_t vm_map_lookup_locked_copy_shadow_size = 0;
12983 uint64_t vm_map_lookup_locked_copy_shadow_max = 0;
12984 /*
12985  *	vm_map_lookup_locked:
12986  *
12987  *	Finds the VM object, offset, and
12988  *	protection for a given virtual address in the
12989  *	specified map, assuming a page fault of the
12990  *	type specified.
12991  *
12992  *	Returns the (object, offset, protection) for
12993  *	this address, whether it is wired down, and whether
12994  *	this map has the only reference to the data in question.
12995  *	In order to later verify this lookup, a "version"
12996  *	is returned.
12997  *	If contended != NULL, *contended will be set to
12998  *	true iff the thread had to spin or block to acquire
12999  *	an exclusive lock.
13000  *
13001  *	The map MUST be locked by the caller and WILL be
13002  *	locked on exit.  In order to guarantee the
13003  *	existence of the returned object, it is returned
13004  *	locked.
13005  *
13006  *	If a lookup is requested with "write protection"
13007  *	specified, the map may be changed to perform virtual
13008  *	copying operations, although the data referenced will
13009  *	remain the same.
13010  */
13011 kern_return_t
13012 vm_map_lookup_locked(
13013 	vm_map_t                *var_map,       /* IN/OUT */
13014 	vm_map_offset_t         vaddr,
13015 	vm_prot_t               fault_type,
13016 	int                     object_lock_type,
13017 	vm_map_version_t        *out_version,   /* OUT */
13018 	vm_object_t             *object,        /* OUT */
13019 	vm_object_offset_t      *offset,        /* OUT */
13020 	vm_prot_t               *out_prot,      /* OUT */
13021 	boolean_t               *wired,         /* OUT */
13022 	vm_object_fault_info_t  fault_info,     /* OUT */
13023 	vm_map_t                *real_map,      /* OUT */
13024 	bool                    *contended)     /* OUT */
13025 {
13026 	vm_map_entry_t                  entry;
13027 	vm_map_t                        map = *var_map;
13028 	vm_map_t                        old_map = *var_map;
13029 	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
13030 	vm_map_offset_t                 cow_parent_vaddr = 0;
13031 	vm_map_offset_t                 old_start = 0;
13032 	vm_map_offset_t                 old_end = 0;
13033 	vm_prot_t                       prot;
13034 	boolean_t                       mask_protections;
13035 	boolean_t                       force_copy;
13036 	boolean_t                       no_force_copy_if_executable;
13037 	boolean_t                       submap_needed_copy;
13038 	vm_prot_t                       original_fault_type;
13039 	vm_map_size_t                   fault_page_mask;
13040 
13041 	/*
13042 	 * VM_PROT_IS_MASK means that the caller wants us to use "fault_type"
13043 	 * as a mask against the mapping's actual protections, not as an
13044 	 * absolute value.
13045 	 */
13046 	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13047 	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13048 	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13049 	fault_type &= VM_PROT_ALL;
13050 	original_fault_type = fault_type;
13051 	if (contended) {
13052 		*contended = false;
13053 	}
13054 
13055 	*real_map = map;
13056 
13057 	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13058 	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13059 
13060 RetryLookup:
13061 	fault_type = original_fault_type;
13062 
13063 	/*
13064 	 *	If the map has an interesting hint, try it before calling
13065 	 *	full blown lookup routine.
13066 	 */
13067 	entry = map->hint;
13068 
13069 	if ((entry == vm_map_to_entry(map)) ||
13070 	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13071 		vm_map_entry_t  tmp_entry;
13072 
13073 		/*
13074 		 *	Entry was either not a valid hint, or the vaddr
13075 		 *	was not contained in the entry, so do a full lookup.
13076 		 */
13077 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13078 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13079 				vm_map_unlock(cow_sub_map_parent);
13080 			}
13081 			if ((*real_map != map)
13082 			    && (*real_map != cow_sub_map_parent)) {
13083 				vm_map_unlock(*real_map);
13084 			}
13085 			return KERN_INVALID_ADDRESS;
13086 		}
13087 
13088 		entry = tmp_entry;
13089 	}
13090 	if (map == old_map) {
13091 		old_start = entry->vme_start;
13092 		old_end = entry->vme_end;
13093 	}
13094 
13095 	/*
13096 	 *	Handle submaps.  Drop lock on upper map, submap is
13097 	 *	returned locked.
13098 	 */
13099 
13100 	submap_needed_copy = FALSE;
13101 submap_recurse:
13102 	if (entry->is_sub_map) {
13103 		vm_map_offset_t         local_vaddr;
13104 		vm_map_offset_t         end_delta;
13105 		vm_map_offset_t         start_delta;
13106 		vm_map_entry_t          submap_entry, saved_submap_entry;
13107 		vm_object_offset_t      submap_entry_offset;
13108 		vm_object_size_t        submap_entry_size;
13109 		vm_prot_t               subentry_protection;
13110 		vm_prot_t               subentry_max_protection;
13111 		boolean_t               subentry_no_copy_on_read;
13112 		boolean_t               mapped_needs_copy = FALSE;
13113 		vm_map_version_t        version;
13114 
13115 		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13116 		    "map %p (%d) entry %p submap %p (%d)\n",
13117 		    map, VM_MAP_PAGE_SHIFT(map), entry,
13118 		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
13119 
13120 		local_vaddr = vaddr;
13121 
13122 		if ((entry->use_pmap &&
13123 		    !((fault_type & VM_PROT_WRITE) ||
13124 		    force_copy))) {
13125 			/* if real_map equals map we unlock below */
13126 			if ((*real_map != map) &&
13127 			    (*real_map != cow_sub_map_parent)) {
13128 				vm_map_unlock(*real_map);
13129 			}
13130 			*real_map = VME_SUBMAP(entry);
13131 		}
13132 
13133 		if (entry->needs_copy &&
13134 		    ((fault_type & VM_PROT_WRITE) ||
13135 		    force_copy)) {
13136 			if (!mapped_needs_copy) {
13137 				if (vm_map_lock_read_to_write(map)) {
13138 					vm_map_lock_read(map);
13139 					*real_map = map;
13140 					goto RetryLookup;
13141 				}
13142 				vm_map_lock_read(VME_SUBMAP(entry));
13143 				*var_map = VME_SUBMAP(entry);
13144 				cow_sub_map_parent = map;
13145 				/* reset base to map before cow object */
13146 				/* this is the map which will accept   */
13147 				/* the new cow object */
13148 				old_start = entry->vme_start;
13149 				old_end = entry->vme_end;
13150 				cow_parent_vaddr = vaddr;
13151 				mapped_needs_copy = TRUE;
13152 			} else {
13153 				vm_map_lock_read(VME_SUBMAP(entry));
13154 				*var_map = VME_SUBMAP(entry);
13155 				if ((cow_sub_map_parent != map) &&
13156 				    (*real_map != map)) {
13157 					vm_map_unlock(map);
13158 				}
13159 			}
13160 		} else {
13161 			if (entry->needs_copy) {
13162 				submap_needed_copy = TRUE;
13163 			}
13164 			vm_map_lock_read(VME_SUBMAP(entry));
13165 			*var_map = VME_SUBMAP(entry);
13166 			/* leave map locked if it is a target */
13167 			/* cow sub_map above otherwise, just  */
13168 			/* follow the maps down to the object */
13169 			/* here we unlock knowing we are not  */
13170 			/* revisiting the map.  */
13171 			if ((*real_map != map) && (map != cow_sub_map_parent)) {
13172 				vm_map_unlock_read(map);
13173 			}
13174 		}
13175 
13176 		map = *var_map;
13177 
13178 		/* calculate the offset in the submap for vaddr */
13179 		local_vaddr = (local_vaddr - entry->vme_start) + VME_OFFSET(entry);
13180 		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
13181 		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
13182 		    (uint64_t)local_vaddr, (uint64_t)entry->vme_start, (uint64_t)fault_page_mask);
13183 
13184 RetrySubMap:
13185 		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
13186 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13187 				vm_map_unlock(cow_sub_map_parent);
13188 			}
13189 			if ((*real_map != map)
13190 			    && (*real_map != cow_sub_map_parent)) {
13191 				vm_map_unlock(*real_map);
13192 			}
13193 			*real_map = map;
13194 			return KERN_INVALID_ADDRESS;
13195 		}
13196 
13197 		/* find the attenuated shadow of the underlying object */
13198 		/* on our target map */
13199 
13200 		/* In plain English: the submap object may extend beyond   */
13201 		/* the region mapped by the entry, or may fill only a      */
13202 		/* portion of it.  For our purposes, we only care whether  */
13203 		/* the object doesn't fill the entry.  In that case, the   */
13204 		/* area which will ultimately be clipped in the top map    */
13205 		/* only needs to be as big as the portion of the           */
13206 		/* underlying entry which is actually mapped */
13207 		start_delta = submap_entry->vme_start > VME_OFFSET(entry) ?
13208 		    submap_entry->vme_start - VME_OFFSET(entry) : 0;
13209 
13210 		end_delta =
13211 		    (VME_OFFSET(entry) + start_delta + (old_end - old_start)) <=
13212 		    submap_entry->vme_end ?
13213 		    0 : (VME_OFFSET(entry) +
13214 		    (old_end - old_start))
13215 		    - submap_entry->vme_end;
13216 
13217 		old_start += start_delta;
13218 		old_end -= end_delta;
13219 
13220 		if (submap_entry->is_sub_map) {
13221 			entry = submap_entry;
13222 			vaddr = local_vaddr;
13223 			goto submap_recurse;
13224 		}
13225 
13226 		if (((fault_type & VM_PROT_WRITE) ||
13227 		    force_copy)
13228 		    && cow_sub_map_parent) {
13229 			vm_object_t     sub_object, copy_object;
13230 			vm_object_offset_t copy_offset;
13231 			vm_map_offset_t local_start;
13232 			vm_map_offset_t local_end;
13233 			boolean_t       object_copied = FALSE;
13234 			vm_object_offset_t object_copied_offset = 0;
13235 			boolean_t       object_copied_needs_copy = FALSE;
13236 			kern_return_t   kr = KERN_SUCCESS;
13237 
13238 			if (vm_map_lock_read_to_write(map)) {
13239 				vm_map_lock_read(map);
13240 				old_start -= start_delta;
13241 				old_end += end_delta;
13242 				goto RetrySubMap;
13243 			}
13244 
13245 
13246 			sub_object = VME_OBJECT(submap_entry);
13247 			if (sub_object == VM_OBJECT_NULL) {
13248 				sub_object =
13249 				    vm_object_allocate(
13250 					(vm_map_size_t)
13251 					(submap_entry->vme_end -
13252 					submap_entry->vme_start));
13253 				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
13254 				VME_OFFSET_SET(submap_entry, 0);
13255 				assert(!submap_entry->is_sub_map);
13256 				assert(submap_entry->use_pmap);
13257 			}
13258 			local_start =  local_vaddr -
13259 			    (cow_parent_vaddr - old_start);
13260 			local_end = local_vaddr +
13261 			    (old_end - cow_parent_vaddr);
13262 			vm_map_clip_start(map, submap_entry, local_start);
13263 			vm_map_clip_end(map, submap_entry, local_end);
13264 			if (submap_entry->is_sub_map) {
13265 				/* unnesting was done when clipping */
13266 				assert(!submap_entry->use_pmap);
13267 			}
13268 
13269 			/* This is the COW case, lets connect */
13270 			/* an entry in our space to the underlying */
13271 			/* object in the submap, bypassing the  */
13272 			/* submap. */
13273 			submap_entry_offset = VME_OFFSET(submap_entry);
13274 			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
13275 
13276 			if ((submap_entry->wired_count != 0 ||
13277 			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
13278 			    (submap_entry->protection & VM_PROT_EXECUTE) &&
13279 			    no_force_copy_if_executable) {
13280 //				printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
13281 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13282 					vm_map_unlock(cow_sub_map_parent);
13283 				}
13284 				if ((*real_map != map)
13285 				    && (*real_map != cow_sub_map_parent)) {
13286 					vm_map_unlock(*real_map);
13287 				}
13288 				*real_map = map;
13289 				kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
13290 				vm_map_lock_write_to_read(map);
13291 				kr = KERN_PROTECTION_FAILURE;
13292 				DTRACE_VM4(submap_no_copy_executable,
13293 				    vm_map_t, map,
13294 				    vm_object_offset_t, submap_entry_offset,
13295 				    vm_object_size_t, submap_entry_size,
13296 				    int, kr);
13297 				return kr;
13298 			}
13299 
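			/*
			 * Wired submap entries can't be write-protected for
			 * copy-on-write, so take an immediate copy of the
			 * pages with vm_object_copy_slowly() instead.
			 */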
13300 			if (submap_entry->wired_count != 0) {
13301 				vm_object_reference(sub_object);
13302 
13303 				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
13304 				    "submap_entry %p offset 0x%llx\n",
13305 				    submap_entry, VME_OFFSET(submap_entry));
13306 
13307 				DTRACE_VM6(submap_copy_slowly,
13308 				    vm_map_t, cow_sub_map_parent,
13309 				    vm_map_offset_t, vaddr,
13310 				    vm_map_t, map,
13311 				    vm_object_size_t, submap_entry_size,
13312 				    int, submap_entry->wired_count,
13313 				    int, sub_object->copy_strategy);
13314 
13315 				saved_submap_entry = submap_entry;
13316 				version.main_timestamp = map->timestamp;
13317 				vm_map_unlock(map); /* Increments timestamp by 1 */
13318 				submap_entry = VM_MAP_ENTRY_NULL;
13319 
13320 				vm_object_lock(sub_object);
13321 				kr = vm_object_copy_slowly(sub_object,
13322 				    submap_entry_offset,
13323 				    submap_entry_size,
13324 				    FALSE,
13325 				    &copy_object);
13326 				object_copied = TRUE;
13327 				object_copied_offset = 0;
13328 				/* 4k: account for extra offset in physical page */
13329 				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
13330 				object_copied_needs_copy = FALSE;
13331 				vm_object_deallocate(sub_object);
13332 
13333 				vm_map_lock(map);
13334 
13335 				if (kr != KERN_SUCCESS &&
13336 				    kr != KERN_MEMORY_RESTART_COPY) {
13337 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13338 						vm_map_unlock(cow_sub_map_parent);
13339 					}
13340 					if ((*real_map != map)
13341 					    && (*real_map != cow_sub_map_parent)) {
13342 						vm_map_unlock(*real_map);
13343 					}
13344 					*real_map = map;
13345 					vm_object_deallocate(copy_object);
13346 					copy_object = VM_OBJECT_NULL;
13347 					kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
13348 					vm_map_lock_write_to_read(map);
13349 					DTRACE_VM4(submap_copy_error_slowly,
13350 					    vm_object_t, sub_object,
13351 					    vm_object_offset_t, submap_entry_offset,
13352 					    vm_object_size_t, submap_entry_size,
13353 					    int, kr);
13354 					vm_map_lookup_locked_copy_slowly_error++;
13355 					return kr;
13356 				}
13357 
13358 				if ((kr == KERN_SUCCESS) &&
13359 				    (version.main_timestamp + 1) == map->timestamp) {
13360 					submap_entry = saved_submap_entry;
13361 				} else {
13362 					saved_submap_entry = NULL;
13363 					old_start -= start_delta;
13364 					old_end += end_delta;
13365 					vm_object_deallocate(copy_object);
13366 					copy_object = VM_OBJECT_NULL;
13367 					vm_map_lock_write_to_read(map);
13368 					vm_map_lookup_locked_copy_slowly_restart++;
13369 					goto RetrySubMap;
13370 				}
13371 				vm_map_lookup_locked_copy_slowly_count++;
13372 				vm_map_lookup_locked_copy_slowly_size += submap_entry_size;
13373 				if (submap_entry_size > vm_map_lookup_locked_copy_slowly_max) {
13374 					vm_map_lookup_locked_copy_slowly_max = submap_entry_size;
13375 				}
13376 			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
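				/*
				 * The object already uses an asymmetric copy
				 * strategy: let vm_object_copy_strategically()
				 * produce the copy object rather than setting
				 * up a symmetric copy-on-write shadow.
				 */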
13377 				submap_entry_offset = VME_OFFSET(submap_entry);
13378 				copy_object = VM_OBJECT_NULL;
13379 				object_copied_offset = submap_entry_offset;
13380 				object_copied_needs_copy = FALSE;
13381 				DTRACE_VM6(submap_copy_strategically,
13382 				    vm_map_t, cow_sub_map_parent,
13383 				    vm_map_offset_t, vaddr,
13384 				    vm_map_t, map,
13385 				    vm_object_size_t, submap_entry_size,
13386 				    int, submap_entry->wired_count,
13387 				    int, sub_object->copy_strategy);
13388 				kr = vm_object_copy_strategically(
13389 					sub_object,
13390 					submap_entry_offset,
13391 					submap_entry->vme_end - submap_entry->vme_start,
13392 					&copy_object,
13393 					&object_copied_offset,
13394 					&object_copied_needs_copy);
13395 				if (kr == KERN_MEMORY_RESTART_COPY) {
13396 					old_start -= start_delta;
13397 					old_end += end_delta;
13398 					vm_object_deallocate(copy_object);
13399 					copy_object = VM_OBJECT_NULL;
13400 					vm_map_lock_write_to_read(map);
13401 					vm_map_lookup_locked_copy_strategically_restart++;
13402 					goto RetrySubMap;
13403 				}
13404 				if (kr != KERN_SUCCESS) {
13405 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13406 						vm_map_unlock(cow_sub_map_parent);
13407 					}
13408 					if ((*real_map != map)
13409 					    && (*real_map != cow_sub_map_parent)) {
13410 						vm_map_unlock(*real_map);
13411 					}
13412 					*real_map = map;
13413 					vm_object_deallocate(copy_object);
13414 					copy_object = VM_OBJECT_NULL;
13415 					kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
13416 					vm_map_lock_write_to_read(map);
13417 					DTRACE_VM4(submap_copy_error_strategically,
13418 					    vm_object_t, sub_object,
13419 					    vm_object_offset_t, submap_entry_offset,
13420 					    vm_object_size_t, submap_entry_size,
13421 					    int, kr);
13422 					vm_map_lookup_locked_copy_strategically_error++;
13423 					return kr;
13424 				}
13425 				assert(copy_object != VM_OBJECT_NULL);
13426 				assert(copy_object != sub_object);
13427 				object_copied = TRUE;
13428 				vm_map_lookup_locked_copy_strategically_count++;
13429 				vm_map_lookup_locked_copy_strategically_size += submap_entry_size;
13430 				if (submap_entry_size > vm_map_lookup_locked_copy_strategically_max) {
13431 					vm_map_lookup_locked_copy_strategically_max = submap_entry_size;
13432 				}
13433 			} else {
13434 				/* set up shadow object */
13435 				object_copied = FALSE;
13436 				copy_object = sub_object;
13437 				vm_object_lock(sub_object);
13438 				vm_object_reference_locked(sub_object);
13439 				sub_object->shadowed = TRUE;
13440 				vm_object_unlock(sub_object);
13441 
13442 				assert(submap_entry->wired_count == 0);
13443 				submap_entry->needs_copy = TRUE;
13444 
13445 				prot = submap_entry->protection;
13446 				assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
13447 				prot = prot & ~VM_PROT_WRITE;
13448 				assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
13449 
13450 				if (override_nx(old_map,
13451 				    VME_ALIAS(submap_entry))
13452 				    && prot) {
13453 					prot |= VM_PROT_EXECUTE;
13454 				}
13455 
13456 				vm_object_pmap_protect(
13457 					sub_object,
13458 					VME_OFFSET(submap_entry),
13459 					submap_entry->vme_end -
13460 					submap_entry->vme_start,
13461 					(submap_entry->is_shared
13462 					|| map->mapped_in_other_pmaps) ?
13463 					PMAP_NULL : map->pmap,
13464 					VM_MAP_PAGE_SIZE(map),
13465 					submap_entry->vme_start,
13466 					prot);
13467 				vm_map_lookup_locked_copy_shadow_count++;
13468 				vm_map_lookup_locked_copy_shadow_size += submap_entry_size;
13469 				if (submap_entry_size > vm_map_lookup_locked_copy_shadow_max) {
13470 					vm_map_lookup_locked_copy_shadow_max = submap_entry_size;
13471 				}
13472 			}
13473 
13474 			/*
13475 			 * Adjust the fault offset to the submap entry.
13476 			 */
13477 			copy_offset = (local_vaddr -
13478 			    submap_entry->vme_start +
13479 			    VME_OFFSET(submap_entry));
13480 
13481 			/* This works differently from the */
13482 			/* normal submap case. We go back  */
13483 			/* to the parent of the cow map and*/
13484 			/* clip out the target portion of  */
13485 			/* the sub_map, substituting the   */
13486 			/* new copy object.                */
13487 
13488 			subentry_protection = submap_entry->protection;
13489 			subentry_max_protection = submap_entry->max_protection;
13490 			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
13491 			vm_map_unlock(map);
13492 			submap_entry = NULL; /* not valid after map unlock */
13493 
13494 			local_start = old_start;
13495 			local_end = old_end;
13496 			map = cow_sub_map_parent;
13497 			*var_map = cow_sub_map_parent;
13498 			vaddr = cow_parent_vaddr;
13499 			cow_sub_map_parent = NULL;
13500 
13501 			if (!vm_map_lookup_entry(map,
13502 			    vaddr, &entry)) {
13503 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13504 					vm_map_unlock(cow_sub_map_parent);
13505 				}
13506 				if ((*real_map != map)
13507 				    && (*real_map != cow_sub_map_parent)) {
13508 					vm_map_unlock(*real_map);
13509 				}
13510 				*real_map = map;
13511 				vm_object_deallocate(
13512 					copy_object);
13513 				copy_object = VM_OBJECT_NULL;
13514 				vm_map_lock_write_to_read(map);
13515 				DTRACE_VM4(submap_lookup_post_unlock,
13516 				    uint64_t, (uint64_t)entry->vme_start,
13517 				    uint64_t, (uint64_t)entry->vme_end,
13518 				    vm_map_offset_t, vaddr,
13519 				    int, object_copied);
13520 				return KERN_INVALID_ADDRESS;
13521 			}
13522 
13523 			/* clip out the portion of space */
13524 			/* mapped by the sub map which   */
13525 			/* corresponds to the underlying */
13526 			/* object */
13527 
13528 			/*
13529 			 * Clip (and unnest) the smallest nested chunk
13530 			 * possible around the faulting address...
13531 			 */
13532 			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
13533 			local_end = local_start + pmap_shared_region_size_min(map->pmap);
13534 			/*
13535 			 * ... but don't go beyond the "old_start" to "old_end"
13536 			 * range, to avoid spanning over another VM region
13537 			 * with a possibly different VM object and/or offset.
13538 			 */
13539 			if (local_start < old_start) {
13540 				local_start = old_start;
13541 			}
13542 			if (local_end > old_end) {
13543 				local_end = old_end;
13544 			}
13545 			/*
13546 			 * Adjust copy_offset to the start of the range.
13547 			 */
13548 			copy_offset -= (vaddr - local_start);
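			/*
			 * Illustrative example (hypothetical numbers): with a
			 * 32MB nesting granule, a fault at vaddr 0x1A3C000
			 * first yields local_start = 0x0 and
			 * local_end = 0x2000000, which is then clamped to the
			 * original entry's [old_start, old_end) range, e.g.
			 * [0x1A00000, 0x1B00000).  "copy_offset" is pulled
			 * back by (vaddr - local_start) so that it refers to
			 * the start of the clipped range rather than to the
			 * faulting address itself.
			 */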
13549 
13550 			vm_map_clip_start(map, entry, local_start);
13551 			vm_map_clip_end(map, entry, local_end);
13552 			if (entry->is_sub_map) {
13553 				/* unnesting was done when clipping */
13554 				assert(!entry->use_pmap);
13555 			}
13556 
13557 			/* substitute copy object for */
13558 			/* shared map entry           */
13559 			vm_map_deallocate(VME_SUBMAP(entry));
13560 			assert(!entry->iokit_acct);
13561 			entry->use_pmap = TRUE;
13562 			VME_OBJECT_SET(entry, copy_object, false, 0);
13563 
13564 			/* propagate the submap entry's protections */
13565 			if (entry->protection != VM_PROT_READ) {
13566 				/*
13567 				 * Someone has already altered the top entry's
13568 				 * protections via vm_protect(VM_PROT_COPY).
13569 				 * Respect these new values and ignore the
13570 				 * submap entry's protections.
13571 				 */
13572 			} else {
13573 				/*
13574 				 * Regular copy-on-write: propagate the submap
13575 				 * entry's protections to the top map entry.
13576 				 */
13577 				entry->protection |= subentry_protection;
13578 			}
13579 			entry->max_protection |= subentry_max_protection;
13580 			/* propagate no_copy_on_read */
13581 			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
13582 
13583 			if ((entry->protection & VM_PROT_WRITE) &&
13584 			    (entry->protection & VM_PROT_EXECUTE) &&
13585 #if XNU_TARGET_OS_OSX
13586 			    map->pmap != kernel_pmap &&
13587 			    (vm_map_cs_enforcement(map)
13588 #if __arm64__
13589 			    || !VM_MAP_IS_EXOTIC(map)
13590 #endif /* __arm64__ */
13591 			    ) &&
13592 #endif /* XNU_TARGET_OS_OSX */
13593 			    !(entry->used_for_jit) &&
13594 			    VM_MAP_POLICY_WX_STRIP_X(map)) {
13595 				DTRACE_VM3(cs_wx,
13596 				    uint64_t, (uint64_t)entry->vme_start,
13597 				    uint64_t, (uint64_t)entry->vme_end,
13598 				    vm_prot_t, entry->protection);
13599 				printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
13600 				    proc_selfpid(),
13601 				    (current_task()->bsd_info
13602 				    ? proc_name_address(current_task()->bsd_info)
13603 				    : "?"),
13604 				    __FUNCTION__);
13605 				entry->protection &= ~VM_PROT_EXECUTE;
13606 			}
13607 
13608 			if (object_copied) {
13609 				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
13610 				entry->needs_copy = object_copied_needs_copy;
13611 				entry->is_shared = FALSE;
13612 			} else {
13613 				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
13614 				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
13615 				assert(entry->wired_count == 0);
13616 				VME_OFFSET_SET(entry, copy_offset);
13617 				entry->needs_copy = TRUE;
13618 				if (map != old_map) {
13619 					entry->is_shared = TRUE;
13620 				}
13621 			}
13622 			if (entry->inheritance == VM_INHERIT_SHARE) {
13623 				entry->inheritance = VM_INHERIT_COPY;
13624 			}
13625 
13626 			vm_map_lock_write_to_read(map);
13627 		} else {
13628 			if ((cow_sub_map_parent)
13629 			    && (cow_sub_map_parent != *real_map)
13630 			    && (cow_sub_map_parent != map)) {
13631 				vm_map_unlock(cow_sub_map_parent);
13632 			}
13633 			entry = submap_entry;
13634 			vaddr = local_vaddr;
13635 		}
13636 	}
13637 
13638 	/*
13639 	 *	Check whether this task is allowed to have
13640 	 *	this page.
13641 	 */
13642 
13643 	prot = entry->protection;
13644 
13645 	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
13646 		/*
13647 		 * HACK -- if not a stack, then allow execution
13648 		 */
13649 		prot |= VM_PROT_EXECUTE;
13650 	}
13651 
13652 	if (mask_protections) {
13653 		fault_type &= prot;
13654 		if (fault_type == VM_PROT_NONE) {
13655 			goto protection_failure;
13656 		}
13657 	}
13658 	if (((fault_type & prot) != fault_type)
13659 #if __arm64__
13660 	    /* prefetch abort in execute-only page */
13661 	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
13662 #elif defined(__x86_64__)
13663 	    /* Consider the UEXEC bit when handling an EXECUTE fault */
13664 	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
13665 #endif
13666 	    ) {
13667 protection_failure:
13668 		if (*real_map != map) {
13669 			vm_map_unlock(*real_map);
13670 		}
13671 		*real_map = map;
13672 
13673 		if ((fault_type & VM_PROT_EXECUTE) && prot) {
13674 			log_stack_execution_failure((addr64_t)vaddr, prot);
13675 		}
13676 
13677 		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
13678 		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
13679 		/*
13680 		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
13681 		 *
13682 		 * kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
13683 		 */
13684 		return KERN_PROTECTION_FAILURE;
13685 	}
13686 
13687 	/*
13688 	 *	If this page is not pageable, we have to get
13689 	 *	it for all possible accesses.
13690 	 */
13691 
13692 	*wired = (entry->wired_count != 0);
13693 	if (*wired) {
13694 		fault_type = prot;
13695 	}
13696 
13697 	/*
13698 	 *	If the entry was copy-on-write, we either ...
13699 	 */
13700 
13701 	if (entry->needs_copy) {
13702 		/*
13703 		 *	If we want to write the page, we may as well
13704 		 *	handle that now since we've got the map locked.
13705 		 *
13706 		 *	If we don't need to write the page, we just
13707 		 *	demote the permissions allowed.
13708 		 */
13709 
13710 		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
13711 			/*
13712 			 *	Make a new object, and place it in the
13713 			 *	object chain.  Note that no new references
13714 			 *	have appeared -- one just moved from the
13715 			 *	map to the new object.
13716 			 */
13717 
13718 			if (vm_map_lock_read_to_write(map)) {
13719 				vm_map_lock_read(map);
13720 				goto RetryLookup;
13721 			}
13722 
13723 			if (VME_OBJECT(entry)->shadowed == FALSE) {
13724 				vm_object_lock(VME_OBJECT(entry));
13725 				VME_OBJECT(entry)->shadowed = TRUE;
13726 				vm_object_unlock(VME_OBJECT(entry));
13727 			}
13728 			VME_OBJECT_SHADOW(entry,
13729 			    (vm_map_size_t) (entry->vme_end -
13730 			    entry->vme_start));
13731 			entry->needs_copy = FALSE;
13732 
13733 			vm_map_lock_write_to_read(map);
13734 		}
13735 		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
13736 			/*
13737 			 *	We're attempting to read a copy-on-write
13738 			 *	page -- don't allow writes.
13739 			 */
13740 
13741 			prot &= (~VM_PROT_WRITE);
13742 		}
13743 	}
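	/*
	 * Sketch of the copy-on-write transition above (illustrative only):
	 * before a write fault, the entry points at the original object with
	 * "needs_copy" set:
	 *
	 *	entry --(needs_copy)--> original object
	 *
	 * after VME_OBJECT_SHADOW(), the entry points at a fresh shadow
	 * object backed by the original, and "needs_copy" is cleared:
	 *
	 *	entry --> shadow object --(shadow)--> original object
	 *
	 * A read-only, non-wired fault leaves the chain alone and simply
	 * drops VM_PROT_WRITE from the protections returned to the caller.
	 */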
13744 
13745 	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
13746 		/*
13747 		 * We went through a "needs_copy" submap without triggering
13748 		 * a copy, so granting write access to the page would bypass
13749 		 * that submap's "needs_copy".
13750 		 */
13751 		assert(!(fault_type & VM_PROT_WRITE));
13752 		assert(!*wired);
13753 		assert(!force_copy);
13754 		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
13755 		prot &= ~VM_PROT_WRITE;
13756 	}
13757 
13758 	/*
13759 	 *	Create an object if necessary.
13760 	 */
13761 	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
13762 		if (vm_map_lock_read_to_write(map)) {
13763 			vm_map_lock_read(map);
13764 			goto RetryLookup;
13765 		}
13766 
13767 		VME_OBJECT_SET(entry,
13768 		    vm_object_allocate(
13769 			    (vm_map_size_t)(entry->vme_end -
13770 			    entry->vme_start)), false, 0);
13771 		VME_OFFSET_SET(entry, 0);
13772 		assert(entry->use_pmap);
13773 		vm_map_lock_write_to_read(map);
13774 	}
13775 
13776 	/*
13777 	 *	Return the object/offset from this entry.  If the entry
13778 	 *	was copy-on-write or empty, it has been fixed up.  Also
13779 	 *	return the protection.
13780 	 */
13781 
13782 	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
13783 	*object = VME_OBJECT(entry);
13784 	*out_prot = prot;
13785 	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
13786 
13787 	if (fault_info) {
13788 		fault_info->interruptible = THREAD_UNINT; /* for now... */
13789 		/* ... the caller will change "interruptible" if needed */
13790 		fault_info->cluster_size = 0;
13791 		fault_info->user_tag = VME_ALIAS(entry);
13792 		fault_info->pmap_options = 0;
13793 		if (entry->iokit_acct ||
13794 		    (!entry->is_sub_map && !entry->use_pmap)) {
13795 			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
13796 		}
13797 		fault_info->behavior = entry->behavior;
13798 		fault_info->lo_offset = VME_OFFSET(entry);
13799 		fault_info->hi_offset =
13800 		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
13801 		fault_info->no_cache  = entry->no_cache;
13802 		fault_info->stealth = FALSE;
13803 		fault_info->io_sync = FALSE;
13804 		if (entry->used_for_jit ||
13805 		    entry->vme_resilient_codesign) {
13806 			fault_info->cs_bypass = TRUE;
13807 		} else {
13808 			fault_info->cs_bypass = FALSE;
13809 		}
13810 		fault_info->pmap_cs_associated = FALSE;
13811 #if CONFIG_PMAP_CS
13812 		if (entry->pmap_cs_associated) {
13813 			/*
13814 			 * The pmap layer will validate this page
13815 			 * before allowing it to be executed from.
13816 			 */
13817 			fault_info->pmap_cs_associated = TRUE;
13818 		}
13819 #endif /* CONFIG_PMAP_CS */
13820 		fault_info->mark_zf_absent = FALSE;
13821 		fault_info->batch_pmap_op = FALSE;
13822 		fault_info->resilient_media = entry->vme_resilient_media;
13823 		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
13824 		if (entry->translated_allow_execute) {
13825 			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
13826 		}
13827 	}
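	/*
	 * Note: "lo_offset" and "hi_offset" bound the portion of the object
	 * that backs this entry; the fault path is expected to keep any
	 * clustered paging activity within [lo_offset, hi_offset).
	 */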
13828 
13829 	/*
13830 	 *	Lock the object to prevent it from disappearing
13831 	 */
13832 	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
13833 		if (contended == NULL) {
13834 			vm_object_lock(*object);
13835 		} else {
13836 			*contended = vm_object_lock_check_contended(*object);
13837 		}
13838 	} else {
13839 		vm_object_lock_shared(*object);
13840 	}
13841 
13842 	/*
13843 	 *	Save the version number
13844 	 */
13845 
13846 	out_version->main_timestamp = map->timestamp;
13847 
13848 	return KERN_SUCCESS;
13849 }
13850 
13851 
13852 /*
13853  *	vm_map_verify:
13854  *
13855  *	Verifies that the map in question has not changed
13856  *	since the given version. The map has to be locked
13857  *	("shared" mode is fine) before calling this function
13858  *	and it will be returned locked too.
13859  */
13860 boolean_t
13861 vm_map_verify(
13862 	vm_map_t                map,
13863 	vm_map_version_t        *version)       /* REF */
13864 {
13865 	boolean_t       result;
13866 
13867 	vm_map_lock_assert_held(map);
13868 	result = (map->timestamp == version->main_timestamp);
13869 
13870 	return result;
13871 }
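/*
 * Typical usage pattern (illustrative sketch, not an actual caller; the
 * exact argument list is elided):
 *
 *	vm_map_version_t	version;
 *
 *	kr = vm_map_lookup_locked(&map, vaddr, fault_type, ...,
 *	    &version, &object, &offset, &prot, &wired, fault_info,
 *	    &real_map, NULL);
 *	vm_map_unlock_read(map);	// drop the lock to resolve the fault
 *	... fault the page in from "object" ...
 *	vm_map_lock_read(map);
 *	if (!vm_map_verify(map, &version)) {
 *		// the map changed while unlocked: redo the lookup
 *	}
 *
 * The page-fault path (vm_fault.c) follows this lookup / unlock /
 * re-verify pattern.
 */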
13872 
13873 /*
13874  *	TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
13875  *	Goes away after regular vm_region_recurse function migrates to
13876  *	64 bits
13877  *	vm_region_recurse: A form of vm_region which follows the
13878  *	submaps in a target map
13879  *
13880  */
13881 
13882 kern_return_t
13883 vm_map_region_recurse_64(
13884 	vm_map_t                 map,
13885 	vm_map_offset_t *address,               /* IN/OUT */
13886 	vm_map_size_t           *size,                  /* OUT */
13887 	natural_t               *nesting_depth, /* IN/OUT */
13888 	vm_region_submap_info_64_t      submap_info,    /* IN/OUT */
13889 	mach_msg_type_number_t  *count) /* IN/OUT */
13890 {
13891 	mach_msg_type_number_t  original_count;
13892 	vm_region_extended_info_data_t  extended;
13893 	vm_map_entry_t                  tmp_entry;
13894 	vm_map_offset_t                 user_address;
13895 	unsigned int                    user_max_depth;
13896 
13897 	/*
13898 	 * "curr_entry" is the VM map entry preceding or including the
13899 	 * address we're looking for.
13900 	 * "curr_map" is the map or sub-map containing "curr_entry".
13901 	 * "curr_address" is the equivalent of the top map's "user_address"
13902 	 * in the current map.
13903 	 * "curr_offset" is the cumulated offset of "curr_map" in the
13904 	 * "curr_offset" is the cumulative offset of "curr_map" in the
13905 	 * "curr_depth" is the depth of "curr_map" in the chain of
13906 	 * sub-maps.
13907 	 *
13908 	 * "curr_max_below" and "curr_max_above" limit the range (around
13909 	 * "curr_address") we should take into account in the current (sub)map.
13910 	 * They limit the range to what's visible through the map entries
13911 	 * we've traversed from the top map to the current map.
13912 	 *
13913 	 */
13914 	vm_map_entry_t                  curr_entry;
13915 	vm_map_address_t                curr_address;
13916 	vm_map_offset_t                 curr_offset;
13917 	vm_map_t                        curr_map;
13918 	unsigned int                    curr_depth;
13919 	vm_map_offset_t                 curr_max_below, curr_max_above;
13920 	vm_map_offset_t                 curr_skip;
13921 
13922 	/*
13923 	 * "next_" is the same as "curr_" but for the VM region immediately
13924 	 * after the address we're looking for.  We need to keep track of this
13925 	 * too because we want to return info about that region if the
13926 	 * address we're looking for is not mapped.
13927 	 */
13928 	vm_map_entry_t                  next_entry;
13929 	vm_map_offset_t                 next_offset;
13930 	vm_map_offset_t                 next_address;
13931 	vm_map_t                        next_map;
13932 	unsigned int                    next_depth;
13933 	vm_map_offset_t                 next_max_below, next_max_above;
13934 	vm_map_offset_t                 next_skip;
13935 
13936 	boolean_t                       look_for_pages;
13937 	vm_region_submap_short_info_64_t short_info;
13938 	boolean_t                       do_region_footprint;
13939 	int                             effective_page_size, effective_page_shift;
13940 	boolean_t                       submap_needed_copy;
13941 
13942 	if (map == VM_MAP_NULL) {
13943 		/* no address space to work on */
13944 		return KERN_INVALID_ARGUMENT;
13945 	}
13946 
13947 	effective_page_shift = vm_self_region_page_shift(map);
13948 	effective_page_size = (1 << effective_page_shift);
13949 
13950 	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
13951 		/*
13952 		 * "info" structure is not big enough and
13953 		 * would overflow
13954 		 */
13955 		return KERN_INVALID_ARGUMENT;
13956 	}
13957 
13958 	do_region_footprint = task_self_region_footprint();
13959 	original_count = *count;
13960 
13961 	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
13962 		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
13963 		look_for_pages = FALSE;
13964 		short_info = (vm_region_submap_short_info_64_t) submap_info;
13965 		submap_info = NULL;
13966 	} else {
13967 		look_for_pages = TRUE;
13968 		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
13969 		short_info = NULL;
13970 
13971 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
13972 			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
13973 		}
13974 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
13975 			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
13976 		}
13977 	}
13978 
13979 	user_address = *address;
13980 	user_max_depth = *nesting_depth;
13981 	submap_needed_copy = FALSE;
13982 
13983 	if (not_in_kdp) {
13984 		vm_map_lock_read(map);
13985 	}
13986 
13987 recurse_again:
13988 	curr_entry = NULL;
13989 	curr_map = map;
13990 	curr_address = user_address;
13991 	curr_offset = 0;
13992 	curr_skip = 0;
13993 	curr_depth = 0;
13994 	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
13995 	curr_max_below = curr_address;
13996 
13997 	next_entry = NULL;
13998 	next_map = NULL;
13999 	next_address = 0;
14000 	next_offset = 0;
14001 	next_skip = 0;
14002 	next_depth = 0;
14003 	next_max_above = (vm_map_offset_t) -1;
14004 	next_max_below = (vm_map_offset_t) -1;
14005 
14006 	for (;;) {
14007 		if (vm_map_lookup_entry(curr_map,
14008 		    curr_address,
14009 		    &tmp_entry)) {
14010 			/* tmp_entry contains the address we're looking for */
14011 			curr_entry = tmp_entry;
14012 		} else {
14013 			vm_map_offset_t skip;
14014 			/*
14015 			 * The address is not mapped.  "tmp_entry" is the
14016 			 * map entry preceding the address.  We want the next
14017 			 * one, if it exists.
14018 			 */
14019 			curr_entry = tmp_entry->vme_next;
14020 
14021 			if (curr_entry == vm_map_to_entry(curr_map) ||
14022 			    (curr_entry->vme_start >=
14023 			    curr_address + curr_max_above)) {
14024 				/* no next entry at this level: stop looking */
14025 				if (not_in_kdp) {
14026 					vm_map_unlock_read(curr_map);
14027 				}
14028 				curr_entry = NULL;
14029 				curr_map = NULL;
14030 				curr_skip = 0;
14031 				curr_offset = 0;
14032 				curr_depth = 0;
14033 				curr_max_above = 0;
14034 				curr_max_below = 0;
14035 				break;
14036 			}
14037 
14038 			/* adjust current address and offset */
14039 			skip = curr_entry->vme_start - curr_address;
14040 			curr_address = curr_entry->vme_start;
14041 			curr_skip += skip;
14042 			curr_offset += skip;
14043 			curr_max_above -= skip;
14044 			curr_max_below = 0;
14045 		}
14046 
14047 		/*
14048 		 * Is the next entry at this level closer to the address (or
14049 		 * deeper in the submap chain) than the one we had
14050 		 * so far ?
14051 		 */
14052 		tmp_entry = curr_entry->vme_next;
14053 		if (tmp_entry == vm_map_to_entry(curr_map)) {
14054 			/* no next entry at this level */
14055 		} else if (tmp_entry->vme_start >=
14056 		    curr_address + curr_max_above) {
14057 			/*
14058 			 * tmp_entry is beyond the scope of what we mapped of
14059 			 * this submap in the upper level: ignore it.
14060 			 */
14061 		} else if ((next_entry == NULL) ||
14062 		    (tmp_entry->vme_start + curr_offset <=
14063 		    next_entry->vme_start + next_offset)) {
14064 			/*
14065 			 * We didn't have a "next_entry" or this one is
14066 			 * closer to the address we're looking for:
14067 			 * use this "tmp_entry" as the new "next_entry".
14068 			 */
14069 			if (next_entry != NULL) {
14070 				/* unlock the last "next_map" */
14071 				if (next_map != curr_map && not_in_kdp) {
14072 					vm_map_unlock_read(next_map);
14073 				}
14074 			}
14075 			next_entry = tmp_entry;
14076 			next_map = curr_map;
14077 			next_depth = curr_depth;
14078 			next_address = next_entry->vme_start;
14079 			next_skip = curr_skip;
14080 			next_skip += (next_address - curr_address);
14081 			next_offset = curr_offset;
14082 			next_offset += (next_address - curr_address);
14083 			next_max_above = MIN(next_max_above, curr_max_above);
14084 			next_max_above = MIN(next_max_above,
14085 			    next_entry->vme_end - next_address);
14086 			next_max_below = MIN(next_max_below, curr_max_below);
14087 			next_max_below = MIN(next_max_below,
14088 			    next_address - next_entry->vme_start);
14089 		}
14090 
14091 		/*
14092 		 * "curr_max_{above,below}" allow us to keep track of the
14093 		 * portion of the submap that is actually mapped at this level:
14094 		 * the rest of that submap is irrelevant to us, since it's not
14095 		 * mapped here.
14096 		 * The relevant portion of the map starts at
14097 		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
14098 		 */
14099 		curr_max_above = MIN(curr_max_above,
14100 		    curr_entry->vme_end - curr_address);
14101 		curr_max_below = MIN(curr_max_below,
14102 		    curr_address - curr_entry->vme_start);
14103 
14104 		if (!curr_entry->is_sub_map ||
14105 		    curr_depth >= user_max_depth) {
14106 			/*
14107 			 * We hit a leaf map or we reached the maximum depth
14108 			 * we could, so stop looking.  Keep the current map
14109 			 * locked.
14110 			 */
14111 			break;
14112 		}
14113 
14114 		/*
14115 		 * Get down to the next submap level.
14116 		 */
14117 
14118 		if (curr_entry->needs_copy) {
14119 			/* everything below this is effectively copy-on-write */
14120 			submap_needed_copy = TRUE;
14121 		}
14122 
14123 		/*
14124 		 * Lock the next level and unlock the current level,
14125 		 * unless we need to keep it locked to access the "next_entry"
14126 		 * later.
14127 		 */
14128 		if (not_in_kdp) {
14129 			vm_map_lock_read(VME_SUBMAP(curr_entry));
14130 		}
14131 		if (curr_map == next_map) {
14132 			/* keep "next_map" locked in case we need it */
14133 		} else {
14134 			/* release this map */
14135 			if (not_in_kdp) {
14136 				vm_map_unlock_read(curr_map);
14137 			}
14138 		}
14139 
14140 		/*
14141 		 * Adjust the offset.  "curr_entry" maps the submap
14142 		 * at relative address "curr_entry->vme_start" in the
14143 		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
14144 		 * bytes of the submap.
14145 		 * "curr_offset" always represents the offset of a virtual
14146 		 * address in the curr_map relative to the absolute address
14147 		 * space (i.e. the top-level VM map).
14148 		 */
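		/*
		 * Hypothetical example: if "curr_entry" maps its submap at
		 * vme_start 0x7000000 with VME_OFFSET 0x1000000, a top-level
		 * address of 0x7004000 corresponds to address 0x1004000
		 * inside the submap.  Assuming this is the first nesting
		 * level (curr_offset was 0), curr_offset grows by
		 * (0x1000000 - 0x7000000) and curr_address becomes
		 * user_address + curr_offset, i.e. 0x1004000.
		 */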
14149 		curr_offset +=
14150 		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
14151 		curr_address = user_address + curr_offset;
14152 		/* switch to the submap */
14153 		curr_map = VME_SUBMAP(curr_entry);
14154 		curr_depth++;
14155 		curr_entry = NULL;
14156 	}
14157 
14158 // LP64todo: all the current tools are 32bit, obviously never worked for 64b
14159 // so probably should be a real 32b ID vs. ptr.
14160 // Current users just check for equality
14161 
14162 	if (curr_entry == NULL) {
14163 		/* no VM region contains the address... */
14164 
14165 		if (do_region_footprint && /* we want footprint numbers */
14166 		    next_entry == NULL && /* & there are no more regions */
14167 		    /* & we haven't already provided our fake region: */
14168 		    user_address <= vm_map_last_entry(map)->vme_end) {
14169 			ledger_amount_t ledger_resident, ledger_compressed;
14170 
14171 			/*
14172 			 * Add a fake memory region to account for
14173 			 * purgeable and/or ledger-tagged memory that
14174 			 * counts towards this task's memory footprint,
14175 			 * i.e. the resident/compressed pages of non-volatile
14176 			 * objects owned by that task.
14177 			 */
14178 			task_ledgers_footprint(map->pmap->ledger,
14179 			    &ledger_resident,
14180 			    &ledger_compressed);
14181 			if (ledger_resident + ledger_compressed == 0) {
14182 				/* no purgeable memory usage to report */
14183 				return KERN_INVALID_ADDRESS;
14184 			}
14185 			/* fake region to show nonvolatile footprint */
14186 			if (look_for_pages) {
14187 				submap_info->protection = VM_PROT_DEFAULT;
14188 				submap_info->max_protection = VM_PROT_DEFAULT;
14189 				submap_info->inheritance = VM_INHERIT_DEFAULT;
14190 				submap_info->offset = 0;
14191 				submap_info->user_tag = -1;
14192 				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
14193 				submap_info->pages_shared_now_private = 0;
14194 				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
14195 				submap_info->pages_dirtied = submap_info->pages_resident;
14196 				submap_info->ref_count = 1;
14197 				submap_info->shadow_depth = 0;
14198 				submap_info->external_pager = 0;
14199 				submap_info->share_mode = SM_PRIVATE;
14200 				if (submap_needed_copy) {
14201 					submap_info->share_mode = SM_COW;
14202 				}
14203 				submap_info->is_submap = 0;
14204 				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
14205 				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
14206 				submap_info->user_wired_count = 0;
14207 				submap_info->pages_reusable = 0;
14208 			} else {
14209 				short_info->user_tag = -1;
14210 				short_info->offset = 0;
14211 				short_info->protection = VM_PROT_DEFAULT;
14212 				short_info->inheritance = VM_INHERIT_DEFAULT;
14213 				short_info->max_protection = VM_PROT_DEFAULT;
14214 				short_info->behavior = VM_BEHAVIOR_DEFAULT;
14215 				short_info->user_wired_count = 0;
14216 				short_info->is_submap = 0;
14217 				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
14218 				short_info->external_pager = 0;
14219 				short_info->shadow_depth = 0;
14220 				short_info->share_mode = SM_PRIVATE;
14221 				if (submap_needed_copy) {
14222 					short_info->share_mode = SM_COW;
14223 				}
14224 				short_info->ref_count = 1;
14225 			}
14226 			*nesting_depth = 0;
14227 			*size = (vm_map_size_t) (ledger_resident + ledger_compressed);
14228 //			*address = user_address;
14229 			*address = vm_map_last_entry(map)->vme_end;
14230 			return KERN_SUCCESS;
14231 		}
14232 
14233 		if (next_entry == NULL) {
14234 			/* ... and no VM region follows it either */
14235 			return KERN_INVALID_ADDRESS;
14236 		}
14237 		/* ... gather info about the next VM region */
14238 		curr_entry = next_entry;
14239 		curr_map = next_map;    /* still locked ... */
14240 		curr_address = next_address;
14241 		curr_skip = next_skip;
14242 		curr_offset = next_offset;
14243 		curr_depth = next_depth;
14244 		curr_max_above = next_max_above;
14245 		curr_max_below = next_max_below;
14246 	} else {
14247 		/* we won't need "next_entry" after all */
14248 		if (next_entry != NULL) {
14249 			/* release "next_map" */
14250 			if (next_map != curr_map && not_in_kdp) {
14251 				vm_map_unlock_read(next_map);
14252 			}
14253 		}
14254 	}
14255 	next_entry = NULL;
14256 	next_map = NULL;
14257 	next_offset = 0;
14258 	next_skip = 0;
14259 	next_depth = 0;
14260 	next_max_below = -1;
14261 	next_max_above = -1;
14262 
14263 	if (curr_entry->is_sub_map &&
14264 	    curr_depth < user_max_depth) {
14265 		/*
14266 		 * We're not as deep as we could be:  we must have
14267 		 * gone back up after not finding anything mapped
14268 		 * below the original top-level map entry's range.
14269 		 * Let's move "curr_address" forward and recurse again.
14270 		 */
14271 		user_address = curr_address;
14272 		goto recurse_again;
14273 	}
14274 
14275 	*nesting_depth = curr_depth;
14276 	*size = curr_max_above + curr_max_below;
14277 	*address = user_address + curr_skip - curr_max_below;
14278 
14279 	if (look_for_pages) {
14280 		submap_info->user_tag = VME_ALIAS(curr_entry);
14281 		submap_info->offset = VME_OFFSET(curr_entry);
14282 		submap_info->protection = curr_entry->protection;
14283 		submap_info->inheritance = curr_entry->inheritance;
14284 		submap_info->max_protection = curr_entry->max_protection;
14285 		submap_info->behavior = curr_entry->behavior;
14286 		submap_info->user_wired_count = curr_entry->user_wired_count;
14287 		submap_info->is_submap = curr_entry->is_sub_map;
14288 		if (curr_entry->is_sub_map) {
14289 			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
14290 		} else {
14291 			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
14292 		}
14293 	} else {
14294 		short_info->user_tag = VME_ALIAS(curr_entry);
14295 		short_info->offset = VME_OFFSET(curr_entry);
14296 		short_info->protection = curr_entry->protection;
14297 		short_info->inheritance = curr_entry->inheritance;
14298 		short_info->max_protection = curr_entry->max_protection;
14299 		short_info->behavior = curr_entry->behavior;
14300 		short_info->user_wired_count = curr_entry->user_wired_count;
14301 		short_info->is_submap = curr_entry->is_sub_map;
14302 		if (curr_entry->is_sub_map) {
14303 			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
14304 		} else {
14305 			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
14306 		}
14307 	}
14308 
14309 	extended.pages_resident = 0;
14310 	extended.pages_swapped_out = 0;
14311 	extended.pages_shared_now_private = 0;
14312 	extended.pages_dirtied = 0;
14313 	extended.pages_reusable = 0;
14314 	extended.external_pager = 0;
14315 	extended.shadow_depth = 0;
14316 	extended.share_mode = SM_EMPTY;
14317 	extended.ref_count = 0;
14318 
14319 	if (not_in_kdp) {
14320 		if (!curr_entry->is_sub_map) {
14321 			vm_map_offset_t range_start, range_end;
14322 			range_start = MAX((curr_address - curr_max_below),
14323 			    curr_entry->vme_start);
14324 			range_end = MIN((curr_address + curr_max_above),
14325 			    curr_entry->vme_end);
14326 			vm_map_region_walk(curr_map,
14327 			    range_start,
14328 			    curr_entry,
14329 			    (VME_OFFSET(curr_entry) +
14330 			    (range_start -
14331 			    curr_entry->vme_start)),
14332 			    range_end - range_start,
14333 			    &extended,
14334 			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
14335 			if (extended.external_pager &&
14336 			    extended.ref_count == 2 &&
14337 			    extended.share_mode == SM_SHARED) {
14338 				extended.share_mode = SM_PRIVATE;
14339 			}
14340 			if (submap_needed_copy) {
14341 				extended.share_mode = SM_COW;
14342 			}
14343 		} else {
14344 			if (curr_entry->use_pmap) {
14345 				extended.share_mode = SM_TRUESHARED;
14346 			} else {
14347 				extended.share_mode = SM_PRIVATE;
14348 			}
14349 			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
14350 		}
14351 	}
14352 
14353 	if (look_for_pages) {
14354 		submap_info->pages_resident = extended.pages_resident;
14355 		submap_info->pages_swapped_out = extended.pages_swapped_out;
14356 		submap_info->pages_shared_now_private =
14357 		    extended.pages_shared_now_private;
14358 		submap_info->pages_dirtied = extended.pages_dirtied;
14359 		submap_info->external_pager = extended.external_pager;
14360 		submap_info->shadow_depth = extended.shadow_depth;
14361 		submap_info->share_mode = extended.share_mode;
14362 		submap_info->ref_count = extended.ref_count;
14363 
14364 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
14365 			submap_info->pages_reusable = extended.pages_reusable;
14366 		}
14367 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
14368 			if (curr_entry->is_sub_map) {
14369 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_SUBMAP(curr_entry));
14370 			} else if (VME_OBJECT(curr_entry)) {
14371 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_OBJECT(curr_entry));
14372 			} else {
14373 				submap_info->object_id_full = 0ull;
14374 			}
14375 		}
14376 	} else {
14377 		short_info->external_pager = extended.external_pager;
14378 		short_info->shadow_depth = extended.shadow_depth;
14379 		short_info->share_mode = extended.share_mode;
14380 		short_info->ref_count = extended.ref_count;
14381 	}
14382 
14383 	if (not_in_kdp) {
14384 		vm_map_unlock_read(curr_map);
14385 	}
14386 
14387 	return KERN_SUCCESS;
14388 }
14389 
14390 /*
14391  *	vm_region:
14392  *
14393  *	User call to obtain information about a region in
14394  *	a task's address map. Currently, only one flavor is
14395  *	supported.
14396  *
14397  *	XXX The reserved and behavior fields cannot be filled
14398  *	    in until the vm merge from the IK is completed, and
14399  *	    vm_reserve is implemented.
14400  */
14401 
14402 kern_return_t
14403 vm_map_region(
14404 	vm_map_t                 map,
14405 	vm_map_offset_t *address,               /* IN/OUT */
14406 	vm_map_size_t           *size,                  /* OUT */
14407 	vm_region_flavor_t       flavor,                /* IN */
14408 	vm_region_info_t         info,                  /* OUT */
14409 	mach_msg_type_number_t  *count, /* IN/OUT */
14410 	mach_port_t             *object_name)           /* OUT */
14411 {
14412 	vm_map_entry_t          tmp_entry;
14413 	vm_map_entry_t          entry;
14414 	vm_map_offset_t         start;
14415 
14416 	if (map == VM_MAP_NULL) {
14417 		return KERN_INVALID_ARGUMENT;
14418 	}
14419 
14420 	switch (flavor) {
14421 	case VM_REGION_BASIC_INFO:
14422 		/* legacy for old 32-bit objects info */
14423 	{
14424 		vm_region_basic_info_t  basic;
14425 
14426 		if (*count < VM_REGION_BASIC_INFO_COUNT) {
14427 			return KERN_INVALID_ARGUMENT;
14428 		}
14429 
14430 		basic = (vm_region_basic_info_t) info;
14431 		*count = VM_REGION_BASIC_INFO_COUNT;
14432 
14433 		vm_map_lock_read(map);
14434 
14435 		start = *address;
14436 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
14437 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
14438 				vm_map_unlock_read(map);
14439 				return KERN_INVALID_ADDRESS;
14440 			}
14441 		} else {
14442 			entry = tmp_entry;
14443 		}
14444 
14445 		start = entry->vme_start;
14446 
14447 		basic->offset = (uint32_t)VME_OFFSET(entry);
14448 		basic->protection = entry->protection;
14449 		basic->inheritance = entry->inheritance;
14450 		basic->max_protection = entry->max_protection;
14451 		basic->behavior = entry->behavior;
14452 		basic->user_wired_count = entry->user_wired_count;
14453 		basic->reserved = entry->is_sub_map;
14454 		*address = start;
14455 		*size = (entry->vme_end - start);
14456 
14457 		if (object_name) {
14458 			*object_name = IP_NULL;
14459 		}
14460 		if (entry->is_sub_map) {
14461 			basic->shared = FALSE;
14462 		} else {
14463 			basic->shared = entry->is_shared;
14464 		}
14465 
14466 		vm_map_unlock_read(map);
14467 		return KERN_SUCCESS;
14468 	}
14469 
14470 	case VM_REGION_BASIC_INFO_64:
14471 	{
14472 		vm_region_basic_info_64_t       basic;
14473 
14474 		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
14475 			return KERN_INVALID_ARGUMENT;
14476 		}
14477 
14478 		basic = (vm_region_basic_info_64_t) info;
14479 		*count = VM_REGION_BASIC_INFO_COUNT_64;
14480 
14481 		vm_map_lock_read(map);
14482 
14483 		start = *address;
14484 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
14485 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
14486 				vm_map_unlock_read(map);
14487 				return KERN_INVALID_ADDRESS;
14488 			}
14489 		} else {
14490 			entry = tmp_entry;
14491 		}
14492 
14493 		start = entry->vme_start;
14494 
14495 		basic->offset = VME_OFFSET(entry);
14496 		basic->protection = entry->protection;
14497 		basic->inheritance = entry->inheritance;
14498 		basic->max_protection = entry->max_protection;
14499 		basic->behavior = entry->behavior;
14500 		basic->user_wired_count = entry->user_wired_count;
14501 		basic->reserved = entry->is_sub_map;
14502 		*address = start;
14503 		*size = (entry->vme_end - start);
14504 
14505 		if (object_name) {
14506 			*object_name = IP_NULL;
14507 		}
14508 		if (entry->is_sub_map) {
14509 			basic->shared = FALSE;
14510 		} else {
14511 			basic->shared = entry->is_shared;
14512 		}
14513 
14514 		vm_map_unlock_read(map);
14515 		return KERN_SUCCESS;
14516 	}
14517 	case VM_REGION_EXTENDED_INFO:
14518 		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
14519 			return KERN_INVALID_ARGUMENT;
14520 		}
14521 		OS_FALLTHROUGH;
14522 	case VM_REGION_EXTENDED_INFO__legacy:
14523 		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
14524 			return KERN_INVALID_ARGUMENT;
14525 		}
14526 
14527 		{
14528 			vm_region_extended_info_t       extended;
14529 			mach_msg_type_number_t original_count;
14530 			int effective_page_size, effective_page_shift;
14531 
14532 			extended = (vm_region_extended_info_t) info;
14533 
14534 			effective_page_shift = vm_self_region_page_shift(map);
14535 			effective_page_size = (1 << effective_page_shift);
14536 
14537 			vm_map_lock_read(map);
14538 
14539 			start = *address;
14540 			if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
14541 				if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
14542 					vm_map_unlock_read(map);
14543 					return KERN_INVALID_ADDRESS;
14544 				}
14545 			} else {
14546 				entry = tmp_entry;
14547 			}
14548 			start = entry->vme_start;
14549 
14550 			extended->protection = entry->protection;
14551 			extended->user_tag = VME_ALIAS(entry);
14552 			extended->pages_resident = 0;
14553 			extended->pages_swapped_out = 0;
14554 			extended->pages_shared_now_private = 0;
14555 			extended->pages_dirtied = 0;
14556 			extended->external_pager = 0;
14557 			extended->shadow_depth = 0;
14558 
14559 			original_count = *count;
14560 			if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
14561 				*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
14562 			} else {
14563 				extended->pages_reusable = 0;
14564 				*count = VM_REGION_EXTENDED_INFO_COUNT;
14565 			}
14566 
14567 			vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
14568 
14569 			if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
14570 				extended->share_mode = SM_PRIVATE;
14571 			}
14572 
14573 			if (object_name) {
14574 				*object_name = IP_NULL;
14575 			}
14576 			*address = start;
14577 			*size = (entry->vme_end - start);
14578 
14579 			vm_map_unlock_read(map);
14580 			return KERN_SUCCESS;
14581 		}
14582 	case VM_REGION_TOP_INFO:
14583 	{
14584 		vm_region_top_info_t    top;
14585 
14586 		if (*count < VM_REGION_TOP_INFO_COUNT) {
14587 			return KERN_INVALID_ARGUMENT;
14588 		}
14589 
14590 		top = (vm_region_top_info_t) info;
14591 		*count = VM_REGION_TOP_INFO_COUNT;
14592 
14593 		vm_map_lock_read(map);
14594 
14595 		start = *address;
14596 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
14597 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
14598 				vm_map_unlock_read(map);
14599 				return KERN_INVALID_ADDRESS;
14600 			}
14601 		} else {
14602 			entry = tmp_entry;
14603 		}
14604 		start = entry->vme_start;
14605 
14606 		top->private_pages_resident = 0;
14607 		top->shared_pages_resident = 0;
14608 
14609 		vm_map_region_top_walk(entry, top);
14610 
14611 		if (object_name) {
14612 			*object_name = IP_NULL;
14613 		}
14614 		*address = start;
14615 		*size = (entry->vme_end - start);
14616 
14617 		vm_map_unlock_read(map);
14618 		return KERN_SUCCESS;
14619 	}
14620 	default:
14621 		return KERN_INVALID_ARGUMENT;
14622 	}
14623 }
14624 
14625 #define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
14626 	MIN((entry_size),                                               \
14627 	    ((obj)->all_reusable ?                                      \
14628 	     (obj)->wired_page_count :                                  \
14629 	     (obj)->resident_page_count - (obj)->reusable_page_count))
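/*
 * Reading of the macro above (descriptive only, no behavior change): for an
 * entry spanning "entry_size" pages, report at most that many resident
 * pages; if the whole object is marked "all_reusable", only its wired pages
 * still count as resident, otherwise reusable pages are subtracted from the
 * resident page count.
 */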
14630 
14631 void
14632 vm_map_region_top_walk(
14633 	vm_map_entry_t             entry,
14634 	vm_region_top_info_t       top)
14635 {
14636 	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
14637 		top->share_mode = SM_EMPTY;
14638 		top->ref_count = 0;
14639 		top->obj_id = 0;
14640 		return;
14641 	}
14642 
14643 	{
14644 		struct  vm_object *obj, *tmp_obj;
14645 		int             ref_count;
14646 		uint32_t        entry_size;
14647 
14648 		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
14649 
14650 		obj = VME_OBJECT(entry);
14651 
14652 		vm_object_lock(obj);
14653 
14654 		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
14655 			ref_count--;
14656 		}
14657 
14658 		assert(obj->reusable_page_count <= obj->resident_page_count);
14659 		if (obj->shadow) {
14660 			if (ref_count == 1) {
14661 				top->private_pages_resident =
14662 				    OBJ_RESIDENT_COUNT(obj, entry_size);
14663 			} else {
14664 				top->shared_pages_resident =
14665 				    OBJ_RESIDENT_COUNT(obj, entry_size);
14666 			}
14667 			top->ref_count  = ref_count;
14668 			top->share_mode = SM_COW;
14669 
14670 			while ((tmp_obj = obj->shadow)) {
14671 				vm_object_lock(tmp_obj);
14672 				vm_object_unlock(obj);
14673 				obj = tmp_obj;
14674 
14675 				if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
14676 					ref_count--;
14677 				}
14678 
14679 				assert(obj->reusable_page_count <= obj->resident_page_count);
14680 				top->shared_pages_resident +=
14681 				    OBJ_RESIDENT_COUNT(obj, entry_size);
14682 				top->ref_count += ref_count - 1;
14683 			}
14684 		} else {
14685 			if (entry->superpage_size) {
14686 				top->share_mode = SM_LARGE_PAGE;
14687 				top->shared_pages_resident = 0;
14688 				top->private_pages_resident = entry_size;
14689 			} else if (entry->needs_copy) {
14690 				top->share_mode = SM_COW;
14691 				top->shared_pages_resident =
14692 				    OBJ_RESIDENT_COUNT(obj, entry_size);
14693 			} else {
14694 				if (ref_count == 1 ||
14695 				    (ref_count == 2 && obj->named)) {
14696 					top->share_mode = SM_PRIVATE;
14697 					top->private_pages_resident =
14698 					    OBJ_RESIDENT_COUNT(obj,
14699 					    entry_size);
14700 				} else {
14701 					top->share_mode = SM_SHARED;
14702 					top->shared_pages_resident =
14703 					    OBJ_RESIDENT_COUNT(obj,
14704 					    entry_size);
14705 				}
14706 			}
14707 			top->ref_count = ref_count;
14708 		}
14709 		/* XXX K64: obj_id will be truncated */
14710 		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRPERM(obj);
14711 
14712 		vm_object_unlock(obj);
14713 	}
14714 }
14715 
14716 void
14717 vm_map_region_walk(
14718 	vm_map_t                        map,
14719 	vm_map_offset_t                 va,
14720 	vm_map_entry_t                  entry,
14721 	vm_object_offset_t              offset,
14722 	vm_object_size_t                range,
14723 	vm_region_extended_info_t       extended,
14724 	boolean_t                       look_for_pages,
14725 	mach_msg_type_number_t count)
14726 {
14727 	struct vm_object *obj, *tmp_obj;
14728 	vm_map_offset_t       last_offset;
14729 	int               i;
14730 	int               ref_count;
14731 	struct vm_object        *shadow_object;
14732 	unsigned short          shadow_depth;
14733 	boolean_t         do_region_footprint;
14734 	int                     effective_page_size, effective_page_shift;
14735 	vm_map_offset_t         effective_page_mask;
14736 
14737 	do_region_footprint = task_self_region_footprint();
14738 
14739 	if ((entry->is_sub_map) ||
14740 	    (VME_OBJECT(entry) == 0) ||
14741 	    (VME_OBJECT(entry)->phys_contiguous &&
14742 	    !entry->superpage_size)) {
14743 		extended->share_mode = SM_EMPTY;
14744 		extended->ref_count = 0;
14745 		return;
14746 	}
14747 
14748 	if (entry->superpage_size) {
14749 		extended->shadow_depth = 0;
14750 		extended->share_mode = SM_LARGE_PAGE;
14751 		extended->ref_count = 1;
14752 		extended->external_pager = 0;
14753 
14754 		/* TODO4K: Superpage in 4k mode? */
14755 		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
14756 		extended->shadow_depth = 0;
14757 		return;
14758 	}
14759 
14760 	effective_page_shift = vm_self_region_page_shift(map);
14761 	effective_page_size = (1 << effective_page_shift);
14762 	effective_page_mask = effective_page_size - 1;
14763 
14764 	offset = vm_map_trunc_page(offset, effective_page_mask);
14765 
14766 	obj = VME_OBJECT(entry);
14767 
14768 	vm_object_lock(obj);
14769 
14770 	if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
14771 		ref_count--;
14772 	}
14773 
14774 	if (look_for_pages) {
14775 		for (last_offset = offset + range;
14776 		    offset < last_offset;
14777 		    offset += effective_page_size, va += effective_page_size) {
14778 			if (do_region_footprint) {
14779 				int disp;
14780 
14781 				disp = 0;
14782 				if (map->has_corpse_footprint) {
14783 					/*
14784 					 * Query the page info data we saved
14785 					 * while forking the corpse.
14786 					 */
14787 					vm_map_corpse_footprint_query_page_info(
14788 						map,
14789 						va,
14790 						&disp);
14791 				} else {
14792 					/*
14793 					 * Query the pmap.
14794 					 */
14795 					vm_map_footprint_query_page_info(
14796 						map,
14797 						entry,
14798 						va,
14799 						&disp);
14800 				}
14801 				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
14802 					extended->pages_resident++;
14803 				}
14804 				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
14805 					extended->pages_reusable++;
14806 				}
14807 				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
14808 					extended->pages_dirtied++;
14809 				}
14810 				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
14811 					extended->pages_swapped_out++;
14812 				}
14813 				continue;
14814 			}
14815 
14816 			vm_map_region_look_for_page(map, va, obj,
14817 			    vm_object_trunc_page(offset), ref_count,
14818 			    0, extended, count);
14819 		}
14820 
14821 		if (do_region_footprint) {
14822 			goto collect_object_info;
14823 		}
14824 	} else {
14825 collect_object_info:
14826 		shadow_object = obj->shadow;
14827 		shadow_depth = 0;
14828 
14829 		if (!(obj->internal)) {
14830 			extended->external_pager = 1;
14831 		}
14832 
14833 		if (shadow_object != VM_OBJECT_NULL) {
14834 			vm_object_lock(shadow_object);
14835 			for (;
14836 			    shadow_object != VM_OBJECT_NULL;
14837 			    shadow_depth++) {
14838 				vm_object_t     next_shadow;
14839 
14840 				if (!(shadow_object->internal)) {
14841 					extended->external_pager = 1;
14842 				}
14843 
14844 				next_shadow = shadow_object->shadow;
14845 				if (next_shadow) {
14846 					vm_object_lock(next_shadow);
14847 				}
14848 				vm_object_unlock(shadow_object);
14849 				shadow_object = next_shadow;
14850 			}
14851 		}
14852 		extended->shadow_depth = shadow_depth;
14853 	}
14854 
14855 	if (extended->shadow_depth || entry->needs_copy) {
14856 		extended->share_mode = SM_COW;
14857 	} else {
14858 		if (ref_count == 1) {
14859 			extended->share_mode = SM_PRIVATE;
14860 		} else {
14861 			if (obj->true_share) {
14862 				extended->share_mode = SM_TRUESHARED;
14863 			} else {
14864 				extended->share_mode = SM_SHARED;
14865 			}
14866 		}
14867 	}
14868 	extended->ref_count = ref_count - extended->shadow_depth;
14869 
14870 	for (i = 0; i < extended->shadow_depth; i++) {
14871 		if ((tmp_obj = obj->shadow) == 0) {
14872 			break;
14873 		}
14874 		vm_object_lock(tmp_obj);
14875 		vm_object_unlock(obj);
14876 
14877 		if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
14878 			ref_count--;
14879 		}
14880 
14881 		extended->ref_count += ref_count;
14882 		obj = tmp_obj;
14883 	}
14884 	vm_object_unlock(obj);
14885 
14886 	if (extended->share_mode == SM_SHARED) {
14887 		vm_map_entry_t       cur;
14888 		vm_map_entry_t       last;
14889 		int      my_refs;
14890 
14891 		obj = VME_OBJECT(entry);
14892 		last = vm_map_to_entry(map);
14893 		my_refs = 0;
14894 
14895 		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
14896 			ref_count--;
14897 		}
14898 		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
14899 			my_refs += vm_map_region_count_obj_refs(cur, obj);
14900 		}
14901 
14902 		if (my_refs == ref_count) {
14903 			extended->share_mode = SM_PRIVATE_ALIASED;
14904 		} else if (my_refs > 1) {
14905 			extended->share_mode = SM_SHARED_ALIASED;
14906 		}
14907 	}
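	/*
	 * Illustrative summary of the aliasing check above: if every
	 * reference to the object is held by entries in this same map, the
	 * nominally shared object is private to this address space and is
	 * reported as SM_PRIVATE_ALIASED; if this map alone holds more than
	 * one reference, it is reported as SM_SHARED_ALIASED.
	 */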
14908 }
14909 
14910 
14911 /* object is locked on entry and locked on return */
14912 
14913 
14914 static void
14915 vm_map_region_look_for_page(
14916 	__unused vm_map_t               map,
14917 	__unused vm_map_offset_t        va,
14918 	vm_object_t                     object,
14919 	vm_object_offset_t              offset,
14920 	int                             max_refcnt,
14921 	unsigned short                  depth,
14922 	vm_region_extended_info_t       extended,
14923 	mach_msg_type_number_t count)
14924 {
14925 	vm_page_t       p;
14926 	vm_object_t     shadow;
14927 	int             ref_count;
14928 	vm_object_t     caller_object;
14929 
14930 	shadow = object->shadow;
14931 	caller_object = object;
14932 
14933 
14934 	while (TRUE) {
14935 		if (!(object->internal)) {
14936 			extended->external_pager = 1;
14937 		}
14938 
14939 		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
14940 			if (shadow && (max_refcnt == 1)) {
14941 				extended->pages_shared_now_private++;
14942 			}
14943 
14944 			if (!p->vmp_fictitious &&
14945 			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
14946 				extended->pages_dirtied++;
14947 			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
14948 				if (p->vmp_reusable || object->all_reusable) {
14949 					extended->pages_reusable++;
14950 				}
14951 			}
14952 
14953 			extended->pages_resident++;
14954 
14955 			if (object != caller_object) {
14956 				vm_object_unlock(object);
14957 			}
14958 
14959 			return;
14960 		}
14961 		if (object->internal &&
14962 		    object->alive &&
14963 		    !object->terminating &&
14964 		    object->pager_ready) {
14965 			if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
14966 			    == VM_EXTERNAL_STATE_EXISTS) {
14967 				/* the pager has that page */
14968 				extended->pages_swapped_out++;
14969 				if (object != caller_object) {
14970 					vm_object_unlock(object);
14971 				}
14972 				return;
14973 			}
14974 		}
14975 
14976 		if (shadow) {
14977 			vm_object_lock(shadow);
14978 
14979 			if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
14980 				ref_count--;
14981 			}
14982 
14983 			if (++depth > extended->shadow_depth) {
14984 				extended->shadow_depth = depth;
14985 			}
14986 
14987 			if (ref_count > max_refcnt) {
14988 				max_refcnt = ref_count;
14989 			}
14990 
14991 			if (object != caller_object) {
14992 				vm_object_unlock(object);
14993 			}
14994 
14995 			offset = offset + object->vo_shadow_offset;
14996 			object = shadow;
14997 			shadow = object->shadow;
14998 			continue;
14999 		}
15000 		if (object != caller_object) {
15001 			vm_object_unlock(object);
15002 		}
15003 		break;
15004 	}
15005 }
15006 
15007 static int
15008 vm_map_region_count_obj_refs(
15009 	vm_map_entry_t    entry,
15010 	vm_object_t       object)
15011 {
15012 	int ref_count;
15013 	vm_object_t chk_obj;
15014 	vm_object_t tmp_obj;
15015 
15016 	if (entry->is_sub_map || VME_OBJECT(entry) == VM_OBJECT_NULL) {
15017 		return 0;
15018 	}
15019 
15020 	ref_count = 0;
15021 	chk_obj = VME_OBJECT(entry);
15022 	vm_object_lock(chk_obj);
15023 
15024 	while (chk_obj) {
15025 		if (chk_obj == object) {
15026 			ref_count++;
15027 		}
15028 		tmp_obj = chk_obj->shadow;
15029 		if (tmp_obj) {
15030 			vm_object_lock(tmp_obj);
15031 		}
15032 		vm_object_unlock(chk_obj);
15033 
15034 		chk_obj = tmp_obj;
15035 	}
15036 
15037 	return ref_count;
15038 }
15039 
15040 
15041 /*
15042  *	Routine:	vm_map_simplify
15043  *
15044  *	Description:
15045  *		Attempt to simplify the map representation in
15046  *		the vicinity of the given starting address.
15047  *	Note:
15048  *		This routine is intended primarily to keep the
15049  *		kernel maps more compact -- they generally don't
15050  *		benefit from the "expand a map entry" technology
15051  *		at allocation time because the adjacent entry
15052  *		is often wired down.
15053  */
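/*
 * Illustrative example (hypothetical addresses): two adjacent entries
 * [0x1000, 0x3000) and [0x3000, 0x5000) referencing the same object at
 * contiguous offsets, with identical protections and attributes, are
 * coalesced by vm_map_simplify_entry() into a single [0x1000, 0x5000)
 * entry; the now-redundant previous entry is unlinked and disposed of.
 */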
15054 void
15055 vm_map_simplify_entry(
15056 	vm_map_t        map,
15057 	vm_map_entry_t  this_entry)
15058 {
15059 	vm_map_entry_t  prev_entry;
15060 
15061 	prev_entry = this_entry->vme_prev;
15062 
15063 	if ((this_entry != vm_map_to_entry(map)) &&
15064 	    (prev_entry != vm_map_to_entry(map)) &&
15065 
15066 	    (prev_entry->vme_end == this_entry->vme_start) &&
15067 
15068 	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
15069 	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
15070 	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
15071 	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
15072 	    prev_entry->vme_start))
15073 	    == VME_OFFSET(this_entry)) &&
15074 
15075 	    (prev_entry->behavior == this_entry->behavior) &&
15076 	    (prev_entry->needs_copy == this_entry->needs_copy) &&
15077 	    (prev_entry->protection == this_entry->protection) &&
15078 	    (prev_entry->max_protection == this_entry->max_protection) &&
15079 	    (prev_entry->inheritance == this_entry->inheritance) &&
15080 	    (prev_entry->use_pmap == this_entry->use_pmap) &&
15081 	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
15082 	    (prev_entry->no_cache == this_entry->no_cache) &&
15083 	    (prev_entry->permanent == this_entry->permanent) &&
15084 	    (prev_entry->map_aligned == this_entry->map_aligned) &&
15085 	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
15086 	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
15087 	    (prev_entry->pmap_cs_associated == this_entry->pmap_cs_associated) &&
15088 	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
15089 	    (prev_entry->vme_resilient_codesign ==
15090 	    this_entry->vme_resilient_codesign) &&
15091 	    (prev_entry->vme_resilient_media ==
15092 	    this_entry->vme_resilient_media) &&
15093 	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
15094 
15095 	    (prev_entry->wired_count == this_entry->wired_count) &&
15096 	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&
15097 
15098 	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
15099 	    (prev_entry->in_transition == FALSE) &&
15100 	    (this_entry->in_transition == FALSE) &&
15101 	    (prev_entry->needs_wakeup == FALSE) &&
15102 	    (this_entry->needs_wakeup == FALSE) &&
15103 	    (prev_entry->is_shared == this_entry->is_shared) &&
15104 	    (prev_entry->superpage_size == FALSE) &&
15105 	    (this_entry->superpage_size == FALSE)
15106 	    ) {
15107 		vm_map_store_entry_unlink(map, prev_entry);
15108 		assert(prev_entry->vme_start < this_entry->vme_end);
15109 		if (prev_entry->map_aligned) {
15110 			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
15111 			    VM_MAP_PAGE_MASK(map)));
15112 		}
15113 		this_entry->vme_start = prev_entry->vme_start;
15114 		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));
15115 
15116 		if (map->holelistenabled) {
15117 			vm_map_store_update_first_free(map, this_entry, TRUE);
15118 		}
15119 
15120 		if (prev_entry->is_sub_map) {
15121 			vm_map_deallocate(VME_SUBMAP(prev_entry));
15122 		} else {
15123 			vm_object_deallocate(VME_OBJECT(prev_entry));
15124 		}
15125 		vm_map_entry_dispose(prev_entry);
15126 		SAVE_HINT_MAP_WRITE(map, this_entry);
15127 	}
15128 }
15129 
15130 void
15131 vm_map_simplify(
15132 	vm_map_t        map,
15133 	vm_map_offset_t start)
15134 {
15135 	vm_map_entry_t  this_entry;
15136 
15137 	vm_map_lock(map);
15138 	if (vm_map_lookup_entry(map, start, &this_entry)) {
15139 		vm_map_simplify_entry(map, this_entry);
15140 		vm_map_simplify_entry(map, this_entry->vme_next);
15141 	}
15142 	vm_map_unlock(map);
15143 }
15144 
15145 static void
15146 vm_map_simplify_range(
15147 	vm_map_t        map,
15148 	vm_map_offset_t start,
15149 	vm_map_offset_t end)
15150 {
15151 	vm_map_entry_t  entry;
15152 
15153 	/*
15154 	 * The map should be locked (for "write") by the caller.
15155 	 */
15156 
15157 	if (start >= end) {
15158 		/* invalid address range */
15159 		return;
15160 	}
15161 
15162 	start = vm_map_trunc_page(start,
15163 	    VM_MAP_PAGE_MASK(map));
15164 	end = vm_map_round_page(end,
15165 	    VM_MAP_PAGE_MASK(map));
15166 
15167 	if (!vm_map_lookup_entry(map, start, &entry)) {
15168 		/* "start" is not mapped and "entry" ends before "start" */
15169 		if (entry == vm_map_to_entry(map)) {
15170 			/* start with first entry in the map */
15171 			entry = vm_map_first_entry(map);
15172 		} else {
15173 			/* start with next entry */
15174 			entry = entry->vme_next;
15175 		}
15176 	}
15177 
15178 	while (entry != vm_map_to_entry(map) &&
15179 	    entry->vme_start <= end) {
15180 		/* try to coalesce "entry" with its previous entry */
15181 		vm_map_simplify_entry(map, entry);
15182 		entry = entry->vme_next;
15183 	}
15184 }
15185 
15186 
15187 /*
15188  *	Routine:	vm_map_machine_attribute
15189  *	Purpose:
15190  *		Provide machine-specific attributes to mappings,
15191  *		such as cacheability etc. for machines that provide
15192  *		them.  NUMA architectures and machines with big/strange
15193  *		caches will use this.
15194  *	Note:
15195  *		Responsibilities for locking and checking are handled here,
15196  *		everything else in the pmap module. If any non-volatile
15197  *		information must be kept, the pmap module should handle
15198  *		it itself. [This assumes that attributes do not
15199  *		need to be inherited, which seems ok to me]
15200  */
15201 kern_return_t
15202 vm_map_machine_attribute(
15203 	vm_map_t                        map,
15204 	vm_map_offset_t         start,
15205 	vm_map_offset_t         end,
15206 	vm_machine_attribute_t  attribute,
15207 	vm_machine_attribute_val_t* value)              /* IN/OUT */
15208 {
15209 	kern_return_t   ret;
15210 	vm_map_size_t sync_size;
15211 	vm_map_entry_t entry;
15212 
15213 	if (start < vm_map_min(map) || end > vm_map_max(map)) {
15214 		return KERN_INVALID_ADDRESS;
15215 	}
15216 
15217 	/* Figure how much memory we need to flush (in page increments) */
15218 	sync_size = end - start;
15219 
15220 	vm_map_lock(map);
15221 
15222 	if (attribute != MATTR_CACHE) {
15223 		/* If we don't have to find physical addresses, we */
15224 		/* don't have to do an explicit traversal here.    */
15225 		ret = pmap_attribute(map->pmap, start, end - start,
15226 		    attribute, value);
15227 		vm_map_unlock(map);
15228 		return ret;
15229 	}
15230 
15231 	ret = KERN_SUCCESS;                                                                             /* Assume it all worked */
15232 
15233 	while (sync_size) {
15234 		if (vm_map_lookup_entry(map, start, &entry)) {
15235 			vm_map_size_t   sub_size;
15236 			if ((entry->vme_end - start) > sync_size) {
15237 				sub_size = sync_size;
15238 				sync_size = 0;
15239 			} else {
15240 				sub_size = entry->vme_end - start;
15241 				sync_size -= sub_size;
15242 			}
15243 			if (entry->is_sub_map) {
15244 				vm_map_offset_t sub_start;
15245 				vm_map_offset_t sub_end;
15246 
15247 				sub_start = (start - entry->vme_start)
15248 				    + VME_OFFSET(entry);
15249 				sub_end = sub_start + sub_size;
15250 				vm_map_machine_attribute(
15251 					VME_SUBMAP(entry),
15252 					sub_start,
15253 					sub_end,
15254 					attribute, value);
15255 			} else if (VME_OBJECT(entry)) {
15256 				vm_page_t               m;
15257 				vm_object_t             object;
15258 				vm_object_t             base_object;
15259 				vm_object_t             last_object;
15260 				vm_object_offset_t      offset;
15261 				vm_object_offset_t      base_offset;
15262 				vm_map_size_t           range;
15263 				range = sub_size;
15264 				offset = (start - entry->vme_start)
15265 				    + VME_OFFSET(entry);
15266 				offset = vm_object_trunc_page(offset);
15267 				base_offset = offset;
15268 				object = VME_OBJECT(entry);
15269 				base_object = object;
15270 				last_object = NULL;
15271 
15272 				vm_object_lock(object);
15273 
15274 				while (range) {
15275 					m = vm_page_lookup(
15276 						object, offset);
15277 
15278 					if (m && !m->vmp_fictitious) {
15279 						ret =
15280 						    pmap_attribute_cache_sync(
15281 							VM_PAGE_GET_PHYS_PAGE(m),
15282 							PAGE_SIZE,
15283 							attribute, value);
15284 					} else if (object->shadow) {
15285 						offset = offset + object->vo_shadow_offset;
15286 						last_object = object;
15287 						object = object->shadow;
15288 						vm_object_lock(last_object->shadow);
15289 						vm_object_unlock(last_object);
15290 						continue;
15291 					}
15292 					if (range < PAGE_SIZE) {
15293 						range = 0;
15294 					} else {
15295 						range -= PAGE_SIZE;
15296 					}
15297 
15298 					if (base_object != object) {
15299 						vm_object_unlock(object);
15300 						vm_object_lock(base_object);
15301 						object = base_object;
15302 					}
15303 					/* Bump to the next page */
15304 					base_offset += PAGE_SIZE;
15305 					offset = base_offset;
15306 				}
15307 				vm_object_unlock(object);
15308 			}
15309 			start += sub_size;
15310 		} else {
15311 			vm_map_unlock(map);
15312 			return KERN_FAILURE;
15313 		}
15314 	}
15315 
15316 	vm_map_unlock(map);
15317 
15318 	return ret;
15319 }
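
/*
 * [Illustrative sketch added in editing; not part of the original source.]
 * The routine above backs the Mach vm_machine_attribute() call.  A user-space
 * caller might flush the data cache for a range roughly as follows; this is a
 * hedged example and assumes the standard <mach/vm_map.h> prototype and the
 * MATTR_* constants from <mach/vm_attributes.h>.
 */
#if 0 /* illustrative only -- user-space code, never compiled here */
#include <mach/mach.h>
#include <mach/vm_attributes.h>
#include <stdio.h>

static kern_return_t
flush_cache_for_range(vm_address_t addr, vm_size_t size)
{
	vm_machine_attribute_val_t value = MATTR_VAL_CACHE_FLUSH;

	/* Ask the kernel to flush the cache for [addr, addr + size). */
	kern_return_t kr = vm_machine_attribute(mach_task_self(),
	    addr, size, MATTR_CACHE, &value);
	if (kr != KERN_SUCCESS) {
		fprintf(stderr, "vm_machine_attribute: %s\n", mach_error_string(kr));
	}
	return kr;
}
#endif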
15320 
15321 /*
15322  *	vm_map_behavior_set:
15323  *
15324  *	Sets the paging reference behavior of the specified address
15325  *	range in the target map.  Paging reference behavior affects
15326  *	how pagein operations resulting from faults on the map will be
15327  *	clustered.
15328  */
15329 kern_return_t
15330 vm_map_behavior_set(
15331 	vm_map_t        map,
15332 	vm_map_offset_t start,
15333 	vm_map_offset_t end,
15334 	vm_behavior_t   new_behavior)
15335 {
15336 	vm_map_entry_t  entry;
15337 	vm_map_entry_t  temp_entry;
15338 
15339 	if (start > end ||
15340 	    start < vm_map_min(map) ||
15341 	    end > vm_map_max(map)) {
15342 		return KERN_NO_SPACE;
15343 	}
15344 
15345 	switch (new_behavior) {
15346 	/*
15347 	 * This first block of behaviors all set a persistent state on the specified
15348 	 * memory range.  All we have to do here is to record the desired behavior
15349 	 * in the vm_map_entry_t's.
15350 	 */
15351 
15352 	case VM_BEHAVIOR_DEFAULT:
15353 	case VM_BEHAVIOR_RANDOM:
15354 	case VM_BEHAVIOR_SEQUENTIAL:
15355 	case VM_BEHAVIOR_RSEQNTL:
15356 	case VM_BEHAVIOR_ZERO_WIRED_PAGES:
15357 		vm_map_lock(map);
15358 
15359 		/*
15360 		 *	The entire address range must be valid for the map.
15361 		 *      Note that vm_map_range_check() does a
15362 		 *	vm_map_lookup_entry() internally and returns the
15363 		 *	entry containing the start of the address range if
15364 		 *	the entire range is valid.
15365 		 */
15366 		if (vm_map_range_check(map, start, end, &temp_entry)) {
15367 			entry = temp_entry;
15368 			vm_map_clip_start(map, entry, start);
15369 		} else {
15370 			vm_map_unlock(map);
15371 			return KERN_INVALID_ADDRESS;
15372 		}
15373 
15374 		while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
15375 			vm_map_clip_end(map, entry, end);
15376 			if (entry->is_sub_map) {
15377 				assert(!entry->use_pmap);
15378 			}
15379 
15380 			if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
15381 				entry->zero_wired_pages = TRUE;
15382 			} else {
15383 				entry->behavior = new_behavior;
15384 			}
15385 			entry = entry->vme_next;
15386 		}
15387 
15388 		vm_map_unlock(map);
15389 		break;
15390 
15391 	/*
15392 	 * The rest of these are different from the above in that they cause
15393 	 * an immediate action to take place as opposed to setting a behavior that
15394 	 * affects future actions.
15395 	 */
15396 
15397 	case VM_BEHAVIOR_WILLNEED:
15398 		return vm_map_willneed(map, start, end);
15399 
15400 	case VM_BEHAVIOR_DONTNEED:
15401 		return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
15402 
15403 	case VM_BEHAVIOR_FREE:
15404 		return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
15405 
15406 	case VM_BEHAVIOR_REUSABLE:
15407 		return vm_map_reusable_pages(map, start, end);
15408 
15409 	case VM_BEHAVIOR_REUSE:
15410 		return vm_map_reuse_pages(map, start, end);
15411 
15412 	case VM_BEHAVIOR_CAN_REUSE:
15413 		return vm_map_can_reuse(map, start, end);
15414 
15415 #if MACH_ASSERT
15416 	case VM_BEHAVIOR_PAGEOUT:
15417 		return vm_map_pageout(map, start, end);
15418 #endif /* MACH_ASSERT */
15419 
15420 	default:
15421 		return KERN_INVALID_ARGUMENT;
15422 	}
15423 
15424 	return KERN_SUCCESS;
15425 }
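
/*
 * [Illustrative sketch added in editing; not part of the original source.]
 * vm_map_behavior_set() is reachable from user space through the Mach
 * vm_behavior_set() call (and indirectly through madvise()).  A hedged
 * user-space example, assuming the <mach/vm_map.h> prototype and the
 * VM_BEHAVIOR_* constants from <mach/vm_behavior.h>:
 */
#if 0 /* illustrative only -- user-space code, never compiled here */
#include <mach/mach.h>
#include <mach/vm_behavior.h>

static kern_return_t
hint_sequential_access(vm_address_t addr, vm_size_t size)
{
	/*
	 * Tell the VM system that [addr, addr + size) will be read
	 * sequentially, so page-in clustering can read ahead aggressively.
	 */
	return vm_behavior_set(mach_task_self(), addr, size,
	    VM_BEHAVIOR_SEQUENTIAL);
}
#endif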
15426 
15427 
15428 /*
15429  * Internals for madvise(MADV_WILLNEED) system call.
15430  *
15431  * The implementation is to:
15432  * a) read-ahead if the mapping corresponds to a mapped regular file
15433  * b) or, fault in the pages (zero-fill, decompress, etc.) if it's an anonymous mapping
15434  */
15435 
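/*
 * [Illustrative sketch added in editing; not part of the original source.]
 * From user space this path is reached with madvise(2).  A hedged example of
 * prefetching a file mapping before use, assuming the POSIX madvise()
 * interface from <sys/mman.h>:
 */
#if 0 /* illustrative only -- user-space code, never compiled here */
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

static void *
map_and_prefetch(const char *path, size_t len)
{
	int fd = open(path, O_RDONLY);
	if (fd < 0) {
		return NULL;
	}
	void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
	close(fd);
	if (p == MAP_FAILED) {
		return NULL;
	}
	/* Start read-ahead (or fault-in for anonymous memory) right away. */
	(void)madvise(p, len, MADV_WILLNEED);
	return p;
}
#endif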
15436 
15437 static kern_return_t
15438 vm_map_willneed(
15439 	vm_map_t        map,
15440 	vm_map_offset_t start,
15441 	vm_map_offset_t end
15442 	)
15443 {
15444 	vm_map_entry_t                  entry;
15445 	vm_object_t                     object;
15446 	memory_object_t                 pager;
15447 	struct vm_object_fault_info     fault_info = {};
15448 	kern_return_t                   kr;
15449 	vm_object_size_t                len;
15450 	vm_object_offset_t              offset;
15451 
15452 	fault_info.interruptible = THREAD_UNINT;        /* ignored value */
15453 	fault_info.behavior      = VM_BEHAVIOR_SEQUENTIAL;
15454 	fault_info.stealth       = TRUE;
15455 
15456 	/*
15457 	 * The MADV_WILLNEED operation doesn't require any changes to the
15458 	 * vm_map_entry_t's, so the read lock is sufficient.
15459 	 */
15460 
15461 	vm_map_lock_read(map);
15462 
15463 	/*
15464 	 * The madvise semantics require that the address range be fully
15465 	 * allocated with no holes.  Otherwise, we're required to return
15466 	 * an error.
15467 	 */
15468 
15469 	if (!vm_map_range_check(map, start, end, &entry)) {
15470 		vm_map_unlock_read(map);
15471 		return KERN_INVALID_ADDRESS;
15472 	}
15473 
15474 	/*
15475 	 * Examine each vm_map_entry_t in the range.
15476 	 */
15477 	for (; entry != vm_map_to_entry(map) && start < end;) {
15478 		/*
15479 		 * The first time through, the start address could be anywhere
15480 		 * within the vm_map_entry we found.  So adjust the offset to
15481 		 * correspond.  After that, the offset will always be zero to
15482 		 * correspond to the beginning of the current vm_map_entry.
15483 		 */
15484 		offset = (start - entry->vme_start) + VME_OFFSET(entry);
15485 
15486 		/*
15487 		 * Set the length so we don't go beyond the end of the
15488 		 * map_entry or beyond the end of the range we were given.
15489 		 * This range could also span multiple map entries, all of which
15490 		 * map different files, so make sure we only do the right amount
15491 		 * of I/O for each object.  Note that it's possible for there
15492 		 * to be multiple map entries all referring to the same object
15493 		 * but with different page permissions, but it's not worth
15494 		 * trying to optimize that case.
15495 		 */
15496 		len = MIN(entry->vme_end - start, end - start);
15497 
15498 		if ((vm_size_t) len != len) {
15499 			/* 32-bit overflow */
15500 			len = (vm_size_t) (0 - PAGE_SIZE);
15501 		}
15502 		fault_info.cluster_size = (vm_size_t) len;
15503 		fault_info.lo_offset    = offset;
15504 		fault_info.hi_offset    = offset + len;
15505 		fault_info.user_tag     = VME_ALIAS(entry);
15506 		fault_info.pmap_options = 0;
15507 		if (entry->iokit_acct ||
15508 		    (!entry->is_sub_map && !entry->use_pmap)) {
15509 			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
15510 		}
15511 
15512 		/*
15513 		 * If the entry is a submap OR there's no read permission
15514 		 * to this mapping, then just skip it.
15515 		 */
15516 		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
15517 			entry = entry->vme_next;
15518 			start = entry->vme_start;
15519 			continue;
15520 		}
15521 
15522 		object = VME_OBJECT(entry);
15523 
15524 		if (object == NULL ||
15525 		    (object && object->internal)) {
15526 			/*
15527 			 * Memory range backed by anonymous memory.
15528 			 */
15529 			vm_size_t region_size = 0, effective_page_size = 0;
15530 			vm_map_offset_t addr = 0, effective_page_mask = 0;
15531 
15532 			region_size = len;
15533 			addr = start;
15534 
15535 			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
15536 			effective_page_size = effective_page_mask + 1;
15537 
15538 			vm_map_unlock_read(map);
15539 
15540 			while (region_size) {
15541 				vm_pre_fault(
15542 					vm_map_trunc_page(addr, effective_page_mask),
15543 					VM_PROT_READ | VM_PROT_WRITE);
15544 
15545 				region_size -= effective_page_size;
15546 				addr += effective_page_size;
15547 			}
15548 		} else {
15549 			/*
15550 			 * Find the file object backing this map entry.  If there is
15551 			 * none, then we simply ignore the "will need" advice for this
15552 			 * entry and go on to the next one.
15553 			 */
15554 			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
15555 				entry = entry->vme_next;
15556 				start = entry->vme_start;
15557 				continue;
15558 			}
15559 
15560 			vm_object_paging_begin(object);
15561 			pager = object->pager;
15562 			vm_object_unlock(object);
15563 
15564 			/*
15565 			 * The data_request() could take a long time, so let's
15566 			 * release the map lock to avoid blocking other threads.
15567 			 */
15568 			vm_map_unlock_read(map);
15569 
15570 			/*
15571 			 * Get the data from the object asynchronously.
15572 			 *
15573 			 * Note that memory_object_data_request() places limits on the
15574 			 * amount of I/O it will do.  Regardless of the len we
15575 			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
15576 			 * silently truncates the len to that size.  This isn't
15577 			 * necessarily bad since madvise shouldn't really be used to
15578 			 * page in unlimited amounts of data.  Other Unix variants
15579 			 * limit the willneed case as well.  If this turns out to be an
15580 			 * issue for developers, then we can always adjust the policy
15581 			 * here and still be backwards compatible since this is all
15582 			 * just "advice".
15583 			 */
15584 			kr = memory_object_data_request(
15585 				pager,
15586 				vm_object_trunc_page(offset) + object->paging_offset,
15587 				0,      /* ignored */
15588 				VM_PROT_READ,
15589 				(memory_object_fault_info_t)&fault_info);
15590 
15591 			vm_object_lock(object);
15592 			vm_object_paging_end(object);
15593 			vm_object_unlock(object);
15594 
15595 			/*
15596 			 * If we couldn't do the I/O for some reason, just give up on
15597 			 * the madvise.  We still return success to the user since
15598 			 * madvise isn't supposed to fail when the advice can't be
15599 			 * taken.
15600 			 */
15601 
15602 			if (kr != KERN_SUCCESS) {
15603 				return KERN_SUCCESS;
15604 			}
15605 		}
15606 
15607 		start += len;
15608 		if (start >= end) {
15609 			/* done */
15610 			return KERN_SUCCESS;
15611 		}
15612 
15613 		/* look up next entry */
15614 		vm_map_lock_read(map);
15615 		if (!vm_map_lookup_entry(map, start, &entry)) {
15616 			/*
15617 			 * There's a new hole in the address range.
15618 			 */
15619 			vm_map_unlock_read(map);
15620 			return KERN_INVALID_ADDRESS;
15621 		}
15622 	}
15623 
15624 	vm_map_unlock_read(map);
15625 	return KERN_SUCCESS;
15626 }
15627 
15628 static boolean_t
15629 vm_map_entry_is_reusable(
15630 	vm_map_entry_t entry)
15631 {
15632 	/* Only user map entries */
15633 
15634 	vm_object_t object;
15635 
15636 	if (entry->is_sub_map) {
15637 		return FALSE;
15638 	}
15639 
15640 	switch (VME_ALIAS(entry)) {
15641 	case VM_MEMORY_MALLOC:
15642 	case VM_MEMORY_MALLOC_SMALL:
15643 	case VM_MEMORY_MALLOC_LARGE:
15644 	case VM_MEMORY_REALLOC:
15645 	case VM_MEMORY_MALLOC_TINY:
15646 	case VM_MEMORY_MALLOC_LARGE_REUSABLE:
15647 	case VM_MEMORY_MALLOC_LARGE_REUSED:
15648 		/*
15649 		 * This is a malloc() memory region: check if it's still
15650 		 * in its original state and can be re-used for more
15651 		 * malloc() allocations.
15652 		 */
15653 		break;
15654 	default:
15655 		/*
15656 		 * Not a malloc() memory region: let the caller decide if
15657 		 * it's re-usable.
15658 		 */
15659 		return TRUE;
15660 	}
15661 
15662 	if (/*entry->is_shared ||*/
15663 		entry->is_sub_map ||
15664 		entry->in_transition ||
15665 		entry->protection != VM_PROT_DEFAULT ||
15666 		entry->max_protection != VM_PROT_ALL ||
15667 		entry->inheritance != VM_INHERIT_DEFAULT ||
15668 		entry->no_cache ||
15669 		entry->permanent ||
15670 		entry->superpage_size != FALSE ||
15671 		entry->zero_wired_pages ||
15672 		entry->wired_count != 0 ||
15673 		entry->user_wired_count != 0) {
15674 		return FALSE;
15675 	}
15676 
15677 	object = VME_OBJECT(entry);
15678 	if (object == VM_OBJECT_NULL) {
15679 		return TRUE;
15680 	}
15681 	if (
15682 #if 0
15683 		/*
15684 		 * Let's proceed even if the VM object is potentially
15685 		 * shared.
15686 		 * We check for this later when processing the actual
15687 		 * VM pages, so the contents will be safe if shared.
15688 		 *
15689 		 * But we can still mark this memory region as "reusable" to
15690 		 * acknowledge that the caller did let us know that the memory
15691 		 * could be re-used and should not be penalized for holding
15692 		 * on to it.  This allows its "resident size" to not include
15693 		 * the reusable range.
15694 		 */
15695 		object->ref_count == 1 &&
15696 #endif
15697 		object->wired_page_count == 0 &&
15698 		object->copy == VM_OBJECT_NULL &&
15699 		object->shadow == VM_OBJECT_NULL &&
15700 		object->internal &&
15701 		object->purgable == VM_PURGABLE_DENY &&
15702 		object->wimg_bits == VM_WIMG_USE_DEFAULT &&
15703 		!object->code_signed) {
15704 		return TRUE;
15705 	}
15706 	return FALSE;
15707 }
15708 
15709 static kern_return_t
15710 vm_map_reuse_pages(
15711 	vm_map_t        map,
15712 	vm_map_offset_t start,
15713 	vm_map_offset_t end)
15714 {
15715 	vm_map_entry_t                  entry;
15716 	vm_object_t                     object;
15717 	vm_object_offset_t              start_offset, end_offset;
15718 
15719 	/*
15720 	 * The MADV_REUSE operation doesn't require any changes to the
15721 	 * vm_map_entry_t's, so the read lock is sufficient.
15722 	 */
15723 
15724 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
15725 		/*
15726 		 * XXX TODO4K
15727 		 * need to figure out what reusable means for a
15728 		 * portion of a native page.
15729 		 */
15730 		return KERN_SUCCESS;
15731 	}
15732 
15733 	vm_map_lock_read(map);
15734 	assert(map->pmap != kernel_pmap);       /* protect alias access */
15735 
15736 	/*
15737 	 * The madvise semantics require that the address range be fully
15738 	 * allocated with no holes.  Otherwise, we're required to return
15739 	 * an error.
15740 	 */
15741 
15742 	if (!vm_map_range_check(map, start, end, &entry)) {
15743 		vm_map_unlock_read(map);
15744 		vm_page_stats_reusable.reuse_pages_failure++;
15745 		return KERN_INVALID_ADDRESS;
15746 	}
15747 
15748 	/*
15749 	 * Examine each vm_map_entry_t in the range.
15750 	 */
15751 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
15752 	    entry = entry->vme_next) {
15753 		/*
15754 		 * Sanity check on the VM map entry.
15755 		 */
15756 		if (!vm_map_entry_is_reusable(entry)) {
15757 			vm_map_unlock_read(map);
15758 			vm_page_stats_reusable.reuse_pages_failure++;
15759 			return KERN_INVALID_ADDRESS;
15760 		}
15761 
15762 		/*
15763 		 * The first time through, the start address could be anywhere
15764 		 * within the vm_map_entry we found.  So adjust the offset to
15765 		 * correspond.
15766 		 */
15767 		if (entry->vme_start < start) {
15768 			start_offset = start - entry->vme_start;
15769 		} else {
15770 			start_offset = 0;
15771 		}
15772 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
15773 		start_offset += VME_OFFSET(entry);
15774 		end_offset += VME_OFFSET(entry);
15775 
15776 		object = VME_OBJECT(entry);
15777 		if (object != VM_OBJECT_NULL) {
15778 			vm_object_lock(object);
15779 			vm_object_reuse_pages(object, start_offset, end_offset,
15780 			    TRUE);
15781 			vm_object_unlock(object);
15782 		}
15783 
15784 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
15785 			/*
15786 			 * XXX
15787 			 * We do not hold the VM map exclusively here.
15788 			 * The "alias" field is not that critical, so it's
15789 			 * safe to update it here, as long as it is the only
15790 			 * one that can be modified while holding the VM map
15791 			 * "shared".
15792 			 */
15793 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
15794 		}
15795 	}
15796 
15797 	vm_map_unlock_read(map);
15798 	vm_page_stats_reusable.reuse_pages_success++;
15799 	return KERN_SUCCESS;
15800 }
15801 
15802 
15803 static kern_return_t
15804 vm_map_reusable_pages(
15805 	vm_map_t        map,
15806 	vm_map_offset_t start,
15807 	vm_map_offset_t end)
15808 {
15809 	vm_map_entry_t                  entry;
15810 	vm_object_t                     object;
15811 	vm_object_offset_t              start_offset, end_offset;
15812 	vm_map_offset_t                 pmap_offset;
15813 
15814 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
15815 		/*
15816 		 * XXX TODO4K
15817 		 * need to figure out what reusable means for a portion
15818 		 * of a native page.
15819 		 */
15820 		return KERN_SUCCESS;
15821 	}
15822 
15823 	/*
15824 	 * The MADV_REUSABLE operation doesn't require any changes to the
15825 	 * vm_map_entry_t's, so the read lock is sufficient.
15826 	 */
15827 
15828 	vm_map_lock_read(map);
15829 	assert(map->pmap != kernel_pmap);       /* protect alias access */
15830 
15831 	/*
15832 	 * The madvise semantics require that the address range be fully
15833 	 * allocated with no holes.  Otherwise, we're required to return
15834 	 * an error.
15835 	 */
15836 
15837 	if (!vm_map_range_check(map, start, end, &entry)) {
15838 		vm_map_unlock_read(map);
15839 		vm_page_stats_reusable.reusable_pages_failure++;
15840 		return KERN_INVALID_ADDRESS;
15841 	}
15842 
15843 	/*
15844 	 * Examine each vm_map_entry_t in the range.
15845 	 */
15846 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
15847 	    entry = entry->vme_next) {
15848 		int kill_pages = 0;
15849 
15850 		/*
15851 		 * Sanity check on the VM map entry.
15852 		 */
15853 		if (!vm_map_entry_is_reusable(entry)) {
15854 			vm_map_unlock_read(map);
15855 			vm_page_stats_reusable.reusable_pages_failure++;
15856 			return KERN_INVALID_ADDRESS;
15857 		}
15858 
15859 		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit) {
15860 			/* not writable: can't discard contents */
15861 			vm_map_unlock_read(map);
15862 			vm_page_stats_reusable.reusable_nonwritable++;
15863 			vm_page_stats_reusable.reusable_pages_failure++;
15864 			return KERN_PROTECTION_FAILURE;
15865 		}
15866 
15867 		/*
15868 		 * The first time through, the start address could be anywhere
15869 		 * within the vm_map_entry we found.  So adjust the offset to
15870 		 * correspond.
15871 		 */
15872 		if (entry->vme_start < start) {
15873 			start_offset = start - entry->vme_start;
15874 			pmap_offset = start;
15875 		} else {
15876 			start_offset = 0;
15877 			pmap_offset = entry->vme_start;
15878 		}
15879 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
15880 		start_offset += VME_OFFSET(entry);
15881 		end_offset += VME_OFFSET(entry);
15882 
15883 		object = VME_OBJECT(entry);
15884 		if (object == VM_OBJECT_NULL) {
15885 			continue;
15886 		}
15887 
15888 
15889 		vm_object_lock(object);
15890 		if (((object->ref_count == 1) ||
15891 		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
15892 		    object->copy == VM_OBJECT_NULL)) &&
15893 		    object->shadow == VM_OBJECT_NULL &&
15894 		    /*
15895 		     * "iokit_acct" entries are billed for their virtual size
15896 		     * (rather than for their resident pages only), so they
15897 		     * wouldn't benefit from making pages reusable, and it
15898 		     * would be hard to keep track of pages that are both
15899 		     * "iokit_acct" and "reusable" in the pmap stats and
15900 		     * ledgers.
15901 		     */
15902 		    !(entry->iokit_acct ||
15903 		    (!entry->is_sub_map && !entry->use_pmap))) {
15904 			if (object->ref_count != 1) {
15905 				vm_page_stats_reusable.reusable_shared++;
15906 			}
15907 			kill_pages = 1;
15908 		} else {
15909 			kill_pages = -1;
15910 		}
15911 		if (kill_pages != -1) {
15912 			vm_object_deactivate_pages(object,
15913 			    start_offset,
15914 			    end_offset - start_offset,
15915 			    kill_pages,
15916 			    TRUE /*reusable_pages*/,
15917 			    map->pmap,
15918 			    pmap_offset);
15919 		} else {
15920 			vm_page_stats_reusable.reusable_pages_shared++;
15921 			DTRACE_VM4(vm_map_reusable_pages_shared,
15922 			    unsigned int, VME_ALIAS(entry),
15923 			    vm_map_t, map,
15924 			    vm_map_entry_t, entry,
15925 			    vm_object_t, object);
15926 		}
15927 		vm_object_unlock(object);
15928 
15929 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
15930 		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
15931 			/*
15932 			 * XXX
15933 			 * We do not hold the VM map exclusively here.
15934 			 * The "alias" field is not that critical, so it's
15935 			 * safe to update it here, as long as it is the only
15936 			 * one that can be modified while holding the VM map
15937 			 * "shared".
15938 			 */
15939 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
15940 		}
15941 	}
15942 
15943 	vm_map_unlock_read(map);
15944 	vm_page_stats_reusable.reusable_pages_success++;
15945 	return KERN_SUCCESS;
15946 }
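
/*
 * [Illustrative sketch added in editing; not part of the original source.]
 * vm_map_reuse_pages() and vm_map_reusable_pages() above implement the
 * madvise() reusable/reuse protocol that malloc-style allocators use on
 * Darwin: mark a free range "reusable" so its resident pages can be
 * discarded and stop being charged to the process, then mark it "reused"
 * before handing the memory out again.  A hedged user-space example,
 * assuming Darwin's MADV_FREE_REUSABLE / MADV_FREE_REUSE flags in
 * <sys/mman.h>:
 */
#if 0 /* illustrative only -- user-space code, never compiled here */
#include <sys/mman.h>

/* Called when an allocator retires a large, page-aligned free block. */
static void
allocator_retire_block(void *block, size_t size)
{
	/* Contents may be discarded; pages leave the memory footprint. */
	(void)madvise(block, size, MADV_FREE_REUSABLE);
}

/* Called when the same block is about to be handed out again. */
static void
allocator_reuse_block(void *block, size_t size)
{
	/* Re-account the pages before the caller touches them. */
	(void)madvise(block, size, MADV_FREE_REUSE);
}
#endif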
15947 
15948 
15949 static kern_return_t
15950 vm_map_can_reuse(
15951 	vm_map_t        map,
15952 	vm_map_offset_t start,
15953 	vm_map_offset_t end)
15954 {
15955 	vm_map_entry_t                  entry;
15956 
15957 	/*
15958 	 * The MADV_REUSABLE operation doesn't require any changes to the
15959 	 * vm_map_entry_t's, so the read lock is sufficient.
15960 	 */
15961 
15962 	vm_map_lock_read(map);
15963 	assert(map->pmap != kernel_pmap);       /* protect alias access */
15964 
15965 	/*
15966 	 * The madvise semantics require that the address range be fully
15967 	 * allocated with no holes.  Otherwise, we're required to return
15968 	 * an error.
15969 	 */
15970 
15971 	if (!vm_map_range_check(map, start, end, &entry)) {
15972 		vm_map_unlock_read(map);
15973 		vm_page_stats_reusable.can_reuse_failure++;
15974 		return KERN_INVALID_ADDRESS;
15975 	}
15976 
15977 	/*
15978 	 * Examine each vm_map_entry_t in the range.
15979 	 */
15980 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
15981 	    entry = entry->vme_next) {
15982 		/*
15983 		 * Sanity check on the VM map entry.
15984 		 */
15985 		if (!vm_map_entry_is_reusable(entry)) {
15986 			vm_map_unlock_read(map);
15987 			vm_page_stats_reusable.can_reuse_failure++;
15988 			return KERN_INVALID_ADDRESS;
15989 		}
15990 	}
15991 
15992 	vm_map_unlock_read(map);
15993 	vm_page_stats_reusable.can_reuse_success++;
15994 	return KERN_SUCCESS;
15995 }
15996 
15997 
15998 #if MACH_ASSERT
15999 static kern_return_t
16000 vm_map_pageout(
16001 	vm_map_t        map,
16002 	vm_map_offset_t start,
16003 	vm_map_offset_t end)
16004 {
16005 	vm_map_entry_t                  entry;
16006 
16007 	/*
16008 	 * The MADV_PAGEOUT operation doesn't require any changes to the
16009 	 * vm_map_entry_t's, so the read lock is sufficient.
16010 	 */
16011 
16012 	vm_map_lock_read(map);
16013 
16014 	/*
16015 	 * The madvise semantics require that the address range be fully
16016 	 * allocated with no holes.  Otherwise, we're required to return
16017 	 * an error.
16018 	 */
16019 
16020 	if (!vm_map_range_check(map, start, end, &entry)) {
16021 		vm_map_unlock_read(map);
16022 		return KERN_INVALID_ADDRESS;
16023 	}
16024 
16025 	/*
16026 	 * Examine each vm_map_entry_t in the range.
16027 	 */
16028 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16029 	    entry = entry->vme_next) {
16030 		vm_object_t     object;
16031 
16032 		/*
16033 		 * Sanity check on the VM map entry.
16034 		 */
16035 		if (entry->is_sub_map) {
16036 			vm_map_t submap;
16037 			vm_map_offset_t submap_start;
16038 			vm_map_offset_t submap_end;
16039 			vm_map_entry_t submap_entry;
16040 
16041 			submap = VME_SUBMAP(entry);
16042 			submap_start = VME_OFFSET(entry);
16043 			submap_end = submap_start + (entry->vme_end -
16044 			    entry->vme_start);
16045 
16046 			vm_map_lock_read(submap);
16047 
16048 			if (!vm_map_range_check(submap,
16049 			    submap_start,
16050 			    submap_end,
16051 			    &submap_entry)) {
16052 				vm_map_unlock_read(submap);
16053 				vm_map_unlock_read(map);
16054 				return KERN_INVALID_ADDRESS;
16055 			}
16056 
16057 			if (submap_entry->is_sub_map) {
16058 				vm_map_unlock_read(submap);
16059 				continue;
16060 			}
16061 
16062 			object = VME_OBJECT(submap_entry);
16063 			if (object == VM_OBJECT_NULL || !object->internal) {
16064 				vm_map_unlock_read(submap);
16065 				continue;
16066 			}
16067 
16068 			vm_object_pageout(object);
16069 
16070 			vm_map_unlock_read(submap);
16071 			submap = VM_MAP_NULL;
16072 			submap_entry = VM_MAP_ENTRY_NULL;
16073 			continue;
16074 		}
16075 
16076 		object = VME_OBJECT(entry);
16077 		if (object == VM_OBJECT_NULL || !object->internal) {
16078 			continue;
16079 		}
16080 
16081 		vm_object_pageout(object);
16082 	}
16083 
16084 	vm_map_unlock_read(map);
16085 	return KERN_SUCCESS;
16086 }
16087 #endif /* MACH_ASSERT */
16088 
16089 
16090 /*
16091  *	Routine:	vm_map_entry_insert
16092  *
16093  *	Description:	This routine inserts a new vm_entry in a locked map.
16094  */
16095 static vm_map_entry_t
16096 vm_map_entry_insert(
16097 	vm_map_t                map,
16098 	vm_map_entry_t          insp_entry,
16099 	vm_map_offset_t         start,
16100 	vm_map_offset_t         end,
16101 	vm_object_t             object,
16102 	vm_object_offset_t      offset,
16103 	vm_map_kernel_flags_t   vmk_flags,
16104 	boolean_t               needs_copy,
16105 	vm_prot_t               cur_protection,
16106 	vm_prot_t               max_protection,
16107 	vm_inherit_t            inheritance,
16108 	boolean_t               no_cache,
16109 	boolean_t               permanent,
16110 	unsigned int            superpage_size,
16111 	boolean_t               clear_map_aligned,
16112 	int                     alias)
16113 {
16114 	vm_map_entry_t  new_entry;
16115 	boolean_t map_aligned = FALSE;
16116 
16117 	assert(insp_entry != (vm_map_entry_t)0);
16118 	vm_map_lock_assert_exclusive(map);
16119 
16120 #if DEVELOPMENT || DEBUG
16121 	vm_object_offset_t      end_offset = 0;
16122 	assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
16123 #endif /* DEVELOPMENT || DEBUG */
16124 
16125 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
16126 		map_aligned = TRUE;
16127 	}
16128 	if (clear_map_aligned &&
16129 	    (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
16130 	    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
16131 		map_aligned = FALSE;
16132 	}
16133 	if (map_aligned) {
16134 		assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
16135 		assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
16136 	} else {
16137 		assert(page_aligned(start));
16138 		assert(page_aligned(end));
16139 	}
16140 	assert(start < end);
16141 
16142 	new_entry = vm_map_entry_create(map);
16143 
16144 	new_entry->vme_start = start;
16145 	new_entry->vme_end = end;
16146 
16147 	if (vmk_flags.vmkf_submap) {
16148 		new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
16149 		VME_SUBMAP_SET(new_entry, (vm_map_t)object);
16150 	} else {
16151 		VME_OBJECT_SET(new_entry, object, false, 0);
16152 	}
16153 	VME_OFFSET_SET(new_entry, offset);
16154 	VME_ALIAS_SET(new_entry, alias);
16155 
16156 	new_entry->map_aligned = map_aligned;
16157 	new_entry->needs_copy = needs_copy;
16158 	new_entry->inheritance = inheritance;
16159 	new_entry->protection = cur_protection;
16160 	new_entry->max_protection = max_protection;
16161 	/*
16162 	 * submap: "use_pmap" means "nested".
16163 	 * default: false.
16164 	 *
16165 	 * object: "use_pmap" means "use pmap accounting" for footprint.
16166 	 * default: true.
16167 	 */
16168 	new_entry->use_pmap = !vmk_flags.vmkf_submap;
16169 	new_entry->no_cache = no_cache;
16170 	new_entry->permanent = permanent;
16171 	new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
16172 	new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
16173 	new_entry->superpage_size = (superpage_size != 0);
16174 
16175 	if (vmk_flags.vmkf_map_jit) {
16176 		if (!(map->jit_entry_exists) ||
16177 		    VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
16178 			new_entry->used_for_jit = TRUE;
16179 			map->jit_entry_exists = TRUE;
16180 		}
16181 	}
16182 
16183 	/*
16184 	 *	Insert the new entry into the list.
16185 	 */
16186 
16187 	vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
16188 	map->size += end - start;
16189 
16190 	/*
16191 	 *	Update the free space hint and the lookup hint.
16192 	 */
16193 
16194 	SAVE_HINT_MAP_WRITE(map, new_entry);
16195 	return new_entry;
16196 }
16197 
16198 /*
16199  *	Routine:	vm_map_remap_extract
16200  *
16201  *	Description:	This routine returns a vm_entry list from a map.
16202  */
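
/*
 * [Illustrative sketch added in editing; not part of the original source.]
 * This routine does the work behind vm_remap()/mach_vm_remap().  In the
 * "legacy" mode described in the code below, the caller receives the
 * strictest current and maximum protections found across the extracted
 * range.  A hedged user-space example, assuming the <mach/mach_vm.h>
 * mach_vm_remap() prototype:
 */
#if 0 /* illustrative only -- user-space code, never compiled here */
#include <mach/mach.h>
#include <mach/mach_vm.h>

static kern_return_t
remap_shared_view(mach_vm_address_t src, mach_vm_size_t size,
    mach_vm_address_t *dst_out)
{
	vm_prot_t cur_prot, max_prot;   /* filled in by the kernel */
	mach_vm_address_t dst = 0;

	kern_return_t kr = mach_vm_remap(mach_task_self(), &dst, size, 0,
	    VM_FLAGS_ANYWHERE, mach_task_self(), src,
	    FALSE,                      /* share rather than copy */
	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT);
	if (kr == KERN_SUCCESS) {
		*dst_out = dst;         /* cur_prot/max_prot bound what the
		                         * caller may do with the new mapping */
	}
	return kr;
}
#endif
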
16203 static kern_return_t
16204 vm_map_remap_extract(
16205 	vm_map_t                map,
16206 	vm_map_offset_t         addr,
16207 	vm_map_size_t           size,
16208 	boolean_t               copy,
16209 	struct vm_map_header    *map_header,
16210 	vm_prot_t               *cur_protection,   /* IN/OUT */
16211 	vm_prot_t               *max_protection,   /* IN/OUT */
16212 	/* What, no behavior? */
16213 	vm_inherit_t            inheritance,
16214 	vm_map_kernel_flags_t   vmk_flags)
16215 {
16216 	kern_return_t           result;
16217 	vm_map_size_t           mapped_size;
16218 	vm_map_size_t           tmp_size;
16219 	vm_map_entry_t          src_entry;     /* result of last map lookup */
16220 	vm_map_entry_t          new_entry;
16221 	vm_object_offset_t      offset;
16222 	vm_map_offset_t         map_address;
16223 	vm_map_offset_t         src_start;     /* start of entry to map */
16224 	vm_map_offset_t         src_end;       /* end of region to be mapped */
16225 	vm_object_t             object;
16226 	vm_map_version_t        version;
16227 	boolean_t               src_needs_copy;
16228 	boolean_t               new_entry_needs_copy;
16229 	vm_map_entry_t          saved_src_entry;
16230 	boolean_t               src_entry_was_wired;
16231 	vm_prot_t               max_prot_for_prot_copy;
16232 	vm_map_offset_t         effective_page_mask;
16233 	boolean_t               pageable, same_map;
16234 	boolean_t               vm_remap_legacy;
16235 	vm_prot_t               required_cur_prot, required_max_prot;
16236 	vm_object_t             new_copy_object;     /* vm_object_copy_* result */
16237 	boolean_t               saved_used_for_jit;     /* Saved used_for_jit. */
16238 
16239 	pageable = vmk_flags.vmkf_copy_pageable;
16240 	same_map = vmk_flags.vmkf_copy_same_map;
16241 
16242 	effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
16243 
16244 	assert(map != VM_MAP_NULL);
16245 	assert(size != 0);
16246 	assert(size == vm_map_round_page(size, effective_page_mask));
16247 	assert(inheritance == VM_INHERIT_NONE ||
16248 	    inheritance == VM_INHERIT_COPY ||
16249 	    inheritance == VM_INHERIT_SHARE);
16250 	assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
16251 	assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
16252 	assert((*cur_protection & *max_protection) == *cur_protection);
16253 
16254 	/*
16255 	 *	Compute start and end of region.
16256 	 */
16257 	src_start = vm_map_trunc_page(addr, effective_page_mask);
16258 	src_end = vm_map_round_page(src_start + size, effective_page_mask);
16259 
16260 	/*
16261 	 *	Initialize map_header.
16262 	 */
16263 	map_header->links.next = CAST_TO_VM_MAP_ENTRY(&map_header->links);
16264 	map_header->links.prev = CAST_TO_VM_MAP_ENTRY(&map_header->links);
16265 	map_header->nentries = 0;
16266 	map_header->entries_pageable = pageable;
16267 //	map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
16268 	map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
16269 	map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
16270 
16271 	vm_map_store_init( map_header );
16272 
16273 	if (copy && vmk_flags.vmkf_remap_prot_copy) {
16274 		/*
16275 		 * Special case for vm_map_protect(VM_PROT_COPY):
16276 		 * we want to set the new mappings' max protection to the
16277 		 * specified *max_protection...
16278 		 */
16279 		max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
16280 		/* ... but we want to use the vm_remap() legacy mode */
16281 		*max_protection = VM_PROT_NONE;
16282 		*cur_protection = VM_PROT_NONE;
16283 	} else {
16284 		max_prot_for_prot_copy = VM_PROT_NONE;
16285 	}
16286 
16287 	if (*cur_protection == VM_PROT_NONE &&
16288 	    *max_protection == VM_PROT_NONE) {
16289 		/*
16290 		 * vm_remap() legacy mode:
16291 		 * Extract all memory regions in the specified range and
16292 		 * collect the strictest set of protections allowed on the
16293 		 * entire range, so the caller knows what they can do with
16294 		 * the remapped range.
16295 		 * We start with VM_PROT_ALL and we'll remove the protections
16296 		 * missing from each memory region.
16297 		 */
16298 		vm_remap_legacy = TRUE;
16299 		*cur_protection = VM_PROT_ALL;
16300 		*max_protection = VM_PROT_ALL;
16301 		required_cur_prot = VM_PROT_NONE;
16302 		required_max_prot = VM_PROT_NONE;
16303 	} else {
16304 		/*
16305 		 * vm_remap_new() mode:
16306 		 * Extract all memory regions in the specified range and
16307 		 * ensure that they have at least the protections specified
16308 		 * by the caller via *cur_protection and *max_protection.
16309 		 * The resulting mapping should have these protections.
16310 		 */
16311 		vm_remap_legacy = FALSE;
16312 		if (copy) {
16313 			required_cur_prot = VM_PROT_NONE;
16314 			required_max_prot = VM_PROT_READ;
16315 		} else {
16316 			required_cur_prot = *cur_protection;
16317 			required_max_prot = *max_protection;
16318 		}
16319 	}
16320 
16321 	map_address = 0;
16322 	mapped_size = 0;
16323 	result = KERN_SUCCESS;
16324 
16325 	/*
16326 	 *	The specified source virtual space might correspond to
16327 	 *	multiple map entries, need to loop on them.
16328 	 */
16329 	vm_map_lock(map);
16330 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16331 		/*
16332 		 * This address space uses sub-pages so the range might
16333 		 * not be re-mappable in an address space with larger
16334 		 * pages. Re-assemble any broken-up VM map entries to
16335 		 * improve our chances of making it work.
16336 		 */
16337 		vm_map_simplify_range(map, src_start, src_end);
16338 	}
16339 	while (mapped_size != size) {
16340 		vm_map_size_t   entry_size;
16341 
16342 		/*
16343 		 *	Find the beginning of the region.
16344 		 */
16345 		if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
16346 			result = KERN_INVALID_ADDRESS;
16347 			break;
16348 		}
16349 
16350 		if (src_start < src_entry->vme_start ||
16351 		    (mapped_size && src_start != src_entry->vme_start)) {
16352 			result = KERN_INVALID_ADDRESS;
16353 			break;
16354 		}
16355 
16356 		tmp_size = size - mapped_size;
16357 		if (src_end > src_entry->vme_end) {
16358 			tmp_size -= (src_end - src_entry->vme_end);
16359 		}
16360 
16361 		entry_size = (vm_map_size_t)(src_entry->vme_end -
16362 		    src_entry->vme_start);
16363 
16364 		if (src_entry->is_sub_map &&
16365 		    vmk_flags.vmkf_copy_single_object) {
16366 			vm_map_t submap;
16367 			vm_map_offset_t submap_start;
16368 			vm_map_size_t submap_size;
16369 			boolean_t submap_needs_copy;
16370 
16371 			/*
16372 			 * No check for "required protection" on "src_entry"
16373 			 * because the protections that matter are the ones
16374 			 * on the submap's VM map entry, which will be checked
16375 			 * during the call to vm_map_remap_extract() below.
16376 			 */
16377 			submap_size = src_entry->vme_end - src_start;
16378 			if (submap_size > size) {
16379 				submap_size = size;
16380 			}
16381 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
16382 			submap = VME_SUBMAP(src_entry);
16383 			if (copy) {
16384 				/*
16385 				 * The caller wants a copy-on-write re-mapping,
16386 				 * so let's extract from the submap accordingly.
16387 				 */
16388 				submap_needs_copy = TRUE;
16389 			} else if (src_entry->needs_copy) {
16390 				/*
16391 				 * The caller wants a shared re-mapping but the
16392 				 * submap is mapped with "needs_copy", so its
16393 				 * contents can't be shared as is. Extract the
16394 				 * contents of the submap as "copy-on-write".
16395 				 * The re-mapping won't be shared with the
16396 				 * original mapping but this is equivalent to
16397 				 * what happened with the original "remap from
16398 				 * submap" code.
16399 				 * The shared region is mapped "needs_copy", for
16400 				 * example.
16401 				 */
16402 				submap_needs_copy = TRUE;
16403 			} else {
16404 				/*
16405 				 * The caller wants a shared re-mapping and
16406 				 * this mapping can be shared (no "needs_copy"),
16407 				 * so let's extract from the submap accordingly.
16408 				 * Kernel submaps are mapped without
16409 				 * "needs_copy", for example.
16410 				 */
16411 				submap_needs_copy = FALSE;
16412 			}
16413 			vm_map_reference(submap);
16414 			vm_map_unlock(map);
16415 			src_entry = NULL;
16416 			if (vm_remap_legacy) {
16417 				*cur_protection = VM_PROT_NONE;
16418 				*max_protection = VM_PROT_NONE;
16419 			}
16420 
16421 			DTRACE_VM7(remap_submap_recurse,
16422 			    vm_map_t, map,
16423 			    vm_map_offset_t, addr,
16424 			    vm_map_size_t, size,
16425 			    boolean_t, copy,
16426 			    vm_map_offset_t, submap_start,
16427 			    vm_map_size_t, submap_size,
16428 			    boolean_t, submap_needs_copy);
16429 
16430 			result = vm_map_remap_extract(submap,
16431 			    submap_start,
16432 			    submap_size,
16433 			    submap_needs_copy,
16434 			    map_header,
16435 			    cur_protection,
16436 			    max_protection,
16437 			    inheritance,
16438 			    vmk_flags);
16439 			vm_map_deallocate(submap);
16440 			return result;
16441 		}
16442 
16443 		if (src_entry->is_sub_map) {
16444 			/* protections for submap mapping are irrelevant here */
16445 		} else if (((src_entry->protection & required_cur_prot) !=
16446 		    required_cur_prot) ||
16447 		    ((src_entry->max_protection & required_max_prot) !=
16448 		    required_max_prot)) {
16449 			if (vmk_flags.vmkf_copy_single_object &&
16450 			    mapped_size != 0) {
16451 				/*
16452 				 * Single object extraction.
16453 				 * We can't extract more with the required
16454 				 * protection but we've extracted some, so
16455 				 * stop there and declare success.
16456 				 * The caller should check the size of
16457 				 * the copy entry we've extracted.
16458 				 */
16459 				result = KERN_SUCCESS;
16460 			} else {
16461 				/*
16462 				 * VM range extraction.
16463 				 * Required protection is not available
16464 				 * for this part of the range: fail.
16465 				 */
16466 				result = KERN_PROTECTION_FAILURE;
16467 			}
16468 			break;
16469 		}
16470 
16471 		if (src_entry->is_sub_map) {
16472 			vm_map_t submap;
16473 			vm_map_offset_t submap_start;
16474 			vm_map_size_t submap_size;
16475 			vm_map_copy_t submap_copy;
16476 			vm_prot_t submap_curprot, submap_maxprot;
16477 			boolean_t submap_needs_copy;
16478 
16479 			/*
16480 			 * No check for "required protection" on "src_entry"
16481 			 * because the protections that matter are the ones
16482 			 * on the submap's VM map entry, which will be checked
16483 			 * during the call to vm_map_copy_extract() below.
16484 			 */
16485 			object = VM_OBJECT_NULL;
16486 			submap_copy = VM_MAP_COPY_NULL;
16487 
16488 			/* find equivalent range in the submap */
16489 			submap = VME_SUBMAP(src_entry);
16490 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
16491 			submap_size = tmp_size;
16492 			if (copy) {
16493 				/*
16494 				 * The caller wants a copy-on-write re-mapping,
16495 				 * so let's extract from the submap accordingly.
16496 				 */
16497 				submap_needs_copy = TRUE;
16498 			} else if (src_entry->needs_copy) {
16499 				/*
16500 				 * The caller wants a shared re-mapping but the
16501 				 * submap is mapped with "needs_copy", so its
16502 				 * contents can't be shared as is. Extract the
16503 				 * contents of the submap as "copy-on-write".
16504 				 * The re-mapping won't be shared with the
16505 				 * original mapping but this is equivalent to
16506 				 * what happened with the original "remap from
16507 				 * submap" code.
16508 				 * The shared region is mapped "needs_copy", for
16509 				 * example.
16510 				 */
16511 				submap_needs_copy = TRUE;
16512 			} else {
16513 				/*
16514 				 * The caller wants a shared re-mapping and
16515 				 * this mapping can be shared (no "needs_copy"),
16516 				 * so let's extract from the submap accordingly.
16517 				 * Kernel submaps are mapped without
16518 				 * "needs_copy", for example.
16519 				 */
16520 				submap_needs_copy = FALSE;
16521 			}
16522 			/* extra ref to keep submap alive */
16523 			vm_map_reference(submap);
16524 
16525 			DTRACE_VM7(remap_submap_recurse,
16526 			    vm_map_t, map,
16527 			    vm_map_offset_t, addr,
16528 			    vm_map_size_t, size,
16529 			    boolean_t, copy,
16530 			    vm_map_offset_t, submap_start,
16531 			    vm_map_size_t, submap_size,
16532 			    boolean_t, submap_needs_copy);
16533 
16534 			/*
16535 			 * The map can be safely unlocked since we
16536 			 * already hold a reference on the submap.
16537 			 *
16538 			 * No timestamp since we don't care if the map
16539 			 * gets modified while we're down in the submap.
16540 			 * We'll resume the extraction at src_start + tmp_size
16541 			 * anyway.
16542 			 */
16543 			vm_map_unlock(map);
16544 			src_entry = NULL; /* not valid once map is unlocked */
16545 
16546 			if (vm_remap_legacy) {
16547 				submap_curprot = VM_PROT_NONE;
16548 				submap_maxprot = VM_PROT_NONE;
16549 				if (max_prot_for_prot_copy) {
16550 					submap_maxprot = max_prot_for_prot_copy;
16551 				}
16552 			} else {
16553 				assert(!max_prot_for_prot_copy);
16554 				submap_curprot = *cur_protection;
16555 				submap_maxprot = *max_protection;
16556 			}
16557 			result = vm_map_copy_extract(submap,
16558 			    submap_start,
16559 			    submap_size,
16560 			    submap_needs_copy,
16561 			    &submap_copy,
16562 			    &submap_curprot,
16563 			    &submap_maxprot,
16564 			    inheritance,
16565 			    vmk_flags);
16566 
16567 			/* release extra ref on submap */
16568 			vm_map_deallocate(submap);
16569 			submap = VM_MAP_NULL;
16570 
16571 			if (result != KERN_SUCCESS) {
16572 				vm_map_lock(map);
16573 				break;
16574 			}
16575 
16576 			/* transfer submap_copy entries to map_header */
16577 			while (vm_map_copy_first_entry(submap_copy) !=
16578 			    vm_map_copy_to_entry(submap_copy)) {
16579 				vm_map_entry_t copy_entry;
16580 				vm_map_size_t copy_entry_size;
16581 
16582 				copy_entry = vm_map_copy_first_entry(submap_copy);
16583 
16584 				/*
16585 				 * Prevent kernel_object from being exposed to
16586 				 * user space.
16587 				 */
16588 				if (__improbable(copy_entry->vme_kernel_object)) {
16589 					printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
16590 					    proc_selfpid(),
16591 					    (current_task()->bsd_info
16592 					    ? proc_name_address(current_task()->bsd_info)
16593 					    : "?"));
16594 					DTRACE_VM(extract_kernel_only);
16595 					result = KERN_INVALID_RIGHT;
16596 					vm_map_copy_discard(submap_copy);
16597 					submap_copy = VM_MAP_COPY_NULL;
16598 					vm_map_lock(map);
16599 					break;
16600 				}
16601 
16602 				vm_map_copy_entry_unlink(submap_copy, copy_entry);
16603 				copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
16604 				copy_entry->vme_start = map_address;
16605 				copy_entry->vme_end = map_address + copy_entry_size;
16606 				map_address += copy_entry_size;
16607 				mapped_size += copy_entry_size;
16608 				src_start += copy_entry_size;
16609 				assert(src_start <= src_end);
16610 				_vm_map_store_entry_link(map_header,
16611 				    map_header->links.prev,
16612 				    copy_entry);
16613 			}
16614 			/* done with submap_copy */
16615 			vm_map_copy_discard(submap_copy);
16616 
16617 			if (vm_remap_legacy) {
16618 				*cur_protection &= submap_curprot;
16619 				*max_protection &= submap_maxprot;
16620 			}
16621 
16622 			/* re-acquire the map lock and continue to next entry */
16623 			vm_map_lock(map);
16624 			continue;
16625 		} else {
16626 			object = VME_OBJECT(src_entry);
16627 
16628 			/*
16629 			 * Prevent kernel_object from being exposed to
16630 			 * user space.
16631 			 */
16632 			if (__improbable(object == kernel_object)) {
16633 				printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
16634 				    proc_selfpid(),
16635 				    (current_task()->bsd_info
16636 				    ? proc_name_address(current_task()->bsd_info)
16637 				    : "?"));
16638 				DTRACE_VM(extract_kernel_only);
16639 				result = KERN_INVALID_RIGHT;
16640 				break;
16641 			}
16642 
16643 			if (src_entry->iokit_acct) {
16644 				/*
16645 				 * This entry uses "IOKit accounting".
16646 				 */
16647 			} else if (object != VM_OBJECT_NULL &&
16648 			    (object->purgable != VM_PURGABLE_DENY ||
16649 			    object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
16650 				/*
16651 				 * Purgeable objects have their own accounting:
16652 				 * no pmap accounting for them.
16653 				 */
16654 				assertf(!src_entry->use_pmap,
16655 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
16656 				    map,
16657 				    src_entry,
16658 				    (uint64_t)src_entry->vme_start,
16659 				    (uint64_t)src_entry->vme_end,
16660 				    src_entry->protection,
16661 				    src_entry->max_protection,
16662 				    VME_ALIAS(src_entry));
16663 			} else {
16664 				/*
16665 				 * Not IOKit or purgeable:
16666 				 * must be accounted by pmap stats.
16667 				 */
16668 				assertf(src_entry->use_pmap,
16669 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
16670 				    map,
16671 				    src_entry,
16672 				    (uint64_t)src_entry->vme_start,
16673 				    (uint64_t)src_entry->vme_end,
16674 				    src_entry->protection,
16675 				    src_entry->max_protection,
16676 				    VME_ALIAS(src_entry));
16677 			}
16678 
16679 			if (object == VM_OBJECT_NULL) {
16680 				assert(!src_entry->needs_copy);
16681 				object = vm_object_allocate(entry_size);
16682 				VME_OFFSET_SET(src_entry, 0);
16683 				VME_OBJECT_SET(src_entry, object, false, 0);
16684 				assert(src_entry->use_pmap);
16685 				assert(!map->mapped_in_other_pmaps);
16686 			} else if (src_entry->wired_count ||
16687 			    object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
16688 				/*
16689 				 * A wired memory region should not have
16690 				 * any pending copy-on-write and needs to
16691 				 * keep pointing at the VM object that
16692 				 * contains the wired pages.
16693 				 * If we're sharing this memory (copy=false),
16694 				 * we'll share this VM object.
16695 				 * If we're copying this memory (copy=true),
16696 				 * we'll call vm_object_copy_slowly() below
16697 				 * and use the new VM object for the remapping.
16698 				 *
16699 				 * Or, we are already using an asymmetric
16700 				 * copy, and therefore we already have
16701 				 * the right object.
16702 				 */
16703 				assert(!src_entry->needs_copy);
16704 			} else if (src_entry->needs_copy || object->shadowed ||
16705 			    (object->internal && !object->true_share &&
16706 			    !src_entry->is_shared &&
16707 			    object->vo_size > entry_size)) {
16708 				VME_OBJECT_SHADOW(src_entry, entry_size);
16709 				assert(src_entry->use_pmap);
16710 
16711 				if (!src_entry->needs_copy &&
16712 				    (src_entry->protection & VM_PROT_WRITE)) {
16713 					vm_prot_t prot;
16714 
16715 					assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
16716 
16717 					prot = src_entry->protection & ~VM_PROT_WRITE;
16718 
16719 					if (override_nx(map,
16720 					    VME_ALIAS(src_entry))
16721 					    && prot) {
16722 						prot |= VM_PROT_EXECUTE;
16723 					}
16724 
16725 					assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
16726 
16727 					if (map->mapped_in_other_pmaps) {
16728 						vm_object_pmap_protect(
16729 							VME_OBJECT(src_entry),
16730 							VME_OFFSET(src_entry),
16731 							entry_size,
16732 							PMAP_NULL,
16733 							PAGE_SIZE,
16734 							src_entry->vme_start,
16735 							prot);
16736 #if MACH_ASSERT
16737 					} else if (__improbable(map->pmap == PMAP_NULL)) {
16738 						extern boolean_t vm_tests_in_progress;
16739 						assert(vm_tests_in_progress);
16740 						/*
16741 						 * Some VM tests (in vm_tests.c)
16742 						 * sometimes want to use a VM
16743 						 * map without a pmap.
16744 						 * Otherwise, this should never
16745 						 * happen.
16746 						 */
16747 #endif /* MACH_ASSERT */
16748 					} else {
16749 						pmap_protect(vm_map_pmap(map),
16750 						    src_entry->vme_start,
16751 						    src_entry->vme_end,
16752 						    prot);
16753 					}
16754 				}
16755 
16756 				object = VME_OBJECT(src_entry);
16757 				src_entry->needs_copy = FALSE;
16758 			}
16759 
16760 
16761 			vm_object_lock(object);
16762 			vm_object_reference_locked(object); /* object ref. for new entry */
16763 			assert(!src_entry->needs_copy);
16764 			if (object->copy_strategy ==
16765 			    MEMORY_OBJECT_COPY_SYMMETRIC) {
16766 				/*
16767 				 * If we want to share this object (copy==0),
16768 				 * it needs to be COPY_DELAY.
16769 				 * If we want to copy this object (copy==1),
16770 				 * we can't just set "needs_copy" on our side
16771 				 * and expect the other side to do the same
16772 				 * (symmetrically), so we can't let the object
16773 				 * stay COPY_SYMMETRIC.
16774 				 * So we always switch from COPY_SYMMETRIC to
16775 				 * COPY_DELAY.
16776 				 */
16777 				object->copy_strategy =
16778 				    MEMORY_OBJECT_COPY_DELAY;
16779 				object->true_share = TRUE;
16780 			}
16781 			vm_object_unlock(object);
16782 		}
16783 
16784 		offset = (VME_OFFSET(src_entry) +
16785 		    (src_start - src_entry->vme_start));
16786 
16787 		new_entry = _vm_map_entry_create(map_header);
16788 		vm_map_entry_copy(map, new_entry, src_entry);
16789 		if (new_entry->is_sub_map) {
16790 			/* clr address space specifics */
16791 			new_entry->use_pmap = FALSE;
16792 		} else if (copy) {
16793 			/*
16794 			 * We're dealing with a copy-on-write operation,
16795 			 * so the resulting mapping should not inherit the
16796 			 * original mapping's accounting settings.
16797 			 * "use_pmap" should be reset to its default (TRUE)
16798 			 * so that the new mapping gets accounted for in
16799 			 * the task's memory footprint.
16800 			 */
16801 			new_entry->use_pmap = TRUE;
16802 		}
16803 		/* "iokit_acct" was cleared in vm_map_entry_copy() */
16804 		assert(!new_entry->iokit_acct);
16805 
16806 		new_entry->map_aligned = FALSE;
16807 
16808 		new_entry->vme_start = map_address;
16809 		new_entry->vme_end = map_address + tmp_size;
16810 		assert(new_entry->vme_start < new_entry->vme_end);
16811 		if (copy && vmk_flags.vmkf_remap_prot_copy) {
16812 			/*
16813 			 * Remapping for vm_map_protect(VM_PROT_COPY)
16814 			 * to convert a read-only mapping into a
16815 			 * copy-on-write version of itself but
16816 			 * with write access:
16817 			 * keep the original inheritance and add
16818 			 * VM_PROT_WRITE to the max protection.
16819 			 */
16820 			new_entry->inheritance = src_entry->inheritance;
16821 			new_entry->protection &= max_prot_for_prot_copy;
16822 			new_entry->max_protection |= VM_PROT_WRITE;
16823 		} else {
16824 			new_entry->inheritance = inheritance;
16825 			if (!vm_remap_legacy) {
16826 				new_entry->protection = *cur_protection;
16827 				new_entry->max_protection = *max_protection;
16828 			}
16829 		}
16830 		VME_OFFSET_SET(new_entry, offset);
16831 
16832 		/*
16833 		 * The new region has to be copied now if required.
16834 		 */
16835 RestartCopy:
16836 		if (!copy) {
16837 			if (src_entry->used_for_jit == TRUE) {
16838 				if (same_map) {
16839 				} else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
16840 					/*
16841 					 * Cannot allow an entry describing a JIT
16842 					 * region to be shared across address spaces.
16843 					 */
16844 					result = KERN_INVALID_ARGUMENT;
16845 					vm_object_deallocate(object);
16846 					vm_map_entry_dispose(new_entry);
16847 					new_entry = VM_MAP_ENTRY_NULL;
16848 					break;
16849 				}
16850 			}
16851 
16852 			src_entry->is_shared = TRUE;
16853 			new_entry->is_shared = TRUE;
16854 			if (!(new_entry->is_sub_map)) {
16855 				new_entry->needs_copy = FALSE;
16856 			}
16857 		} else if (src_entry->is_sub_map) {
16858 			/* make this a COW sub_map if not already */
16859 			assert(new_entry->wired_count == 0);
16860 			new_entry->needs_copy = TRUE;
16861 			object = VM_OBJECT_NULL;
16862 		} else if (src_entry->wired_count == 0 &&
16863 		    !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
16864 		    vm_object_copy_quickly(VME_OBJECT(new_entry),
16865 		    VME_OFFSET(new_entry),
16866 		    (new_entry->vme_end -
16867 		    new_entry->vme_start),
16868 		    &src_needs_copy,
16869 		    &new_entry_needs_copy)) {
16870 			new_entry->needs_copy = new_entry_needs_copy;
16871 			new_entry->is_shared = FALSE;
16872 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
16873 
16874 			/*
16875 			 * Handle copy_on_write semantics.
16876 			 */
16877 			if (src_needs_copy && !src_entry->needs_copy) {
16878 				vm_prot_t prot;
16879 
16880 				assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
16881 
16882 				prot = src_entry->protection & ~VM_PROT_WRITE;
16883 
16884 				if (override_nx(map,
16885 				    VME_ALIAS(src_entry))
16886 				    && prot) {
16887 					prot |= VM_PROT_EXECUTE;
16888 				}
16889 
16890 				assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
16891 
16892 				vm_object_pmap_protect(object,
16893 				    offset,
16894 				    entry_size,
16895 				    ((src_entry->is_shared
16896 				    || map->mapped_in_other_pmaps) ?
16897 				    PMAP_NULL : map->pmap),
16898 				    VM_MAP_PAGE_SIZE(map),
16899 				    src_entry->vme_start,
16900 				    prot);
16901 
16902 				assert(src_entry->wired_count == 0);
16903 				src_entry->needs_copy = TRUE;
16904 			}
16905 			/*
16906 			 * Throw away the old object reference of the new entry.
16907 			 */
16908 			vm_object_deallocate(object);
16909 		} else {
16910 			new_entry->is_shared = FALSE;
16911 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
16912 
16913 			src_entry_was_wired = (src_entry->wired_count > 0);
16914 			saved_src_entry = src_entry;
16915 			src_entry = VM_MAP_ENTRY_NULL;
16916 
16917 			/*
16918 			 * The map can be safely unlocked since we
16919 			 * already hold a reference on the object.
16920 			 *
16921 			 * Record the timestamp of the map for later
16922 			 * verification, and unlock the map.
16923 			 */
16924 			version.main_timestamp = map->timestamp;
16925 			vm_map_unlock(map);     /* Increments timestamp once! */
16926 
16927 			/*
16928 			 * Perform the copy.
16929 			 */
16930 			if (src_entry_was_wired > 0 ||
16931 			    (debug4k_no_cow_copyin &&
16932 			    VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
16933 				vm_object_lock(object);
16934 				result = vm_object_copy_slowly(
16935 					object,
16936 					offset,
16937 					(new_entry->vme_end -
16938 					new_entry->vme_start),
16939 					THREAD_UNINT,
16940 					&new_copy_object);
16941 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
16942 				saved_used_for_jit = new_entry->used_for_jit;
16943 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
16944 				new_entry->used_for_jit = saved_used_for_jit;
16945 				VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
16946 				new_entry->needs_copy = FALSE;
16947 			} else {
16948 				vm_object_offset_t new_offset;
16949 
16950 				new_offset = VME_OFFSET(new_entry);
16951 				result = vm_object_copy_strategically(
16952 					object,
16953 					offset,
16954 					(new_entry->vme_end -
16955 					new_entry->vme_start),
16956 					&new_copy_object,
16957 					&new_offset,
16958 					&new_entry_needs_copy);
16959 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
16960 				saved_used_for_jit = new_entry->used_for_jit;
16961 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
16962 				new_entry->used_for_jit = saved_used_for_jit;
16963 				if (new_offset != VME_OFFSET(new_entry)) {
16964 					VME_OFFSET_SET(new_entry, new_offset);
16965 				}
16966 
16967 				new_entry->needs_copy = new_entry_needs_copy;
16968 			}
16969 
16970 			/*
16971 			 * Throw away the old object reference of the new entry.
16972 			 */
16973 			vm_object_deallocate(object);
16974 
16975 			if (result != KERN_SUCCESS &&
16976 			    result != KERN_MEMORY_RESTART_COPY) {
16977 				vm_map_entry_dispose(new_entry);
16978 				vm_map_lock(map);
16979 				break;
16980 			}
16981 
16982 			/*
16983 			 * Verify that the map has not substantially
16984 			 * changed while the copy was being made.
16985 			 */
16986 
16987 			vm_map_lock(map);
16988 			if (version.main_timestamp + 1 != map->timestamp) {
16989 				/*
16990 				 * Simple version comparison failed.
16991 				 *
16992 				 * Retry the lookup and verify that the
16993 				 * same object/offset are still present.
16994 				 */
16995 				saved_src_entry = VM_MAP_ENTRY_NULL;
16996 				vm_object_deallocate(VME_OBJECT(new_entry));
16997 				vm_map_entry_dispose(new_entry);
16998 				if (result == KERN_MEMORY_RESTART_COPY) {
16999 					result = KERN_SUCCESS;
17000 				}
17001 				continue;
17002 			}
17003 			/* map hasn't changed: src_entry is still valid */
17004 			src_entry = saved_src_entry;
17005 			saved_src_entry = VM_MAP_ENTRY_NULL;
17006 
17007 			if (result == KERN_MEMORY_RESTART_COPY) {
17008 				vm_object_reference(object);
17009 				goto RestartCopy;
17010 			}
17011 		}
17012 
17013 		_vm_map_store_entry_link(map_header,
17014 		    map_header->links.prev, new_entry);
17015 
17016 		/* protections for submap mapping are irrelevant here */
17017 		if (vm_remap_legacy && !src_entry->is_sub_map) {
17018 			*cur_protection &= src_entry->protection;
17019 			*max_protection &= src_entry->max_protection;
17020 		}
17021 
17022 		map_address += tmp_size;
17023 		mapped_size += tmp_size;
17024 		src_start += tmp_size;
17025 
17026 		if (vmk_flags.vmkf_copy_single_object) {
17027 			if (mapped_size != size) {
17028 				DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
17029 				    map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
17030 				if (src_entry->vme_next != vm_map_to_entry(map) &&
17031 				    src_entry->vme_next->vme_object_value ==
17032 				    src_entry->vme_object_value) {
17033 					/* XXX TODO4K */
17034 					DEBUG4K_ERROR("could have extended copy to next entry...\n");
17035 				}
17036 			}
17037 			break;
17038 		}
17039 	} /* end while */
17040 
17041 	vm_map_unlock(map);
17042 	if (result != KERN_SUCCESS) {
17043 		/*
17044 		 * Free all allocated elements.
17045 		 */
17046 		for (src_entry = map_header->links.next;
17047 		    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
17048 		    src_entry = new_entry) {
17049 			new_entry = src_entry->vme_next;
17050 			_vm_map_store_entry_unlink(map_header, src_entry);
17051 			if (src_entry->is_sub_map) {
17052 				vm_map_deallocate(VME_SUBMAP(src_entry));
17053 			} else {
17054 				vm_object_deallocate(VME_OBJECT(src_entry));
17055 			}
17056 			vm_map_entry_dispose(src_entry);
17057 		}
17058 	}
17059 	return result;
17060 }
17061 
17062 bool
17063 vm_map_is_exotic(
17064 	vm_map_t map)
17065 {
17066 	return VM_MAP_IS_EXOTIC(map);
17067 }
17068 
17069 bool
17070 vm_map_is_alien(
17071 	vm_map_t map)
17072 {
17073 	return VM_MAP_IS_ALIEN(map);
17074 }
17075 
17076 #if XNU_TARGET_OS_OSX
17077 void
17078 vm_map_mark_alien(
17079 	vm_map_t map)
17080 {
17081 	vm_map_lock(map);
17082 	map->is_alien = true;
17083 	vm_map_unlock(map);
17084 }
17085 
17086 void
17087 vm_map_single_jit(
17088 	vm_map_t map)
17089 {
17090 	vm_map_lock(map);
17091 	map->single_jit = true;
17092 	vm_map_unlock(map);
17093 }
17094 #endif /* XNU_TARGET_OS_OSX */
17095 
17096 /*
17097  * Callers of this function must call vm_map_copy_require on
17098  * previously created vm_map_copy_t or pass a newly created
17099  * one to ensure that it hasn't been forged.
17100  */
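/*
 * Illustrative call pattern only (a sketch, not a real call site in this
 * file): a caller that obtained "copy_map" from, say, vm_map_copy_extract()
 * would validate it before handing it to this routine:
 *
 *	vm_map_copy_require(copy_map);
 *	kr = vm_map_copy_to_physcopy(copy_map, target_map);
 */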
17101 static kern_return_t
17102 vm_map_copy_to_physcopy(
17103 	vm_map_copy_t   copy_map,
17104 	vm_map_t        target_map)
17105 {
17106 	vm_map_size_t           size;
17107 	vm_map_entry_t          entry;
17108 	vm_map_entry_t          new_entry;
17109 	vm_object_t             new_object;
17110 	unsigned int            pmap_flags;
17111 	pmap_t                  new_pmap;
17112 	vm_map_t                new_map;
17113 	vm_map_address_t        src_start, src_end, src_cur;
17114 	vm_map_address_t        dst_start, dst_end, dst_cur;
17115 	kern_return_t           kr;
17116 	void                    *kbuf;
17117 
17118 	/*
17119 	 * Perform the equivalent of vm_allocate() and memcpy().
17120 	 * Replace the mappings in "copy_map" with the newly allocated mapping.
17121 	 */
17122 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
17123 
17124 	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
17125 
17126 	/* create a new pmap to map "copy_map" */
17127 	pmap_flags = 0;
17128 	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
17129 #if PMAP_CREATE_FORCE_4K_PAGES
17130 	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
17131 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
17132 	pmap_flags |= PMAP_CREATE_64BIT;
17133 	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
17134 	if (new_pmap == NULL) {
17135 		return KERN_RESOURCE_SHORTAGE;
17136 	}
17137 
17138 	/* allocate new VM object */
17139 	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
17140 	new_object = vm_object_allocate(size);
17141 	assert(new_object);
17142 
17143 	/* allocate new VM map entry */
17144 	new_entry = vm_map_copy_entry_create(copy_map);
17145 	assert(new_entry);
17146 
17147 	/* finish initializing new VM map entry */
17148 	new_entry->protection = VM_PROT_DEFAULT;
17149 	new_entry->max_protection = VM_PROT_DEFAULT;
17150 	new_entry->use_pmap = TRUE;
17151 
17152 	/* make new VM map entry point to new VM object */
17153 	new_entry->vme_start = 0;
17154 	new_entry->vme_end = size;
17155 	VME_OBJECT_SET(new_entry, new_object, false, 0);
17156 	VME_OFFSET_SET(new_entry, 0);
17157 
17158 	/* create a new pageable VM map to map "copy_map" */
17159 	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
17160 	    VM_MAP_CREATE_PAGEABLE);
17161 	assert(new_map);
17162 	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
17163 
17164 	/* map "copy_map" in the new VM map */
17165 	src_start = 0;
17166 	kr = vm_map_copyout_internal(
17167 		new_map,
17168 		&src_start,
17169 		copy_map,
17170 		copy_map->size,
17171 		FALSE, /* consume_on_success */
17172 		VM_PROT_DEFAULT,
17173 		VM_PROT_DEFAULT,
17174 		VM_INHERIT_DEFAULT);
17175 	assert(kr == KERN_SUCCESS);
17176 	src_end = src_start + copy_map->size;
17177 
17178 	/* map "new_object" in the new VM map */
17179 	vm_object_reference(new_object);
17180 	dst_start = 0;
17181 	kr = vm_map_enter(new_map,
17182 	    &dst_start,
17183 	    size,
17184 	    0,               /* mask */
17185 	    VM_FLAGS_ANYWHERE,
17186 	    VM_MAP_KERNEL_FLAGS_NONE,
17187 	    VM_KERN_MEMORY_OSFMK,
17188 	    new_object,
17189 	    0,               /* offset */
17190 	    FALSE,               /* needs copy */
17191 	    VM_PROT_DEFAULT,
17192 	    VM_PROT_DEFAULT,
17193 	    VM_INHERIT_DEFAULT);
17194 	assert(kr == KERN_SUCCESS);
17195 	dst_end = dst_start + size;
17196 
17197 	/* get a kernel buffer */
17198 	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
17199 
17200 	/* physically copy "copy_map" mappings to new VM object */
17201 	for (src_cur = src_start, dst_cur = dst_start;
17202 	    src_cur < src_end;
17203 	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
17204 		vm_size_t bytes;
17205 
17206 		bytes = PAGE_SIZE;
17207 		if (src_cur + PAGE_SIZE > src_end) {
17208 			/* partial copy for last page */
17209 			bytes = src_end - src_cur;
17210 			assert(bytes > 0 && bytes < PAGE_SIZE);
17211 			/* rest of dst page should be zero-filled */
17212 		}
17213 		/* get bytes from src mapping */
17214 		kr = copyinmap(new_map, src_cur, kbuf, bytes);
17215 		if (kr != KERN_SUCCESS) {
17216 			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
17217 		}
17218 		/* put bytes in dst mapping */
17219 		assert(dst_cur < dst_end);
17220 		assert(dst_cur + bytes <= dst_end);
17221 		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
17222 		if (kr != KERN_SUCCESS) {
17223 			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
17224 		}
17225 	}
17226 
17227 	/* free kernel buffer */
17228 	kfree_data(kbuf, PAGE_SIZE);
17229 
17230 	/* destroy new map */
17231 	vm_map_destroy(new_map);
17232 	new_map = VM_MAP_NULL;
17233 
17234 	/* dispose of the old map entries in "copy_map" */
17235 	while (vm_map_copy_first_entry(copy_map) !=
17236 	    vm_map_copy_to_entry(copy_map)) {
17237 		entry = vm_map_copy_first_entry(copy_map);
17238 		vm_map_copy_entry_unlink(copy_map, entry);
17239 		if (entry->is_sub_map) {
17240 			vm_map_deallocate(VME_SUBMAP(entry));
17241 		} else {
17242 			vm_object_deallocate(VME_OBJECT(entry));
17243 		}
17244 		vm_map_copy_entry_dispose(entry);
17245 	}
17246 
17247 	/* change "copy_map"'s page_size to match "target_map" */
17248 	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
17249 	copy_map->offset = 0;
17250 	copy_map->size = size;
17251 
17252 	/* insert new map entry in "copy_map" */
17253 	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
17254 	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
17255 
17256 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
17257 	return KERN_SUCCESS;
17258 }
17259 
17260 void
17261 vm_map_copy_adjust_get_target_copy_map(
17262 	vm_map_copy_t   copy_map,
17263 	vm_map_copy_t   *target_copy_map_p);
17264 void
17265 vm_map_copy_adjust_get_target_copy_map(
17266 	vm_map_copy_t   copy_map,
17267 	vm_map_copy_t   *target_copy_map_p)
17268 {
17269 	vm_map_copy_t   target_copy_map;
17270 	vm_map_entry_t  entry, target_entry;
17271 
17272 	if (*target_copy_map_p != VM_MAP_COPY_NULL) {
17273 		/* the caller already has a "target_copy_map": use it */
17274 		return;
17275 	}
17276 
17277 	/* the caller wants us to create a new copy of "copy_map" */
17278 	target_copy_map = vm_map_copy_allocate();
17279 	target_copy_map->type = copy_map->type;
17280 	assert(target_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
17281 	target_copy_map->offset = copy_map->offset;
17282 	target_copy_map->size = copy_map->size;
17283 	target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
17284 	vm_map_store_init(&target_copy_map->cpy_hdr);
17285 	for (entry = vm_map_copy_first_entry(copy_map);
17286 	    entry != vm_map_copy_to_entry(copy_map);
17287 	    entry = entry->vme_next) {
17288 		target_entry = vm_map_copy_entry_create(target_copy_map);
17289 		vm_map_entry_copy_full(target_entry, entry);
17290 		if (target_entry->is_sub_map) {
17291 			vm_map_reference(VME_SUBMAP(target_entry));
17292 		} else {
17293 			vm_object_reference(VME_OBJECT(target_entry));
17294 		}
17295 		vm_map_copy_entry_link(
17296 			target_copy_map,
17297 			vm_map_copy_last_entry(target_copy_map),
17298 			target_entry);
17299 	}
17300 	entry = VM_MAP_ENTRY_NULL;
17301 	*target_copy_map_p = target_copy_map;
17302 }
17303 
17304 /*
17305  * Callers of this function must call vm_map_copy_require on
17306  * previously created vm_map_copy_t or pass a newly created
17307  * one to ensure that it hasn't been forged.
17308  */
17309 static void
17310 vm_map_copy_trim(
17311 	vm_map_copy_t   copy_map,
17312 	uint16_t        new_page_shift,
17313 	vm_map_offset_t trim_start,
17314 	vm_map_offset_t trim_end)
17315 {
17316 	uint16_t        copy_page_shift;
17317 	vm_map_entry_t  entry, next_entry;
17318 
17319 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
17320 	assert(copy_map->cpy_hdr.nentries > 0);
17321 
17322 	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
17323 	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
17324 
17325 	/* use the new page_shift to do the clipping */
17326 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
17327 	copy_map->cpy_hdr.page_shift = new_page_shift;
17328 
17329 	for (entry = vm_map_copy_first_entry(copy_map);
17330 	    entry != vm_map_copy_to_entry(copy_map);
17331 	    entry = next_entry) {
17332 		next_entry = entry->vme_next;
17333 		if (entry->vme_end <= trim_start) {
17334 			/* entry fully before trim range: skip */
17335 			continue;
17336 		}
17337 		if (entry->vme_start >= trim_end) {
17338 			/* entry fully after trim range: done */
17339 			break;
17340 		}
17341 		/* clip entry if needed */
17342 		vm_map_copy_clip_start(copy_map, entry, trim_start);
17343 		vm_map_copy_clip_end(copy_map, entry, trim_end);
17344 		/* dispose of entry */
17345 		copy_map->size -= entry->vme_end - entry->vme_start;
17346 		vm_map_copy_entry_unlink(copy_map, entry);
17347 		if (entry->is_sub_map) {
17348 			vm_map_deallocate(VME_SUBMAP(entry));
17349 		} else {
17350 			vm_object_deallocate(VME_OBJECT(entry));
17351 		}
17352 		vm_map_copy_entry_dispose(entry);
17353 		entry = VM_MAP_ENTRY_NULL;
17354 	}
17355 
17356 	/* restore copy_map's original page_shift */
17357 	copy_map->cpy_hdr.page_shift = copy_page_shift;
17358 }
17359 
17360 /*
17361  * Make any necessary adjustments to "copy_map" to allow it to be
17362  * mapped into "target_map".
17363  * If no changes were necessary, "target_copy_map" points to the
17364  * untouched "copy_map".
17365  * If changes are necessary, changes will be made to "target_copy_map".
17366  * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
17367  * copy the original "copy_map" to it before applying the changes.
17368  * The caller should discard "target_copy_map" if it's not the same as
17369  * the original "copy_map".
17370  */
17371 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
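/*
 * Sketch of the typical caller pattern (compare vm_map_remap() and
 * vm_map_range_physical_size() below; exact arguments vary per call site):
 *
 *	target_copy_map = copy_map;	(or VM_MAP_COPY_NULL for a fresh copy)
 *	kr = vm_map_copy_adjust_to_target(copy_map, offset, size, target_map,
 *	    copy, &target_copy_map, &overmap_start, &overmap_end,
 *	    &trimmed_start);
 *	if (kr == KERN_SUCCESS && target_copy_map != copy_map) {
 *		(use "target_copy_map", then discard it separately)
 *	}
 */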
17372 kern_return_t
17373 vm_map_copy_adjust_to_target(
17374 	vm_map_copy_t           src_copy_map,
17375 	vm_map_offset_t         offset,
17376 	vm_map_size_t           size,
17377 	vm_map_t                target_map,
17378 	boolean_t               copy,
17379 	vm_map_copy_t           *target_copy_map_p,
17380 	vm_map_offset_t         *overmap_start_p,
17381 	vm_map_offset_t         *overmap_end_p,
17382 	vm_map_offset_t         *trimmed_start_p)
17383 {
17384 	vm_map_copy_t           copy_map, target_copy_map;
17385 	vm_map_size_t           target_size;
17386 	vm_map_size_t           src_copy_map_size;
17387 	vm_map_size_t           overmap_start, overmap_end;
17388 	int                     misalignments;
17389 	vm_map_entry_t          entry, target_entry;
17390 	vm_map_offset_t         addr_adjustment;
17391 	vm_map_offset_t         new_start, new_end;
17392 	int                     copy_page_mask, target_page_mask;
17393 	uint16_t                copy_page_shift, target_page_shift;
17394 	vm_map_offset_t         trimmed_end;
17395 
17396 	/*
17397 	 * Assert that the vm_map_copy is coming from the right
17398 	 * zone and hasn't been forged
17399 	 */
17400 	vm_map_copy_require(src_copy_map);
17401 	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
17402 
17403 	/*
17404 	 * Start working with "src_copy_map" but we'll switch
17405 	 * to "target_copy_map" as soon as we start making adjustments.
17406 	 */
17407 	copy_map = src_copy_map;
17408 	src_copy_map_size = src_copy_map->size;
17409 
17410 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
17411 	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
17412 	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
17413 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
17414 
17415 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p);
17416 
17417 	target_copy_map = *target_copy_map_p;
17418 	if (target_copy_map != VM_MAP_COPY_NULL) {
17419 		vm_map_copy_require(target_copy_map);
17420 	}
17421 
17422 	if (offset + size > copy_map->size) {
17423 		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size);
17424 		return KERN_INVALID_ARGUMENT;
17425 	}
17426 
17427 	/* trim the end */
17428 	trimmed_end = 0;
17429 	new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask);
17430 	if (new_end < copy_map->size) {
17431 		trimmed_end = src_copy_map_size - new_end;
17432 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
17433 		/* get "target_copy_map" if needed and adjust it */
17434 		vm_map_copy_adjust_get_target_copy_map(copy_map,
17435 		    &target_copy_map);
17436 		copy_map = target_copy_map;
17437 		vm_map_copy_trim(target_copy_map, target_page_shift,
17438 		    new_end, copy_map->size);
17439 	}
17440 
17441 	/* trim the start */
17442 	new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask);
17443 	if (new_start != 0) {
17444 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start);
17445 		/* get "target_copy_map" if needed and adjust it */
17446 		vm_map_copy_adjust_get_target_copy_map(copy_map,
17447 		    &target_copy_map);
17448 		copy_map = target_copy_map;
17449 		vm_map_copy_trim(target_copy_map, target_page_shift,
17450 		    0, new_start);
17451 	}
17452 	*trimmed_start_p = new_start;
17453 
17454 	/* target_size starts with what's left after trimming */
17455 	target_size = copy_map->size;
17456 	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
17457 	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
17458 	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
17459 	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
17460 
17461 	/* check for misalignments but don't adjust yet */
17462 	misalignments = 0;
17463 	overmap_start = 0;
17464 	overmap_end = 0;
17465 	if (copy_page_shift < target_page_shift) {
17466 		/*
17467 		 * Remapping from 4K to 16K: check the VM object alignments
17468 		 * throughout the range.
17469 		 * If the start and end of the range are mis-aligned, we can
17470 		 * over-map to re-align, and adjust the "overmap" start/end
17471 		 * and "target_size" of the range accordingly.
17472 		 * If there is any mis-alignment within the range:
17473 		 *     if "copy":
17474 		 *         we can do immediate-copy instead of copy-on-write,
17475 		 *     else:
17476 		 *         no way to remap and share; fail.
17477 		 */
17478 		for (entry = vm_map_copy_first_entry(copy_map);
17479 		    entry != vm_map_copy_to_entry(copy_map);
17480 		    entry = entry->vme_next) {
17481 			vm_object_offset_t object_offset_start, object_offset_end;
17482 
17483 			object_offset_start = VME_OFFSET(entry);
17484 			object_offset_end = object_offset_start;
17485 			object_offset_end += entry->vme_end - entry->vme_start;
17486 			if (object_offset_start & target_page_mask) {
17487 				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
17488 					overmap_start++;
17489 				} else {
17490 					misalignments++;
17491 				}
17492 			}
17493 			if (object_offset_end & target_page_mask) {
17494 				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
17495 					overmap_end++;
17496 				} else {
17497 					misalignments++;
17498 				}
17499 			}
17500 		}
17501 	}
17502 	entry = VM_MAP_ENTRY_NULL;
17503 
17504 	/* decide how to deal with misalignments */
17505 	assert(overmap_start <= 1);
17506 	assert(overmap_end <= 1);
17507 	if (!overmap_start && !overmap_end && !misalignments) {
17508 		/* copy_map is properly aligned for target_map ... */
17509 		if (*trimmed_start_p) {
17510 			/* ... but we trimmed it, so still need to adjust */
17511 		} else {
17512 			/* ... and we didn't trim anything: we're done */
17513 			if (target_copy_map == VM_MAP_COPY_NULL) {
17514 				target_copy_map = copy_map;
17515 			}
17516 			*target_copy_map_p = target_copy_map;
17517 			*overmap_start_p = 0;
17518 			*overmap_end_p = 0;
17519 			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
17520 			return KERN_SUCCESS;
17521 		}
17522 	} else if (misalignments && !copy) {
17523 		/* can't "share" if misaligned */
17524 		DEBUG4K_ADJUST("unsupported sharing\n");
17525 #if MACH_ASSERT
17526 		if (debug4k_panic_on_misaligned_sharing) {
17527 			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
17528 		}
17529 #endif /* MACH_ASSERT */
17530 		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
17531 		return KERN_NOT_SUPPORTED;
17532 	} else {
17533 		/* can't virtual-copy if misaligned (but can physical-copy) */
17534 		DEBUG4K_ADJUST("mis-aligned copying\n");
17535 	}
17536 
17537 	/* get a "target_copy_map" if needed and switch to it */
17538 	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
17539 	copy_map = target_copy_map;
17540 
17541 	if (misalignments && copy) {
17542 		vm_map_size_t target_copy_map_size;
17543 
17544 		/*
17545 		 * Can't do copy-on-write with misaligned mappings.
17546 		 * Replace the mappings with a physical copy of the original
17547 		 * mappings' contents.
17548 		 */
17549 		target_copy_map_size = target_copy_map->size;
17550 		kern_return_t kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
17551 		if (kr != KERN_SUCCESS) {
17552 			return kr;
17553 		}
17554 		*target_copy_map_p = target_copy_map;
17555 		*overmap_start_p = 0;
17556 		*overmap_end_p = target_copy_map->size - target_copy_map_size;
17557 		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
17558 		return KERN_SUCCESS;
17559 	}
17560 
17561 	/* apply the adjustments */
17562 	misalignments = 0;
17563 	overmap_start = 0;
17564 	overmap_end = 0;
17565 	/* remove copy_map->offset, so that everything starts at offset 0 */
17566 	addr_adjustment = copy_map->offset;
17567 	/* also remove whatever we trimmed from the start */
17568 	addr_adjustment += *trimmed_start_p;
17569 	for (target_entry = vm_map_copy_first_entry(target_copy_map);
17570 	    target_entry != vm_map_copy_to_entry(target_copy_map);
17571 	    target_entry = target_entry->vme_next) {
17572 		vm_object_offset_t object_offset_start, object_offset_end;
17573 
17574 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
17575 		object_offset_start = VME_OFFSET(target_entry);
17576 		if (object_offset_start & target_page_mask) {
17577 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
17578 			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
17579 				/*
17580 				 * start of 1st entry is mis-aligned:
17581 				 * re-adjust by over-mapping.
17582 				 */
17583 				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
17584 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
17585 				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
17586 			} else {
17587 				misalignments++;
17588 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
17589 				assert(copy);
17590 			}
17591 		}
17592 
17593 		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
17594 			target_size += overmap_start;
17595 		} else {
17596 			target_entry->vme_start += overmap_start;
17597 		}
17598 		target_entry->vme_end += overmap_start;
17599 
17600 		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
17601 		if (object_offset_end & target_page_mask) {
17602 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
17603 			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
17604 				/*
17605 				 * end of last entry is mis-aligned: re-adjust by over-mapping.
17606 				 */
17607 				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
17608 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
17609 				target_entry->vme_end += overmap_end;
17610 				target_size += overmap_end;
17611 			} else {
17612 				misalignments++;
17613 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
17614 				assert(copy);
17615 			}
17616 		}
17617 		target_entry->vme_start -= addr_adjustment;
17618 		target_entry->vme_end -= addr_adjustment;
17619 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
17620 	}
17621 
17622 	target_copy_map->size = target_size;
17623 	target_copy_map->offset += overmap_start;
17624 	target_copy_map->offset -= addr_adjustment;
17625 	target_copy_map->cpy_hdr.page_shift = target_page_shift;
17626 
17627 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
17628 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
17629 	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
17630 	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
17631 
17632 	*target_copy_map_p = target_copy_map;
17633 	*overmap_start_p = overmap_start;
17634 	*overmap_end_p = overmap_end;
17635 
17636 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
17637 	return KERN_SUCCESS;
17638 }
17639 
17640 kern_return_t
17641 vm_map_range_physical_size(
17642 	vm_map_t         map,
17643 	vm_map_address_t start,
17644 	mach_vm_size_t   size,
17645 	mach_vm_size_t * phys_size)
17646 {
17647 	kern_return_t   kr;
17648 	vm_map_copy_t   copy_map, target_copy_map;
17649 	vm_map_offset_t adjusted_start, adjusted_end;
17650 	vm_map_size_t   adjusted_size;
17651 	vm_prot_t       cur_prot, max_prot;
17652 	vm_map_offset_t overmap_start, overmap_end, trimmed_start;
17653 	vm_map_kernel_flags_t vmk_flags;
17654 
17655 	adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
17656 	adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
17657 	adjusted_size = adjusted_end - adjusted_start;
17658 	*phys_size = adjusted_size;
17659 	if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
17660 		return KERN_SUCCESS;
17661 	}
17662 	if (start == 0) {
17663 		adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
17664 		adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
17665 		adjusted_size = adjusted_end - adjusted_start;
17666 		*phys_size = adjusted_size;
17667 		return KERN_SUCCESS;
17668 	}
17669 	if (adjusted_size == 0) {
17670 		DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx adjusted 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_size);
17671 		*phys_size = 0;
17672 		return KERN_SUCCESS;
17673 	}
17674 
17675 	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
17676 	vmk_flags.vmkf_copy_pageable = TRUE;
17677 	vmk_flags.vmkf_copy_same_map = TRUE;
17678 	assert(adjusted_size != 0);
17679 	cur_prot = VM_PROT_NONE; /* legacy mode */
17680 	max_prot = VM_PROT_NONE; /* legacy mode */
17681 	kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
17682 	    FALSE /* copy */,
17683 	    &copy_map,
17684 	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
17685 	    vmk_flags);
17686 	if (kr != KERN_SUCCESS) {
17687 		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
17688 		//assert(0);
17689 		*phys_size = 0;
17690 		return kr;
17691 	}
17692 	assert(copy_map != VM_MAP_COPY_NULL);
17693 	target_copy_map = copy_map;
17694 	DEBUG4K_ADJUST("adjusting...\n");
17695 	kr = vm_map_copy_adjust_to_target(
17696 		copy_map,
17697 		start - adjusted_start, /* offset */
17698 		size, /* size */
17699 		kernel_map,
17700 		FALSE,                          /* copy */
17701 		&target_copy_map,
17702 		&overmap_start,
17703 		&overmap_end,
17704 		&trimmed_start);
17705 	if (kr == KERN_SUCCESS) {
17706 		if (target_copy_map->size != *phys_size) {
17707 			DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
17708 		}
17709 		*phys_size = target_copy_map->size;
17710 	} else {
17711 		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
17712 		//assert(0);
17713 		*phys_size = 0;
17714 	}
17715 	vm_map_copy_discard(copy_map);
17716 	copy_map = VM_MAP_COPY_NULL;
17717 
17718 	return kr;
17719 }
17720 
17721 
17722 kern_return_t
17723 memory_entry_check_for_adjustment(
17724 	vm_map_t                        src_map,
17725 	ipc_port_t                      port,
17726 	vm_map_offset_t         *overmap_start,
17727 	vm_map_offset_t         *overmap_end)
17728 {
17729 	kern_return_t kr = KERN_SUCCESS;
17730 	vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL;
17731 
17732 	assert(port);
17733 	assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port));
17734 
17735 	vm_named_entry_t        named_entry;
17736 
17737 	named_entry = mach_memory_entry_from_port(port);
17738 	named_entry_lock(named_entry);
17739 	copy_map = named_entry->backing.copy;
17740 	target_copy_map = copy_map;
17741 
17742 	if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) {
17743 		vm_map_offset_t trimmed_start;
17744 
17745 		trimmed_start = 0;
17746 		DEBUG4K_ADJUST("adjusting...\n");
17747 		kr = vm_map_copy_adjust_to_target(
17748 			copy_map,
17749 			0, /* offset */
17750 			copy_map->size, /* size */
17751 			src_map,
17752 			FALSE, /* copy */
17753 			&target_copy_map,
17754 			overmap_start,
17755 			overmap_end,
17756 			&trimmed_start);
17757 		assert(trimmed_start == 0);
17758 	}
17759 	named_entry_unlock(named_entry);
17760 
17761 	return kr;
17762 }
17763 
17764 
17765 /*
17766  *	Routine:	vm_remap
17767  *
17768  *			Map portion of a task's address space.
17769  *			Mapped region must not overlap more than
17770  *			one vm memory object. Protections and
17771  *			inheritance attributes remain the same
17772  *			as in the original task and are	out parameters.
17773  *			Source and Target task can be identical
17774  *			Other attributes are identical as for vm_map()
17775  */
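/*
 * Illustrative sketch of a "share" style call (hypothetical values, not a
 * real call site).  Passing VM_PROT_NONE in the IN/OUT protections is an
 * assumption modeled on the legacy-mode usage of vm_map_copy_extract()
 * elsewhere in this file; the actual protections come back as out values:
 *
 *	vm_map_address_t target_addr = 0;
 *	vm_prot_t cur_prot = VM_PROT_NONE, max_prot = VM_PROT_NONE;
 *	kr = vm_map_remap(target_map, &target_addr, size, 0,
 *	    VM_FLAGS_ANYWHERE | VM_FLAGS_RETURN_DATA_ADDR,
 *	    VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_NONE,
 *	    src_map, memory_address, FALSE,	(share rather than copy)
 *	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT);
 */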
17776 kern_return_t
17777 vm_map_remap(
17778 	vm_map_t                target_map,
17779 	vm_map_address_t        *address,
17780 	vm_map_size_t           size,
17781 	vm_map_offset_t         mask,
17782 	int                     flags,
17783 	vm_map_kernel_flags_t   vmk_flags,
17784 	vm_tag_t                tag,
17785 	vm_map_t                src_map,
17786 	vm_map_offset_t         memory_address,
17787 	boolean_t               copy,
17788 	vm_prot_t               *cur_protection, /* IN/OUT */
17789 	vm_prot_t               *max_protection, /* IN/OUT */
17790 	vm_inherit_t            inheritance)
17791 {
17792 	kern_return_t           result;
17793 	vm_map_entry_t          entry;
17794 	vm_map_entry_t          insp_entry = VM_MAP_ENTRY_NULL;
17795 	vm_map_entry_t          new_entry;
17796 	vm_map_copy_t           copy_map;
17797 	vm_map_offset_t         offset_in_mapping;
17798 	vm_map_size_t           target_size = 0;
17799 	vm_map_size_t           src_page_mask, target_page_mask;
17800 	vm_map_offset_t         overmap_start, overmap_end, trimmed_start;
17801 	vm_map_offset_t         initial_memory_address;
17802 	vm_map_size_t           initial_size;
17803 	VM_MAP_ZAP_DECLARE(zap_list);
17804 
17805 	if (target_map == VM_MAP_NULL) {
17806 		return KERN_INVALID_ARGUMENT;
17807 	}
17808 
17809 	initial_memory_address = memory_address;
17810 	initial_size = size;
17811 	src_page_mask = VM_MAP_PAGE_MASK(src_map);
17812 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
17813 
17814 	switch (inheritance) {
17815 	case VM_INHERIT_NONE:
17816 	case VM_INHERIT_COPY:
17817 	case VM_INHERIT_SHARE:
17818 		if (size != 0 && src_map != VM_MAP_NULL) {
17819 			break;
17820 		}
17821 		OS_FALLTHROUGH;
17822 	default:
17823 		return KERN_INVALID_ARGUMENT;
17824 	}
17825 
17826 	if (src_page_mask != target_page_mask) {
17827 		if (copy) {
17828 			DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
17829 		} else {
17830 			DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
17831 		}
17832 	}
17833 
17834 	/*
17835 	 * If the user is requesting that we return the address of the
17836 	 * first byte of the data (rather than the base of the page),
17837 	 * then we use different rounding semantics: specifically,
17838 	 * we assume that (memory_address, size) describes a region
17839 	 * all of whose pages we must cover, rather than a base to be truncated
17840 	 * down and a size to be added to that base.  So we figure out
17841 	 * the highest page that the requested region includes and make
17842 	 * sure that the size will cover it.
17843 	 *
17844 	 * The key example we're worried about is of the form:
17845 	 *
17846 	 *              memory_address = 0x1ff0, size = 0x20
17847 	 *
17848 	 * With the old semantics, we round down the memory_address to 0x1000
17849 	 * and round up the size to 0x1000, resulting in our covering *only*
17850 	 * page 0x1000.  With the new semantics, we'd realize that the region covers
17851 	 * 0x1ff0-0x2010, and compute a size of 0x2000.  Thus, we cover both page
17852 	 * 0x1000 and page 0x2000 in the region we remap.
17853 	 */
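	/*
	 * Worked example of the new semantics for the case above, assuming
	 * 4k source pages:
	 *	range_start       = trunc_page(0x1ff0)        = 0x1000
	 *	range_end         = round_page(0x1ff0 + 0x20)  = 0x3000
	 *	size              = range_end - range_start    = 0x2000
	 *	offset_in_mapping = 0x1ff0 - 0x1000            = 0x0ff0
	 */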
17854 	if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) {
17855 		vm_map_offset_t range_start, range_end;
17856 
17857 		range_start = vm_map_trunc_page(memory_address, src_page_mask);
17858 		range_end = vm_map_round_page(memory_address + size, src_page_mask);
17859 		memory_address = range_start;
17860 		size = range_end - range_start;
17861 		offset_in_mapping = initial_memory_address - memory_address;
17862 	} else {
17863 		/*
17864 		 * IMPORTANT:
17865 		 * This legacy code path is broken: for the range mentioned
17866 		 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
17867 		 * two 4k pages, it yields [ memory_address = 0x1000,
17868 		 * size = 0x1000 ], which covers only the first 4k page.
17869 		 * BUT some code unfortunately depends on this bug, so we
17870 		 * can't fix it without breaking something.
17871 		 * New code should get automatically opted into the new
17872 		 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flag.
17873 		 */
17874 		offset_in_mapping = 0;
17875 		memory_address = vm_map_trunc_page(memory_address, src_page_mask);
17876 		size = vm_map_round_page(size, src_page_mask);
17877 		initial_memory_address = memory_address;
17878 		initial_size = size;
17879 	}
17880 
17881 
17882 	if (size == 0) {
17883 		return KERN_INVALID_ARGUMENT;
17884 	}
17885 
17886 	if (flags & VM_FLAGS_RESILIENT_MEDIA) {
17887 		/* must be copy-on-write to be "media resilient" */
17888 		if (!copy) {
17889 			return KERN_INVALID_ARGUMENT;
17890 		}
17891 	}
17892 
17893 	vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
17894 	vmk_flags.vmkf_copy_same_map = (src_map == target_map);
17895 
17896 	assert(size != 0);
17897 	result = vm_map_copy_extract(src_map,
17898 	    memory_address,
17899 	    size,
17900 	    copy, &copy_map,
17901 	    cur_protection, /* IN/OUT */
17902 	    max_protection, /* IN/OUT */
17903 	    inheritance,
17904 	    vmk_flags);
17905 	if (result != KERN_SUCCESS) {
17906 		return result;
17907 	}
17908 	assert(copy_map != VM_MAP_COPY_NULL);
17909 
17910 	overmap_start = 0;
17911 	overmap_end = 0;
17912 	trimmed_start = 0;
17913 	target_size = size;
17914 	if (src_page_mask != target_page_mask) {
17915 		vm_map_copy_t target_copy_map;
17916 
17917 		target_copy_map = copy_map; /* can modify "copy_map" itself */
17918 		DEBUG4K_ADJUST("adjusting...\n");
17919 		result = vm_map_copy_adjust_to_target(
17920 			copy_map,
17921 			offset_in_mapping, /* offset */
17922 			initial_size,
17923 			target_map,
17924 			copy,
17925 			&target_copy_map,
17926 			&overmap_start,
17927 			&overmap_end,
17928 			&trimmed_start);
17929 		if (result != KERN_SUCCESS) {
17930 			DEBUG4K_COPY("failed to adjust 0x%x\n", result);
17931 			vm_map_copy_discard(copy_map);
17932 			return result;
17933 		}
17934 		if (trimmed_start == 0) {
17935 			/* nothing trimmed: no adjustment needed */
17936 		} else if (trimmed_start >= offset_in_mapping) {
17937 			/* trimmed more than offset_in_mapping: nothing left */
17938 			assert(overmap_start == 0);
17939 			assert(overmap_end == 0);
17940 			offset_in_mapping = 0;
17941 		} else {
17942 			/* trimmed some of offset_in_mapping: adjust */
17943 			assert(overmap_start == 0);
17944 			assert(overmap_end == 0);
17945 			offset_in_mapping -= trimmed_start;
17946 		}
17947 		offset_in_mapping += overmap_start;
17948 		target_size = target_copy_map->size;
17949 	}
17950 
17951 	/*
17952 	 * Allocate/check a range of free virtual address
17953 	 * space for the target
17954 	 */
17955 	*address = vm_map_trunc_page(*address, target_page_mask);
17956 	vm_map_lock(target_map);
17957 	target_size = vm_map_round_page(target_size, target_page_mask);
17958 	result = vm_map_remap_range_allocate(target_map, address,
17959 	    target_size, mask, flags, vmk_flags, tag,
17960 	    &insp_entry, &zap_list);
17961 
17962 	for (entry = vm_map_copy_first_entry(copy_map);
17963 	    entry != vm_map_copy_to_entry(copy_map);
17964 	    entry = new_entry) {
17965 		new_entry = entry->vme_next;
17966 		vm_map_copy_entry_unlink(copy_map, entry);
17967 		if (result == KERN_SUCCESS) {
17968 			if (flags & VM_FLAGS_RESILIENT_CODESIGN) {
17969 				/* no codesigning -> read-only access */
17970 				entry->max_protection = VM_PROT_READ;
17971 				entry->protection = VM_PROT_READ;
17972 				entry->vme_resilient_codesign = TRUE;
17973 			}
17974 			entry->vme_start += *address;
17975 			entry->vme_end += *address;
17976 			assert(!entry->map_aligned);
17977 			if ((flags & VM_FLAGS_RESILIENT_MEDIA) &&
17978 			    !entry->is_sub_map &&
17979 			    (VME_OBJECT(entry) == VM_OBJECT_NULL ||
17980 			    VME_OBJECT(entry)->internal)) {
17981 				entry->vme_resilient_media = TRUE;
17982 			}
17983 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
17984 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
17985 			assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
17986 			vm_map_store_entry_link(target_map, insp_entry, entry,
17987 			    vmk_flags);
17988 			insp_entry = entry;
17989 		} else {
17990 			if (!entry->is_sub_map) {
17991 				vm_object_deallocate(VME_OBJECT(entry));
17992 			} else {
17993 				vm_map_deallocate(VME_SUBMAP(entry));
17994 			}
17995 			vm_map_copy_entry_dispose(entry);
17996 		}
17997 	}
17998 
17999 	if (flags & VM_FLAGS_RESILIENT_CODESIGN) {
18000 		*cur_protection = VM_PROT_READ;
18001 		*max_protection = VM_PROT_READ;
18002 	}
18003 
18004 	if (result == KERN_SUCCESS) {
18005 		target_map->size += target_size;
18006 		SAVE_HINT_MAP_WRITE(target_map, insp_entry);
18007 
18008 	}
18009 	vm_map_unlock(target_map);
18010 
18011 	vm_map_zap_dispose(&zap_list);
18012 
18013 	if (result == KERN_SUCCESS && target_map->wiring_required) {
18014 		result = vm_map_wire_kernel(target_map, *address,
18015 		    *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
18016 		    TRUE);
18017 	}
18018 
18019 	/*
18020 	 * If requested, return the address of the data pointed to by the
18021 	 * request, rather than the base of the resulting page.
18022 	 */
18023 	if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) {
18024 		*address += offset_in_mapping;
18025 	}
18026 
18027 	if (src_page_mask != target_page_mask) {
18028 		DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx  result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result);
18029 	}
18030 	vm_map_copy_discard(copy_map);
18031 	copy_map = VM_MAP_COPY_NULL;
18032 
18033 	return result;
18034 }
18035 
18036 /*
18037  *	Routine:	vm_map_remap_range_allocate
18038  *
18039  *	Description:
18040  *		Allocate a range in the specified virtual address map.
18041  *		Returns the address and the map entry just before the allocated
18042  *		range
18043  *
18044  *	Map must be locked.
18045  */
18046 
18047 static kern_return_t
18048 vm_map_remap_range_allocate(
18049 	vm_map_t                map,
18050 	vm_map_address_t        *address,       /* IN/OUT */
18051 	vm_map_size_t           size,
18052 	vm_map_offset_t         mask,
18053 	int                     flags,
18054 	vm_map_kernel_flags_t   vmk_flags,
18055 	__unused vm_tag_t       tag,
18056 	vm_map_entry_t          *map_entry,     /* OUT */
18057 	vm_map_zap_t            zap_list)
18058 {
18059 	vm_map_entry_t  entry;
18060 	vm_map_offset_t start;
18061 	kern_return_t   kr;
18062 
18063 	start = *address;
18064 
18065 	if (flags & VM_FLAGS_ANYWHERE) {
18066 		if (flags & VM_FLAGS_RANDOM_ADDR) {
18067 			vmk_flags.vmkf_random_address = true;
18068 		}
18069 		if (start) {
18070 			vmk_flags.vmkf_range_id = kmem_addr_get_range(start, size);
18071 		}
18072 
18073 		kr = vm_map_locate_space(map, size, mask, vmk_flags,
18074 		    &start, &entry);
18075 		if (kr != KERN_SUCCESS) {
18076 			return kr;
18077 		}
18078 		*address = start;
18079 	} else {
18080 		vm_map_entry_t  temp_entry;
18081 		vm_map_offset_t end;
18082 
18083 		/*
18084 		 *	Verify that:
18085 		 *		the address doesn't itself violate
18086 		 *		the mask requirement.
18087 		 */
18088 
18089 		if ((start & mask) != 0) {
18090 			return KERN_NO_SPACE;
18091 		}
18092 
18093 
18094 		/*
18095 		 *	...	the address is within bounds
18096 		 */
18097 
18098 		end = start + size;
18099 
18100 		if ((start < map->min_offset) ||
18101 		    (end > map->max_offset) ||
18102 		    (start >= end)) {
18103 			return KERN_INVALID_ADDRESS;
18104 		}
18105 
18106 		/*
18107 		 * If we're asked to overwrite whatever was mapped in that
18108 		 * range, first deallocate that range.
18109 		 */
18110 		if (flags & VM_FLAGS_OVERWRITE) {
18111 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
18112 
18113 			/*
18114 			 * We use a "zap_list" to avoid having to unlock
18115 			 * the "map" in vm_map_delete(), which would compromise
18116 			 * the atomicity of the "deallocate" and then "remap"
18117 			 * combination.
18118 			 */
18119 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
18120 
18121 			if (vmk_flags.vmkf_overwrite_immutable) {
18122 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
18123 			}
18124 			(void)vm_map_delete(map, start, end, remove_flags,
18125 			    KMEM_GUARD_NONE, zap_list);
18126 		}
18127 
18128 		/*
18129 		 *	...	the starting address isn't allocated
18130 		 */
18131 
18132 		if (vm_map_lookup_entry(map, start, &temp_entry)) {
18133 			return KERN_NO_SPACE;
18134 		}
18135 
18136 		entry = temp_entry;
18137 
18138 		/*
18139 		 *	...	the next region doesn't overlap the
18140 		 *		end point.
18141 		 */
18142 
18143 		if ((entry->vme_next != vm_map_to_entry(map)) &&
18144 		    (entry->vme_next->vme_start < end)) {
18145 			return KERN_NO_SPACE;
18146 		}
18147 	}
18148 	*map_entry = entry;
18149 	return KERN_SUCCESS;
18150 }
18151 
18152 /*
18153  *	vm_map_switch:
18154  *
18155  *	Set the address map for the current thread to the specified map
18156  */
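/*
 * Callers are expected to save the returned map and switch back when done,
 * as vm_map_write_user() below does.  Sketch:
 *
 *	oldmap = vm_map_switch(map);
 *	(operate with "map" as the current address map)
 *	vm_map_switch(oldmap);
 */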
18157 
18158 vm_map_t
18159 vm_map_switch(
18160 	vm_map_t        map)
18161 {
18162 	int             mycpu;
18163 	thread_t        thread = current_thread();
18164 	vm_map_t        oldmap = thread->map;
18165 
18166 	mp_disable_preemption();
18167 	mycpu = cpu_number();
18168 
18169 	/*
18170 	 *	Deactivate the current map and activate the requested map
18171 	 */
18172 	PMAP_SWITCH_USER(thread, map, mycpu);
18173 
18174 	mp_enable_preemption();
18175 	return oldmap;
18176 }
18177 
18178 
18179 /*
18180  *	Routine:	vm_map_write_user
18181  *
18182  *	Description:
18183  *		Copy out data from a kernel space into space in the
18184  *		destination map. The space must already exist in the
18185  *		destination map.
18186  *		NOTE:  This routine should only be called by threads
18187  *		which can block on a page fault. i.e. kernel mode user
18188  *		threads.
18189  *
18190  */
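/*
 * Illustrative sketch (hypothetical "task_map", "kbuf" and "user_dst", not a
 * real call site):
 *
 *	char kbuf[64];
 *	...
 *	kr = vm_map_write_user(task_map, kbuf, user_dst, sizeof(kbuf));
 */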
18191 kern_return_t
18192 vm_map_write_user(
18193 	vm_map_t                map,
18194 	void                    *src_p,
18195 	vm_map_address_t        dst_addr,
18196 	vm_size_t               size)
18197 {
18198 	kern_return_t   kr = KERN_SUCCESS;
18199 
18200 	if (current_map() == map) {
18201 		if (copyout(src_p, dst_addr, size)) {
18202 			kr = KERN_INVALID_ADDRESS;
18203 		}
18204 	} else {
18205 		vm_map_t        oldmap;
18206 
18207 		/* take on the identity of the target map while doing */
18208 		/* the transfer */
18209 
18210 		vm_map_reference(map);
18211 		oldmap = vm_map_switch(map);
18212 		if (copyout(src_p, dst_addr, size)) {
18213 			kr = KERN_INVALID_ADDRESS;
18214 		}
18215 		vm_map_switch(oldmap);
18216 		vm_map_deallocate(map);
18217 	}
18218 	return kr;
18219 }
18220 
18221 /*
18222  *	Routine:	vm_map_read_user
18223  *
18224  *	Description:
18225  *		Copy in data from a user space source map into the
18226  *		kernel map. The space must already exist in the
18227  *		kernel map.
18228  *		NOTE:  This routine should only be called by threads
18229  *		which can block on a page fault. i.e. kernel mode user
18230  *		threads.
18231  *
18232  */
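/*
 * Illustrative sketch, mirroring vm_map_write_user() above (hypothetical
 * names, not a real call site):
 *
 *	kr = vm_map_read_user(task_map, user_src, kbuf, sizeof(kbuf));
 */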
18233 kern_return_t
18234 vm_map_read_user(
18235 	vm_map_t                map,
18236 	vm_map_address_t        src_addr,
18237 	void                    *dst_p,
18238 	vm_size_t               size)
18239 {
18240 	kern_return_t   kr = KERN_SUCCESS;
18241 
18242 	if (current_map() == map) {
18243 		if (copyin(src_addr, dst_p, size)) {
18244 			kr = KERN_INVALID_ADDRESS;
18245 		}
18246 	} else {
18247 		vm_map_t        oldmap;
18248 
18249 		/* take on the identity of the target map while doing */
18250 		/* the transfer */
18251 
18252 		vm_map_reference(map);
18253 		oldmap = vm_map_switch(map);
18254 		if (copyin(src_addr, dst_p, size)) {
18255 			kr = KERN_INVALID_ADDRESS;
18256 		}
18257 		vm_map_switch(oldmap);
18258 		vm_map_deallocate(map);
18259 	}
18260 	return kr;
18261 }
18262 
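/*
 * Illustrative sketch (not part of the original source): how a kernel-mode
 * thread that is allowed to take page faults might use vm_map_read_user()
 * and vm_map_write_user() against another task's map.  The function name
 * "example_copy_user_word" and the "target_map"/"user_src"/"user_dst"
 * parameters are hypothetical; when the target is not the current map, the
 * two routines temporarily take a map reference and switch identity, as the
 * code above shows.
 */
#if 0 /* example only, not compiled */
static kern_return_t
example_copy_user_word(
	vm_map_t                target_map,
	vm_map_address_t        user_src,
	vm_map_address_t        user_dst)
{
	uint64_t        value;
	kern_return_t   kr;

	kr = vm_map_read_user(target_map, user_src, &value, sizeof(value));
	if (kr != KERN_SUCCESS) {
		return kr;      /* KERN_INVALID_ADDRESS if the copyin faulted */
	}
	return vm_map_write_user(target_map, &value, user_dst, sizeof(value));
}
#endif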
18263 
18264 /*
18265  *	vm_map_check_protection:
18266  *
18267  *	Assert that the target map allows the specified
18268  *	privilege on the entire address region given.
18269  *	The entire region must be allocated.
18270  */
18271 boolean_t
18272 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
18273     vm_map_offset_t end, vm_prot_t protection)
18274 {
18275 	vm_map_entry_t entry;
18276 	vm_map_entry_t tmp_entry;
18277 
18278 	vm_map_lock(map);
18279 
18280 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
18281 		vm_map_unlock(map);
18282 		return FALSE;
18283 	}
18284 
18285 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
18286 		vm_map_unlock(map);
18287 		return FALSE;
18288 	}
18289 
18290 	entry = tmp_entry;
18291 
18292 	while (start < end) {
18293 		if (entry == vm_map_to_entry(map)) {
18294 			vm_map_unlock(map);
18295 			return FALSE;
18296 		}
18297 
18298 		/*
18299 		 *	No holes allowed!
18300 		 */
18301 
18302 		if (start < entry->vme_start) {
18303 			vm_map_unlock(map);
18304 			return FALSE;
18305 		}
18306 
18307 		/*
18308 		 * Check protection associated with entry.
18309 		 */
18310 
18311 		if ((entry->protection & protection) != protection) {
18312 			vm_map_unlock(map);
18313 			return FALSE;
18314 		}
18315 
18316 		/* go to next entry */
18317 
18318 		start = entry->vme_end;
18319 		entry = entry->vme_next;
18320 	}
18321 	vm_map_unlock(map);
18322 	return TRUE;
18323 }
18324 
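/*
 * Illustrative sketch (not part of the original source): using
 * vm_map_check_protection() to verify that an entire, fully allocated range
 * is readable and writable before operating on it.  The function name
 * "example_require_rw" and its parameters are hypothetical.
 */
#if 0 /* example only, not compiled */
static kern_return_t
example_require_rw(vm_map_t map, vm_map_offset_t start, vm_map_offset_t end)
{
	if (!vm_map_check_protection(map, start, end,
	    VM_PROT_READ | VM_PROT_WRITE)) {
		/* a hole, or an entry without R/W access, was found */
		return KERN_PROTECTION_FAILURE;
	}
	return KERN_SUCCESS;
}
#endif
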
18325 kern_return_t
18326 vm_map_purgable_control(
18327 	vm_map_t                map,
18328 	vm_map_offset_t         address,
18329 	vm_purgable_t           control,
18330 	int                     *state)
18331 {
18332 	vm_map_entry_t          entry;
18333 	vm_object_t             object;
18334 	kern_return_t           kr;
18335 	boolean_t               was_nonvolatile;
18336 
18337 	/*
18338 	 * Vet all the input parameters and current type and state of the
18339 	 * underlying object.  Return with an error if anything is amiss.
18340 	 */
18341 	if (map == VM_MAP_NULL) {
18342 		return KERN_INVALID_ARGUMENT;
18343 	}
18344 
18345 	if (control != VM_PURGABLE_SET_STATE &&
18346 	    control != VM_PURGABLE_GET_STATE &&
18347 	    control != VM_PURGABLE_PURGE_ALL &&
18348 	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
18349 		return KERN_INVALID_ARGUMENT;
18350 	}
18351 
18352 	if (control == VM_PURGABLE_PURGE_ALL) {
18353 		vm_purgeable_object_purge_all();
18354 		return KERN_SUCCESS;
18355 	}
18356 
18357 	if ((control == VM_PURGABLE_SET_STATE ||
18358 	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
18359 	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
18360 	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
18361 		return KERN_INVALID_ARGUMENT;
18362 	}
18363 
18364 	vm_map_lock_read(map);
18365 
18366 	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
18367 		/*
18368 		 * Must pass a valid non-submap address.
18369 		 */
18370 		vm_map_unlock_read(map);
18371 		return KERN_INVALID_ADDRESS;
18372 	}
18373 
18374 	if ((entry->protection & VM_PROT_WRITE) == 0 &&
18375 	    control != VM_PURGABLE_GET_STATE) {
18376 		/*
18377 		 * Can't apply purgable controls to something you can't write.
18378 		 */
18379 		vm_map_unlock_read(map);
18380 		return KERN_PROTECTION_FAILURE;
18381 	}
18382 
18383 	object = VME_OBJECT(entry);
18384 	if (object == VM_OBJECT_NULL ||
18385 	    object->purgable == VM_PURGABLE_DENY) {
18386 		/*
18387 		 * Object must already be present and be purgeable.
18388 		 */
18389 		vm_map_unlock_read(map);
18390 		return KERN_INVALID_ARGUMENT;
18391 	}
18392 
18393 	vm_object_lock(object);
18394 
18395 #if 00
18396 	if (VME_OFFSET(entry) != 0 ||
18397 	    entry->vme_end - entry->vme_start != object->vo_size) {
18398 		/*
18399 		 * Can only apply purgable controls to the whole (existing)
18400 		 * object at once.
18401 		 */
18402 		vm_map_unlock_read(map);
18403 		vm_object_unlock(object);
18404 		return KERN_INVALID_ARGUMENT;
18405 	}
18406 #endif
18407 
18408 	assert(!entry->is_sub_map);
18409 	assert(!entry->use_pmap); /* purgeable has its own accounting */
18410 
18411 	vm_map_unlock_read(map);
18412 
18413 	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
18414 
18415 	kr = vm_object_purgable_control(object, control, state);
18416 
18417 	if (was_nonvolatile &&
18418 	    object->purgable != VM_PURGABLE_NONVOLATILE &&
18419 	    map->pmap == kernel_pmap) {
18420 #if DEBUG
18421 		object->vo_purgeable_volatilizer = kernel_task;
18422 #endif /* DEBUG */
18423 	}
18424 
18425 	vm_object_unlock(object);
18426 
18427 	return kr;
18428 }
18429 
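/*
 * Illustrative sketch (not part of the original source): querying and then
 * volatilizing a purgeable region with vm_map_purgable_control().  The
 * function name "example_make_volatile" and its parameters are hypothetical;
 * "addr" must fall inside a writable entry backed by a purgeable object, as
 * the checks above require.
 */
#if 0 /* example only, not compiled */
static kern_return_t
example_make_volatile(vm_map_t map, vm_map_offset_t addr)
{
	int             state = 0;
	kern_return_t   kr;

	kr = vm_map_purgable_control(map, addr, VM_PURGABLE_GET_STATE, &state);
	if (kr == KERN_SUCCESS && state == VM_PURGABLE_NONVOLATILE) {
		state = VM_PURGABLE_VOLATILE;
		kr = vm_map_purgable_control(map, addr,
		    VM_PURGABLE_SET_STATE, &state);
	}
	return kr;
}
#endif
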
18430 void
18431 vm_map_footprint_query_page_info(
18432 	vm_map_t        map,
18433 	vm_map_entry_t  map_entry,
18434 	vm_map_offset_t curr_s_offset,
18435 	int             *disposition_p)
18436 {
18437 	int             pmap_disp;
18438 	vm_object_t     object = VM_OBJECT_NULL;
18439 	int             disposition;
18440 	int             effective_page_size;
18441 
18442 	vm_map_lock_assert_held(map);
18443 	assert(!map->has_corpse_footprint);
18444 	assert(curr_s_offset >= map_entry->vme_start);
18445 	assert(curr_s_offset < map_entry->vme_end);
18446 
18447 	if (map_entry->is_sub_map) {
18448 		if (!map_entry->use_pmap) {
18449 			/* nested pmap: no footprint */
18450 			*disposition_p = 0;
18451 			return;
18452 		}
18453 	} else {
18454 		object = VME_OBJECT(map_entry);
18455 		if (object == VM_OBJECT_NULL) {
18456 			/* nothing mapped here: no need to ask */
18457 			*disposition_p = 0;
18458 			return;
18459 		}
18460 	}
18461 
18462 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
18463 
18464 	pmap_disp = 0;
18465 
18466 	/*
18467 	 * Query the pmap.
18468 	 */
18469 	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
18470 
18471 	/*
18472 	 * Compute this page's disposition.
18473 	 */
18474 	disposition = 0;
18475 
18476 	/* deal with "alternate accounting" first */
18477 	if (!map_entry->is_sub_map &&
18478 	    object->vo_no_footprint) {
18479 		/* does not count in footprint */
18480 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18481 	} else if (!map_entry->is_sub_map &&
18482 	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
18483 	    (object->purgable == VM_PURGABLE_DENY &&
18484 	    object->vo_ledger_tag)) &&
18485 	    VM_OBJECT_OWNER(object) != NULL &&
18486 	    VM_OBJECT_OWNER(object)->map == map) {
18487 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18488 		if ((((curr_s_offset
18489 		    - map_entry->vme_start
18490 		    + VME_OFFSET(map_entry))
18491 		    / effective_page_size) <
18492 		    (object->resident_page_count +
18493 		    vm_compressor_pager_get_count(object->pager)))) {
18494 			/*
18495 			 * Non-volatile purgeable object owned
18496 			 * by this task: report the first
18497 			 * "#resident + #compressed" pages as
18498 			 * "resident" (to show that they
18499 			 * contribute to the footprint) but not
18500 			 * "dirty" (to avoid double-counting
18501 			 * with the fake "non-volatile" region
18502 			 * we'll report at the end of the
18503 			 * address space to account for all
18504 			 * (mapped or not) non-volatile memory
18505 			 * owned by this task.
18506 			 */
18507 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18508 		}
18509 	} else if (!map_entry->is_sub_map &&
18510 	    (object->purgable == VM_PURGABLE_VOLATILE ||
18511 	    object->purgable == VM_PURGABLE_EMPTY) &&
18512 	    VM_OBJECT_OWNER(object) != NULL &&
18513 	    VM_OBJECT_OWNER(object)->map == map) {
18514 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18515 		if ((((curr_s_offset
18516 		    - map_entry->vme_start
18517 		    + VME_OFFSET(map_entry))
18518 		    / effective_page_size) <
18519 		    object->wired_page_count)) {
18520 			/*
18521 			 * Volatile|empty purgeable object owned
18522 			 * by this task: report the first
18523 			 * "#wired" pages as "resident" (to
18524 			 * show that they contribute to the
18525 			 * footprint) but not "dirty" (to avoid
18526 			 * double-counting with the fake
18527 			 * "non-volatile" region we'll report
18528 			 * at the end of the address space to
18529 			 * account for all (mapped or not)
18530 			 * non-volatile memory owned by this
18531 			 * task.
18532 			 */
18533 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18534 		}
18535 	} else if (!map_entry->is_sub_map &&
18536 	    map_entry->iokit_acct &&
18537 	    object->internal &&
18538 	    object->purgable == VM_PURGABLE_DENY) {
18539 		/*
18540 		 * Non-purgeable IOKit memory: phys_footprint
18541 		 * includes the entire virtual mapping.
18542 		 */
18543 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18544 		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18545 		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
18546 	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
18547 	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
18548 		/* alternate accounting */
18549 #if (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG)
18550 		if (map->pmap->footprint_was_suspended) {
18551 			/*
18552 			 * The assertion below can fail if dyld
18553 			 * suspended footprint accounting
18554 			 * while doing some adjustments to
18555 			 * this page;  the mapping would say
18556 			 * "use pmap accounting" but the page
18557 			 * would be marked "alternate
18558 			 * accounting".
18559 			 */
18560 		} else
18561 #endif /* (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG) */
18562 		{
18563 			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18564 		}
18565 		disposition = 0;
18566 	} else {
18567 		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
18568 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18569 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18570 			disposition |= VM_PAGE_QUERY_PAGE_REF;
18571 			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
18572 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
18573 			} else {
18574 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
18575 			}
18576 			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
18577 				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
18578 			}
18579 		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
18580 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18581 			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
18582 		}
18583 	}
18584 
18585 	*disposition_p = disposition;
18586 }
18587 
18588 kern_return_t
18589 vm_map_page_query_internal(
18590 	vm_map_t        target_map,
18591 	vm_map_offset_t offset,
18592 	int             *disposition,
18593 	int             *ref_count)
18594 {
18595 	kern_return_t                   kr;
18596 	vm_page_info_basic_data_t       info;
18597 	mach_msg_type_number_t          count;
18598 
18599 	count = VM_PAGE_INFO_BASIC_COUNT;
18600 	kr = vm_map_page_info(target_map,
18601 	    offset,
18602 	    VM_PAGE_INFO_BASIC,
18603 	    (vm_page_info_t) &info,
18604 	    &count);
18605 	if (kr == KERN_SUCCESS) {
18606 		*disposition = info.disposition;
18607 		*ref_count = info.ref_count;
18608 	} else {
18609 		*disposition = 0;
18610 		*ref_count = 0;
18611 	}
18612 
18613 	return kr;
18614 }
18615 
18616 kern_return_t
18617 vm_map_page_info(
18618 	vm_map_t                map,
18619 	vm_map_offset_t         offset,
18620 	vm_page_info_flavor_t   flavor,
18621 	vm_page_info_t          info,
18622 	mach_msg_type_number_t  *count)
18623 {
18624 	return vm_map_page_range_info_internal(map,
18625 	           offset, /* start of range */
18626 	           (offset + 1), /* this will get rounded in the call to the page boundary */
18627 	           (int)-1, /* effective_page_shift: unspecified */
18628 	           flavor,
18629 	           info,
18630 	           count);
18631 }
18632 
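/*
 * Illustrative sketch (not part of the original source): asking for the
 * VM_PAGE_INFO_BASIC flavor for a single address, mirroring what
 * vm_map_page_query_internal() does above.  The function name
 * "example_page_is_resident" and its parameters are hypothetical.
 */
#if 0 /* example only, not compiled */
static boolean_t
example_page_is_resident(vm_map_t map, vm_map_offset_t addr)
{
	vm_page_info_basic_data_t       info;
	mach_msg_type_number_t          count = VM_PAGE_INFO_BASIC_COUNT;

	if (vm_map_page_info(map, addr, VM_PAGE_INFO_BASIC,
	    (vm_page_info_t)&info, &count) != KERN_SUCCESS) {
		return FALSE;
	}
	return (info.disposition & VM_PAGE_QUERY_PAGE_PRESENT) ? TRUE : FALSE;
}
#endif
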
18633 kern_return_t
18634 vm_map_page_range_info_internal(
18635 	vm_map_t                map,
18636 	vm_map_offset_t         start_offset,
18637 	vm_map_offset_t         end_offset,
18638 	int                     effective_page_shift,
18639 	vm_page_info_flavor_t   flavor,
18640 	vm_page_info_t          info,
18641 	mach_msg_type_number_t  *count)
18642 {
18643 	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
18644 	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
18645 	vm_page_t               m = VM_PAGE_NULL;
18646 	kern_return_t           retval = KERN_SUCCESS;
18647 	int                     disposition = 0;
18648 	int                     ref_count = 0;
18649 	int                     depth = 0, info_idx = 0;
18650 	vm_page_info_basic_t    basic_info = 0;
18651 	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
18652 	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
18653 	boolean_t               do_region_footprint;
18654 	ledger_amount_t         ledger_resident, ledger_compressed;
18655 	int                     effective_page_size;
18656 	vm_map_offset_t         effective_page_mask;
18657 
18658 	switch (flavor) {
18659 	case VM_PAGE_INFO_BASIC:
18660 		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
18661 			/*
18662 			 * The "vm_page_info_basic_data" structure was not
18663 			 * properly padded, so allow the size to be off by
18664 			 * one to maintain backwards binary compatibility...
18665 			 */
18666 			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
18667 				return KERN_INVALID_ARGUMENT;
18668 			}
18669 		}
18670 		break;
18671 	default:
18672 		return KERN_INVALID_ARGUMENT;
18673 	}
18674 
18675 	if (effective_page_shift == -1) {
18676 		effective_page_shift = vm_self_region_page_shift_safely(map);
18677 		if (effective_page_shift == -1) {
18678 			return KERN_INVALID_ARGUMENT;
18679 		}
18680 	}
18681 	effective_page_size = (1 << effective_page_shift);
18682 	effective_page_mask = effective_page_size - 1;
18683 
18684 	do_region_footprint = task_self_region_footprint();
18685 	disposition = 0;
18686 	ref_count = 0;
18687 	depth = 0;
18688 	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
18689 	retval = KERN_SUCCESS;
18690 
18691 	offset_in_page = start_offset & effective_page_mask;
18692 	start = vm_map_trunc_page(start_offset, effective_page_mask);
18693 	end = vm_map_round_page(end_offset, effective_page_mask);
18694 
18695 	if (end < start) {
18696 		return KERN_INVALID_ARGUMENT;
18697 	}
18698 
18699 	assert((end - start) <= MAX_PAGE_RANGE_QUERY);
18700 
18701 	vm_map_lock_read(map);
18702 
18703 	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
18704 
18705 	for (curr_s_offset = start; curr_s_offset < end;) {
18706 		/*
18707 		 * New lookup needs reset of these variables.
18708 		 */
18709 		curr_object = object = VM_OBJECT_NULL;
18710 		offset_in_object = 0;
18711 		ref_count = 0;
18712 		depth = 0;
18713 
18714 		if (do_region_footprint &&
18715 		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
18716 			/*
18717 			 * Request for "footprint" info about a page beyond
18718 			 * the end of address space: this must be for
18719 			 * the fake region vm_map_region_recurse_64()
18720 			 * reported to account for non-volatile purgeable
18721 			 * memory owned by this task.
18722 			 */
18723 			disposition = 0;
18724 
18725 			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
18726 			    (unsigned) ledger_compressed) {
18727 				/*
18728 				 * We haven't reported all the "non-volatile
18729 				 * compressed" pages yet, so report this fake
18730 				 * page as "compressed".
18731 				 */
18732 				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
18733 			} else {
18734 				/*
18735 				 * We've reported all the non-volatile
18736 				 * compressed pages but not all the non-volatile
18737 				 * pages, so report this fake page as
18738 				 * "resident dirty".
18739 				 */
18740 				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18741 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
18742 				disposition |= VM_PAGE_QUERY_PAGE_REF;
18743 			}
18744 			switch (flavor) {
18745 			case VM_PAGE_INFO_BASIC:
18746 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
18747 				basic_info->disposition = disposition;
18748 				basic_info->ref_count = 1;
18749 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
18750 				basic_info->offset = 0;
18751 				basic_info->depth = 0;
18752 
18753 				info_idx++;
18754 				break;
18755 			}
18756 			curr_s_offset += effective_page_size;
18757 			continue;
18758 		}
18759 
18760 		/*
18761 		 * First, find the map entry covering "curr_s_offset", going down
18762 		 * submaps if necessary.
18763 		 */
18764 		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
18765 			/* no entry -> no object -> no page */
18766 
18767 			if (curr_s_offset < vm_map_min(map)) {
18768 				/*
18769 				 * Illegal address that falls below map min.
18770 				 */
18771 				curr_e_offset = MIN(end, vm_map_min(map));
18772 			} else if (curr_s_offset >= vm_map_max(map)) {
18773 				/*
18774 				 * Illegal address that falls on/after map max.
18775 				 */
18776 				curr_e_offset = end;
18777 			} else if (map_entry == vm_map_to_entry(map)) {
18778 				/*
18779 				 * Hit a hole.
18780 				 */
18781 				if (map_entry->vme_next == vm_map_to_entry(map)) {
18782 					/*
18783 					 * Empty map.
18784 					 */
18785 					curr_e_offset = MIN(map->max_offset, end);
18786 				} else {
18787 					/*
18788 					 * Hole at start of the map.
18789 					 */
18790 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
18791 				}
18792 			} else {
18793 				if (map_entry->vme_next == vm_map_to_entry(map)) {
18794 					/*
18795 					 * Hole at the end of the map.
18796 					 */
18797 					curr_e_offset = MIN(map->max_offset, end);
18798 				} else {
18799 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
18800 				}
18801 			}
18802 
18803 			assert(curr_e_offset >= curr_s_offset);
18804 
18805 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
18806 
18807 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
18808 
18809 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
18810 
18811 			curr_s_offset = curr_e_offset;
18812 
18813 			info_idx += num_pages;
18814 
18815 			continue;
18816 		}
18817 
18818 		/* compute offset from this map entry's start */
18819 		offset_in_object = curr_s_offset - map_entry->vme_start;
18820 
18821 		/* compute offset into this map entry's object (or submap) */
18822 		offset_in_object += VME_OFFSET(map_entry);
18823 
18824 		if (map_entry->is_sub_map) {
18825 			vm_map_t sub_map = VM_MAP_NULL;
18826 			vm_page_info_t submap_info = 0;
18827 			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
18828 
18829 			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
18830 
18831 			submap_s_offset = offset_in_object;
18832 			submap_e_offset = submap_s_offset + range_len;
18833 
18834 			sub_map = VME_SUBMAP(map_entry);
18835 
18836 			vm_map_reference(sub_map);
18837 			vm_map_unlock_read(map);
18838 
18839 			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
18840 
18841 			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
18842 			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
18843 
18844 			retval = vm_map_page_range_info_internal(sub_map,
18845 			    submap_s_offset,
18846 			    submap_e_offset,
18847 			    effective_page_shift,
18848 			    VM_PAGE_INFO_BASIC,
18849 			    (vm_page_info_t) submap_info,
18850 			    count);
18851 
18852 			assert(retval == KERN_SUCCESS);
18853 
18854 			vm_map_lock_read(map);
18855 			vm_map_deallocate(sub_map);
18856 
18857 			/* Move the "info" index by the number of pages we inspected.*/
18858 			info_idx += range_len >> effective_page_shift;
18859 
18860 			/* Move our current offset by the size of the range we inspected.*/
18861 			curr_s_offset += range_len;
18862 
18863 			continue;
18864 		}
18865 
18866 		object = VME_OBJECT(map_entry);
18867 
18868 		if (object == VM_OBJECT_NULL) {
18869 			/*
18870 			 * We don't have an object here and, hence,
18871 			 * no pages to inspect. We'll fill up the
18872 			 * info structure appropriately.
18873 			 */
18874 
18875 			curr_e_offset = MIN(map_entry->vme_end, end);
18876 
18877 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
18878 
18879 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
18880 
18881 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
18882 
18883 			curr_s_offset = curr_e_offset;
18884 
18885 			info_idx += num_pages;
18886 
18887 			continue;
18888 		}
18889 
18890 		if (do_region_footprint) {
18891 			disposition = 0;
18892 			if (map->has_corpse_footprint) {
18893 				/*
18894 				 * Query the page info data we saved
18895 				 * while forking the corpse.
18896 				 */
18897 				vm_map_corpse_footprint_query_page_info(
18898 					map,
18899 					curr_s_offset,
18900 					&disposition);
18901 			} else {
18902 				/*
18903 				 * Query the live pmap for footprint info
18904 				 * about this page.
18905 				 */
18906 				vm_map_footprint_query_page_info(
18907 					map,
18908 					map_entry,
18909 					curr_s_offset,
18910 					&disposition);
18911 			}
18912 			switch (flavor) {
18913 			case VM_PAGE_INFO_BASIC:
18914 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
18915 				basic_info->disposition = disposition;
18916 				basic_info->ref_count = 1;
18917 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
18918 				basic_info->offset = 0;
18919 				basic_info->depth = 0;
18920 
18921 				info_idx++;
18922 				break;
18923 			}
18924 			curr_s_offset += effective_page_size;
18925 			continue;
18926 		}
18927 
18928 		vm_object_reference(object);
18929 		/*
18930 		 * Shared mode -- so we can allow other readers
18931 		 * to grab the lock too.
18932 		 */
18933 		vm_object_lock_shared(object);
18934 
18935 		curr_e_offset = MIN(map_entry->vme_end, end);
18936 
18937 		vm_map_unlock_read(map);
18938 
18939 		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
18940 
18941 		curr_object = object;
18942 
18943 		for (; curr_s_offset < curr_e_offset;) {
18944 			if (object == curr_object) {
18945 				ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
18946 			} else {
18947 				ref_count = curr_object->ref_count;
18948 			}
18949 
18950 			curr_offset_in_object = offset_in_object;
18951 
18952 			for (;;) {
18953 				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
18954 
18955 				if (m != VM_PAGE_NULL) {
18956 					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18957 					break;
18958 				} else {
18959 					if (curr_object->internal &&
18960 					    curr_object->alive &&
18961 					    !curr_object->terminating &&
18962 					    curr_object->pager_ready) {
18963 						if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object))
18964 						    == VM_EXTERNAL_STATE_EXISTS) {
18965 							/* the pager has that page */
18966 							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
18967 							break;
18968 						}
18969 					}
18970 
18971 					/*
18972 					 * Go down the VM object shadow chain until we find the page
18973 					 * we're looking for.
18974 					 */
18975 
18976 					if (curr_object->shadow != VM_OBJECT_NULL) {
18977 						vm_object_t shadow = VM_OBJECT_NULL;
18978 
18979 						curr_offset_in_object += curr_object->vo_shadow_offset;
18980 						shadow = curr_object->shadow;
18981 
18982 						vm_object_lock_shared(shadow);
18983 						vm_object_unlock(curr_object);
18984 
18985 						curr_object = shadow;
18986 						depth++;
18987 						continue;
18988 					} else {
18989 						break;
18990 					}
18991 				}
18992 			}
18993 
18994 			/* The ref_count is not strictly accurate, it measures the number   */
18995 			/* of entities holding a ref on the object, they may not be mapping */
18996 			/* the object or may not be mapping the section holding the         */
18997 			/* target page but its still a ball park number and though an over- */
18998 			/* target page but it's still a ballpark number and though an over- */
18999 
19000 			/* We could also get a picture of page sharing from pmap_attributes */
19001 			/* but this would under count as only faulted-in mappings would     */
19002 			/* show up.							    */
19003 
19004 			if ((curr_object == object) && curr_object->shadow) {
19005 				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
19006 			}
19007 
19008 			if (!curr_object->internal) {
19009 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
19010 			}
19011 
19012 			if (m != VM_PAGE_NULL) {
19013 				if (m->vmp_fictitious) {
19014 					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
19015 				} else {
19016 					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
19017 						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19018 					}
19019 
19020 					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
19021 						disposition |= VM_PAGE_QUERY_PAGE_REF;
19022 					}
19023 
19024 					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
19025 						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
19026 					}
19027 
19028 					/*
19029 					 * XXX TODO4K:
19030 					 * when this routine deals with 4k
19031 					 * pages, check the appropriate CS bit
19032 					 * here.
19033 					 */
19034 					if (m->vmp_cs_validated) {
19035 						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
19036 					}
19037 					if (m->vmp_cs_tainted) {
19038 						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
19039 					}
19040 					if (m->vmp_cs_nx) {
19041 						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
19042 					}
19043 					if (m->vmp_reusable || curr_object->all_reusable) {
19044 						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
19045 					}
19046 				}
19047 			}
19048 
19049 			switch (flavor) {
19050 			case VM_PAGE_INFO_BASIC:
19051 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19052 				basic_info->disposition = disposition;
19053 				basic_info->ref_count = ref_count;
19054 				basic_info->object_id = (vm_object_id_t) (uintptr_t)
19055 				    VM_KERNEL_ADDRPERM(curr_object);
19056 				basic_info->offset =
19057 				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
19058 				basic_info->depth = depth;
19059 
19060 				info_idx++;
19061 				break;
19062 			}
19063 
19064 			disposition = 0;
19065 			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
19066 
19067 			/*
19068 			 * Move to next offset in the range and in our object.
19069 			 */
19070 			curr_s_offset += effective_page_size;
19071 			offset_in_object += effective_page_size;
19072 			curr_offset_in_object = offset_in_object;
19073 
19074 			if (curr_object != object) {
19075 				vm_object_unlock(curr_object);
19076 
19077 				curr_object = object;
19078 
19079 				vm_object_lock_shared(curr_object);
19080 			} else {
19081 				vm_object_lock_yield_shared(curr_object);
19082 			}
19083 		}
19084 
19085 		vm_object_unlock(curr_object);
19086 		vm_object_deallocate(curr_object);
19087 
19088 		vm_map_lock_read(map);
19089 	}
19090 
19091 	vm_map_unlock_read(map);
19092 	return retval;
19093 }
19094 
19095 /*
19096  *	vm_map_msync
19097  *
19098  *	Synchronises the memory range specified with its backing store
19099  *	image by either flushing or cleaning the contents to the appropriate
19100  *	memory manager engaging in a memory object synchronize dialog with
19101  *	the manager.  The client doesn't return until the manager issues
19102  *	m_o_s_completed message.  MIG Magically converts user task parameter
19103  *	the m_o_s_completed message.  MIG magically converts the user task parameter
19104  *
19105  *	interpretation of sync_flags
19106  *	VM_SYNC_INVALIDATE	- discard pages, only return precious
19107  *				  pages to manager.
19108  *
19109  *	VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
19110  *				- discard pages, write dirty or precious
19111  *				  pages back to memory manager.
19112  *
19113  *	VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
19114  *				- write dirty or precious pages back to
19115  *				  the memory manager.
19116  *
19117  *	VM_SYNC_CONTIGUOUS	- does everything normally, but if there
19118  *				  is a hole in the region, and we would
19119  *				  have returned KERN_SUCCESS, return
19120  *				  KERN_INVALID_ADDRESS instead.
19121  *
19122  *	NOTE
19123  *	The memory object attributes have not yet been implemented, this
19124  *	The memory object attributes have not yet been implemented, so this
19125  *
19126  *	RETURNS
19127  *	KERN_INVALID_TASK		Bad task parameter
19128  *	KERN_INVALID_ARGUMENT		both sync and async were specified.
19129  *	KERN_SUCCESS			The usual.
19130  *	KERN_INVALID_ADDRESS		There was a hole in the region.
19131  */
19132 
19133 kern_return_t
19134 vm_map_msync(
19135 	vm_map_t                map,
19136 	vm_map_address_t        address,
19137 	vm_map_size_t           size,
19138 	vm_sync_t               sync_flags)
19139 {
19140 	vm_map_entry_t          entry;
19141 	vm_map_size_t           amount_left;
19142 	vm_object_offset_t      offset;
19143 	vm_object_offset_t      start_offset, end_offset;
19144 	boolean_t               do_sync_req;
19145 	boolean_t               had_hole = FALSE;
19146 	vm_map_offset_t         pmap_offset;
19147 
19148 	if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
19149 	    (sync_flags & VM_SYNC_SYNCHRONOUS)) {
19150 		return KERN_INVALID_ARGUMENT;
19151 	}
19152 
19153 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19154 		DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
19155 	}
19156 
19157 	/*
19158 	 * align address and size on page boundaries
19159 	 */
19160 	size = (vm_map_round_page(address + size,
19161 	    VM_MAP_PAGE_MASK(map)) -
19162 	    vm_map_trunc_page(address,
19163 	    VM_MAP_PAGE_MASK(map)));
19164 	address = vm_map_trunc_page(address,
19165 	    VM_MAP_PAGE_MASK(map));
19166 
19167 	if (map == VM_MAP_NULL) {
19168 		return KERN_INVALID_TASK;
19169 	}
19170 
19171 	if (size == 0) {
19172 		return KERN_SUCCESS;
19173 	}
19174 
19175 	amount_left = size;
19176 
19177 	while (amount_left > 0) {
19178 		vm_object_size_t        flush_size;
19179 		vm_object_t             object;
19180 
19181 		vm_map_lock(map);
19182 		if (!vm_map_lookup_entry(map,
19183 		    address,
19184 		    &entry)) {
19185 			vm_map_size_t   skip;
19186 
19187 			/*
19188 			 * hole in the address map.
19189 			 */
19190 			had_hole = TRUE;
19191 
19192 			if (sync_flags & VM_SYNC_KILLPAGES) {
19193 				/*
19194 				 * For VM_SYNC_KILLPAGES, there should be
19195 				 * no holes in the range, since we couldn't
19196 				 * prevent someone else from allocating in
19197 				 * that hole and we wouldn't want to "kill"
19198 				 * their pages.
19199 				 */
19200 				vm_map_unlock(map);
19201 				break;
19202 			}
19203 
19204 			/*
19205 			 * Check for empty map.
19206 			 */
19207 			if (entry == vm_map_to_entry(map) &&
19208 			    entry->vme_next == entry) {
19209 				vm_map_unlock(map);
19210 				break;
19211 			}
19212 			/*
19213 			 * Check that we don't wrap and that
19214 			 * we have at least one real map entry.
19215 			 */
19216 			if ((map->hdr.nentries == 0) ||
19217 			    (entry->vme_next->vme_start < address)) {
19218 				vm_map_unlock(map);
19219 				break;
19220 			}
19221 			/*
19222 			 * Move up to the next entry if needed
19223 			 */
19224 			skip = (entry->vme_next->vme_start - address);
19225 			if (skip >= amount_left) {
19226 				amount_left = 0;
19227 			} else {
19228 				amount_left -= skip;
19229 			}
19230 			address = entry->vme_next->vme_start;
19231 			vm_map_unlock(map);
19232 			continue;
19233 		}
19234 
19235 		offset = address - entry->vme_start;
19236 		pmap_offset = address;
19237 
19238 		/*
19239 		 * do we have more to flush than is contained in this
19240 		 * entry ?
19241 		 */
19242 		if (amount_left + entry->vme_start + offset > entry->vme_end) {
19243 			flush_size = entry->vme_end -
19244 			    (entry->vme_start + offset);
19245 		} else {
19246 			flush_size = amount_left;
19247 		}
19248 		amount_left -= flush_size;
19249 		address += flush_size;
19250 
19251 		if (entry->is_sub_map == TRUE) {
19252 			vm_map_t        local_map;
19253 			vm_map_offset_t local_offset;
19254 
19255 			local_map = VME_SUBMAP(entry);
19256 			local_offset = VME_OFFSET(entry);
19257 			vm_map_reference(local_map);
19258 			vm_map_unlock(map);
19259 			if (vm_map_msync(
19260 				    local_map,
19261 				    local_offset,
19262 				    flush_size,
19263 				    sync_flags) == KERN_INVALID_ADDRESS) {
19264 				had_hole = TRUE;
19265 			}
19266 			vm_map_deallocate(local_map);
19267 			continue;
19268 		}
19269 		object = VME_OBJECT(entry);
19270 
19271 		/*
19272 		 * We can't sync this object if the object has not been
19273 		 * created yet
19274 		 */
19275 		if (object == VM_OBJECT_NULL) {
19276 			vm_map_unlock(map);
19277 			continue;
19278 		}
19279 		offset += VME_OFFSET(entry);
19280 
19281 		vm_object_lock(object);
19282 
19283 		if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
19284 			int kill_pages = 0;
19285 			boolean_t reusable_pages = FALSE;
19286 
19287 			if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19288 				/*
19289 				 * This is a destructive operation and so we
19290 				 * err on the side of limiting the range of
19291 				 * the operation.
19292 				 */
19293 				start_offset = vm_object_round_page(offset);
19294 				end_offset = vm_object_trunc_page(offset + flush_size);
19295 
19296 				if (end_offset <= start_offset) {
19297 					vm_object_unlock(object);
19298 					vm_map_unlock(map);
19299 					continue;
19300 				}
19301 
19302 				pmap_offset += start_offset - offset;
19303 			} else {
19304 				start_offset = offset;
19305 				end_offset = offset + flush_size;
19306 			}
19307 
19308 			if (sync_flags & VM_SYNC_KILLPAGES) {
19309 				if (((object->ref_count == 1) ||
19310 				    ((object->copy_strategy !=
19311 				    MEMORY_OBJECT_COPY_SYMMETRIC) &&
19312 				    (object->copy == VM_OBJECT_NULL))) &&
19313 				    (object->shadow == VM_OBJECT_NULL)) {
19314 					if (object->ref_count != 1) {
19315 						vm_page_stats_reusable.free_shared++;
19316 					}
19317 					kill_pages = 1;
19318 				} else {
19319 					kill_pages = -1;
19320 				}
19321 			}
19322 			if (kill_pages != -1) {
19323 				vm_object_deactivate_pages(
19324 					object,
19325 					start_offset,
19326 					(vm_object_size_t) (end_offset - start_offset),
19327 					kill_pages,
19328 					reusable_pages,
19329 					map->pmap,
19330 					pmap_offset);
19331 			}
19332 			vm_object_unlock(object);
19333 			vm_map_unlock(map);
19334 			continue;
19335 		}
19336 		/*
19337 		 * We can't sync this object if there isn't a pager.
19338 		 * Don't bother to sync internal objects, since there can't
19339 		 * be any "permanent" storage for these objects anyway.
19340 		 */
19341 		if ((object->pager == MEMORY_OBJECT_NULL) ||
19342 		    (object->internal) || (object->private)) {
19343 			vm_object_unlock(object);
19344 			vm_map_unlock(map);
19345 			continue;
19346 		}
19347 		/*
19348 		 * keep reference on the object until syncing is done
19349 		 */
19350 		vm_object_reference_locked(object);
19351 		vm_object_unlock(object);
19352 
19353 		vm_map_unlock(map);
19354 
19355 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19356 			start_offset = vm_object_trunc_page(offset);
19357 			end_offset = vm_object_round_page(offset + flush_size);
19358 		} else {
19359 			start_offset = offset;
19360 			end_offset = offset + flush_size;
19361 		}
19362 
19363 		do_sync_req = vm_object_sync(object,
19364 		    start_offset,
19365 		    (end_offset - start_offset),
19366 		    sync_flags & VM_SYNC_INVALIDATE,
19367 		    ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
19368 		    (sync_flags & VM_SYNC_ASYNCHRONOUS)),
19369 		    sync_flags & VM_SYNC_SYNCHRONOUS);
19370 
19371 		if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
19372 			/*
19373 			 * clear out the clustering and read-ahead hints
19374 			 */
19375 			vm_object_lock(object);
19376 
19377 			object->pages_created = 0;
19378 			object->pages_used = 0;
19379 			object->sequential = 0;
19380 			object->last_alloc = 0;
19381 
19382 			vm_object_unlock(object);
19383 		}
19384 		vm_object_deallocate(object);
19385 	} /* while */
19386 
19387 	/* for proper msync() behaviour */
19388 	if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
19389 		return KERN_INVALID_ADDRESS;
19390 	}
19391 
19392 	return KERN_SUCCESS;
19393 }/* vm_msync */
19394 
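/*
 * Illustrative sketch (not part of the original source): synchronously
 * writing a range back to its pager and discarding the cached pages, i.e.
 * the msync(MS_SYNC | MS_INVALIDATE)-style case described in the header
 * comment above.  The function name "example_flush_range" and its parameters
 * are hypothetical.
 */
#if 0 /* example only, not compiled */
static kern_return_t
example_flush_range(vm_map_t map, vm_map_address_t addr, vm_map_size_t len)
{
	/*
	 * Write dirty/precious pages back synchronously and discard the
	 * cached copies; with VM_SYNC_CONTIGUOUS, a hole in the range turns
	 * an otherwise successful call into KERN_INVALID_ADDRESS.
	 */
	return vm_map_msync(map, addr, len,
	    VM_SYNC_SYNCHRONOUS | VM_SYNC_INVALIDATE | VM_SYNC_CONTIGUOUS);
}
#endif
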
19395 void
19396 vm_named_entry_associate_vm_object(
19397 	vm_named_entry_t        named_entry,
19398 	vm_object_t             object,
19399 	vm_object_offset_t      offset,
19400 	vm_object_size_t        size,
19401 	vm_prot_t               prot)
19402 {
19403 	vm_map_copy_t copy;
19404 	vm_map_entry_t copy_entry;
19405 
19406 	assert(!named_entry->is_sub_map);
19407 	assert(!named_entry->is_copy);
19408 	assert(!named_entry->is_object);
19409 	assert(!named_entry->internal);
19410 	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
19411 
19412 	copy = vm_map_copy_allocate();
19413 	copy->type = VM_MAP_COPY_ENTRY_LIST;
19414 	copy->offset = offset;
19415 	copy->size = size;
19416 	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
19417 	vm_map_store_init(&copy->cpy_hdr);
19418 
19419 	copy_entry = vm_map_copy_entry_create(copy);
19420 	copy_entry->protection = prot;
19421 	copy_entry->max_protection = prot;
19422 	copy_entry->use_pmap = TRUE;
19423 	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
19424 	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
19425 	VME_OBJECT_SET(copy_entry, object, false, 0);
19426 	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
19427 	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
19428 
19429 	named_entry->backing.copy = copy;
19430 	named_entry->is_object = TRUE;
19431 	if (object->internal) {
19432 		named_entry->internal = TRUE;
19433 	}
19434 
19435 	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
19436 	    named_entry, copy, object, offset, size, prot);
19437 }
19438 
19439 vm_object_t
19440 vm_named_entry_to_vm_object(
19441 	vm_named_entry_t named_entry)
19442 {
19443 	vm_map_copy_t   copy;
19444 	vm_map_entry_t  copy_entry;
19445 	vm_object_t     object;
19446 
19447 	assert(!named_entry->is_sub_map);
19448 	assert(!named_entry->is_copy);
19449 	assert(named_entry->is_object);
19450 	copy = named_entry->backing.copy;
19451 	assert(copy != VM_MAP_COPY_NULL);
19452 	/*
19453 	 * Assert that the vm_map_copy is coming from the right
19454 	 * zone and hasn't been forged
19455 	 */
19456 	vm_map_copy_require(copy);
19457 	assert(copy->cpy_hdr.nentries == 1);
19458 	copy_entry = vm_map_copy_first_entry(copy);
19459 	object = VME_OBJECT(copy_entry);
19460 
19461 	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
19462 
19463 	return object;
19464 }
19465 
19466 /*
19467  *	Routine:	convert_port_entry_to_map
19468  *	Purpose:
19469  *		Convert from a port specifying an entry or a task
19470  *		to a map. Doesn't consume the port ref; produces a map ref,
19471  *		which may be null.  Unlike convert_port_to_map, the
19472  *		port may be task or a named entry backed.
19473  *		port may be a task port or a named-entry-backed port.
19474  *		Nothing locked.
19475  */
19476 
19477 vm_map_t
19478 convert_port_entry_to_map(
19479 	ipc_port_t      port)
19480 {
19481 	vm_map_t map = VM_MAP_NULL;
19482 	vm_named_entry_t named_entry;
19483 
19484 	if (!IP_VALID(port)) {
19485 		return VM_MAP_NULL;
19486 	}
19487 
19488 	if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
19489 		return convert_port_to_map(port);
19490 	}
19491 
19492 	named_entry = mach_memory_entry_from_port(port);
19493 
19494 	if ((named_entry->is_sub_map) &&
19495 	    (named_entry->protection & VM_PROT_WRITE)) {
19496 		map = named_entry->backing.map;
19497 		if (map->pmap != PMAP_NULL) {
19498 			if (map->pmap == kernel_pmap) {
19499 				panic("userspace has access "
19500 				    "to a kernel map %p", map);
19501 			}
19502 			pmap_require(map->pmap);
19503 		}
19504 		vm_map_reference(map);
19505 	}
19506 
19507 	return map;
19508 }
19509 
19510 /*
19511  * Export routines to other components for the things we access locally through
19512  * macros.
19513  */
19514 #undef current_map
19515 vm_map_t
19516 current_map(void)
19517 {
19518 	return current_map_fast();
19519 }
19520 
19521 /*
19522  *	vm_map_reference:
19523  *
19524  *	Takes a reference on the specified map.
19525  */
19526 void
19527 vm_map_reference(
19528 	vm_map_t        map)
19529 {
19530 	if (__probable(map != VM_MAP_NULL)) {
19531 		vm_map_require(map);
19532 		os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
19533 	}
19534 }
19535 
19536 /*
19537  *	vm_map_deallocate:
19538  *
19539  *	Removes a reference from the specified map,
19540  *	destroying it if no references remain.
19541  *	The map should not be locked.
19542  */
19543 void
19544 vm_map_deallocate(
19545 	vm_map_t        map)
19546 {
19547 	if (__probable(map != VM_MAP_NULL)) {
19548 		vm_map_require(map);
19549 		if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
19550 			vm_map_destroy(map);
19551 		}
19552 	}
19553 }
19554 
19555 void
19556 vm_map_inspect_deallocate(
19557 	vm_map_inspect_t      map)
19558 {
19559 	vm_map_deallocate((vm_map_t)map);
19560 }
19561 
19562 void
19563 vm_map_read_deallocate(
19564 	vm_map_read_t      map)
19565 {
19566 	vm_map_deallocate((vm_map_t)map);
19567 }
19568 
19569 
19570 void
19571 vm_map_disable_NX(vm_map_t map)
19572 {
19573 	if (map == NULL) {
19574 		return;
19575 	}
19576 	if (map->pmap == NULL) {
19577 		return;
19578 	}
19579 
19580 	pmap_disable_NX(map->pmap);
19581 }
19582 
19583 void
19584 vm_map_disallow_data_exec(vm_map_t map)
19585 {
19586 	if (map == NULL) {
19587 		return;
19588 	}
19589 
19590 	map->map_disallow_data_exec = TRUE;
19591 }
19592 
19593 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
19594  * more descriptive.
19595  */
19596 void
19597 vm_map_set_32bit(vm_map_t map)
19598 {
19599 #if defined(__arm__) || defined(__arm64__)
19600 	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
19601 #else
19602 	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
19603 #endif
19604 }
19605 
19606 
19607 void
19608 vm_map_set_64bit(vm_map_t map)
19609 {
19610 #if defined(__arm__) || defined(__arm64__)
19611 	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
19612 #else
19613 	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
19614 #endif
19615 }
19616 
19617 /*
19618  * Expand the maximum size of an existing map to the maximum supported.
19619  */
19620 void
19621 vm_map_set_jumbo(vm_map_t map)
19622 {
19623 #if defined (__arm64__) && !defined(CONFIG_ARROW)
19624 	vm_map_set_max_addr(map, ~0);
19625 #else /* arm64 */
19626 	(void) map;
19627 #endif
19628 }
19629 
19630 /*
19631  * This map has a JIT entitlement
19632  */
19633 void
19634 vm_map_set_jit_entitled(vm_map_t map)
19635 {
19636 #if defined (__arm64__)
19637 	pmap_set_jit_entitled(map->pmap);
19638 #else /* arm64 */
19639 	(void) map;
19640 #endif
19641 }
19642 
19643 /*
19644  * Expand the maximum size of an existing map.
19645  */
19646 void
19647 vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
19648 {
19649 #if defined(__arm64__)
19650 	vm_map_offset_t max_supported_offset = 0;
19651 	vm_map_offset_t old_max_offset = map->max_offset;
19652 	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);
19653 
19654 	new_max_offset = trunc_page(new_max_offset);
19655 
19656 	/* The address space cannot be shrunk using this routine. */
19657 	if (old_max_offset >= new_max_offset) {
19658 		return;
19659 	}
19660 
19661 	if (max_supported_offset < new_max_offset) {
19662 		new_max_offset = max_supported_offset;
19663 	}
19664 
19665 	map->max_offset = new_max_offset;
19666 
19667 	if (map->holes_list->prev->vme_end == old_max_offset) {
19668 		/*
19669 		 * There is already a hole at the end of the map; simply make it bigger.
19670 		 */
19671 		map->holes_list->prev->vme_end = map->max_offset;
19672 	} else {
19673 		/*
19674 		 * There is no hole at the end, so we need to create a new hole
19675 		 * for the new empty space we're creating.
19676 		 */
19677 		struct vm_map_links *new_hole = zalloc(vm_map_holes_zone);
19678 		new_hole->start = old_max_offset;
19679 		new_hole->end = map->max_offset;
19680 		new_hole->prev = map->holes_list->prev;
19681 		new_hole->next = (struct vm_map_entry *)map->holes_list;
19682 		map->holes_list->prev->links.next = (struct vm_map_entry *)new_hole;
19683 		map->holes_list->prev = (struct vm_map_entry *)new_hole;
19684 	}
19685 #else
19686 	(void)map;
19687 	(void)new_max_offset;
19688 #endif
19689 }
19690 
19691 vm_map_offset_t
19692 vm_compute_max_offset(boolean_t is64)
19693 {
19694 #if defined(__arm__) || defined(__arm64__)
19695 	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
19696 #else
19697 	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
19698 #endif
19699 }
19700 
19701 void
19702 vm_map_get_max_aslr_slide_section(
19703 	vm_map_t                map __unused,
19704 	int64_t                 *max_sections,
19705 	int64_t                 *section_size)
19706 {
19707 #if defined(__arm64__)
19708 	*max_sections = 3;
19709 	*section_size = ARM_TT_TWIG_SIZE;
19710 #else
19711 	*max_sections = 1;
19712 	*section_size = 0;
19713 #endif
19714 }
19715 
19716 uint64_t
19717 vm_map_get_max_aslr_slide_pages(vm_map_t map)
19718 {
19719 #if defined(__arm64__)
19720 	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
19721 	 * limited embedded address space; this is also meant to minimize pmap
19722 	 * memory usage on 16KB page systems.
19723 	 */
19724 	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
19725 #else
19726 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
19727 #endif
19728 }
19729 
19730 uint64_t
19731 vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
19732 {
19733 #if defined(__arm64__)
19734 	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
19735 	 * of independent entropy on 16KB page systems.
19736 	 */
19737 	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
19738 #else
19739 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
19740 #endif
19741 }
19742 
19743 #ifndef __arm__
19744 boolean_t
19745 vm_map_is_64bit(
19746 	vm_map_t map)
19747 {
19748 	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
19749 }
19750 #endif
19751 
19752 boolean_t
19753 vm_map_has_hard_pagezero(
19754 	vm_map_t        map,
19755 	vm_map_offset_t pagezero_size)
19756 {
19757 	/*
19758 	 * XXX FBDP
19759 	 * We should lock the VM map (for read) here but we can get away
19760 	 * with it for now because there can't really be any race condition:
19761 	 * the VM map's min_offset is changed only when the VM map is created
19762 	 * and when the zero page is established (when the binary gets loaded),
19763 	 * and this routine gets called only when the task terminates and the
19764 	 * VM map is being torn down, and when a new map is created via
19765 	 * load_machfile()/execve().
19766 	 */
19767 	return map->min_offset >= pagezero_size;
19768 }
19769 
19770 /*
19771  * Raise a VM map's maximum offset.
19772  */
19773 kern_return_t
19774 vm_map_raise_max_offset(
19775 	vm_map_t        map,
19776 	vm_map_offset_t new_max_offset)
19777 {
19778 	kern_return_t   ret;
19779 
19780 	vm_map_lock(map);
19781 	ret = KERN_INVALID_ADDRESS;
19782 
19783 	if (new_max_offset >= map->max_offset) {
19784 		if (!vm_map_is_64bit(map)) {
19785 			if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
19786 				map->max_offset = new_max_offset;
19787 				ret = KERN_SUCCESS;
19788 			}
19789 		} else {
19790 			if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
19791 				map->max_offset = new_max_offset;
19792 				ret = KERN_SUCCESS;
19793 			}
19794 		}
19795 	}
19796 
19797 	vm_map_unlock(map);
19798 	return ret;
19799 }
19800 
19801 
19802 /*
19803  * Raise a VM map's minimum offset.
19804  * To strictly enforce "page zero" reservation.
19805  */
19806 kern_return_t
19807 vm_map_raise_min_offset(
19808 	vm_map_t        map,
19809 	vm_map_offset_t new_min_offset)
19810 {
19811 	vm_map_entry_t  first_entry;
19812 
19813 	new_min_offset = vm_map_round_page(new_min_offset,
19814 	    VM_MAP_PAGE_MASK(map));
19815 
19816 	vm_map_lock(map);
19817 
19818 	if (new_min_offset < map->min_offset) {
19819 		/*
19820 		 * Can't move min_offset backwards, as that would expose
19821 		 * a part of the address space that was previously, and for
19822 		 * possibly good reasons, inaccessible.
19823 		 */
19824 		vm_map_unlock(map);
19825 		return KERN_INVALID_ADDRESS;
19826 	}
19827 	if (new_min_offset >= map->max_offset) {
19828 		/* can't go beyond the end of the address space */
19829 		vm_map_unlock(map);
19830 		return KERN_INVALID_ADDRESS;
19831 	}
19832 
19833 	first_entry = vm_map_first_entry(map);
19834 	if (first_entry != vm_map_to_entry(map) &&
19835 	    first_entry->vme_start < new_min_offset) {
19836 		/*
19837 		 * Some memory was already allocated below the new
19838 		 * minimun offset.  It's too late to change it now...
19839 		 * minimum offset.  It's too late to change it now...
19840 		vm_map_unlock(map);
19841 		return KERN_NO_SPACE;
19842 	}
19843 
19844 	map->min_offset = new_min_offset;
19845 
19846 	assert(map->holes_list);
19847 	map->holes_list->start = new_min_offset;
19848 	assert(new_min_offset < map->holes_list->end);
19849 
19850 	vm_map_unlock(map);
19851 
19852 	return KERN_SUCCESS;
19853 }
19854 
19855 /*
19856  * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
19857  * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
19858  * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
19859  * have to reach over to the BSD data structures.
19860  */
19861 
19862 uint64_t vm_map_set_size_limit_count = 0;
19863 kern_return_t
19864 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
19865 {
19866 	kern_return_t kr;
19867 
19868 	vm_map_lock(map);
19869 	if (new_size_limit < map->size) {
19870 		/* new limit should not be lower than its current size */
19871 		DTRACE_VM2(vm_map_set_size_limit_fail,
19872 		    vm_map_size_t, map->size,
19873 		    uint64_t, new_size_limit);
19874 		kr = KERN_FAILURE;
19875 	} else if (new_size_limit == map->size_limit) {
19876 		/* no change */
19877 		kr = KERN_SUCCESS;
19878 	} else {
19879 		/* set new limit */
19880 		DTRACE_VM2(vm_map_set_size_limit,
19881 		    vm_map_size_t, map->size,
19882 		    uint64_t, new_size_limit);
19883 		if (new_size_limit != RLIM_INFINITY) {
19884 			vm_map_set_size_limit_count++;
19885 		}
19886 		map->size_limit = new_size_limit;
19887 		kr = KERN_SUCCESS;
19888 	}
19889 	vm_map_unlock(map);
19890 	return kr;
19891 }
19892 
19893 uint64_t vm_map_set_data_limit_count = 0;
19894 kern_return_t
19895 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
19896 {
19897 	kern_return_t kr;
19898 
19899 	vm_map_lock(map);
19900 	if (new_data_limit < map->size) {
19901 		/* new limit should not be lower than its current size */
19902 		DTRACE_VM2(vm_map_set_data_limit_fail,
19903 		    vm_map_size_t, map->size,
19904 		    uint64_t, new_data_limit);
19905 		kr = KERN_FAILURE;
19906 	} else if (new_data_limit == map->data_limit) {
19907 		/* no change */
19908 		kr = KERN_SUCCESS;
19909 	} else {
19910 		/* set new limit */
19911 		DTRACE_VM2(vm_map_set_data_limit,
19912 		    vm_map_size_t, map->size,
19913 		    uint64_t, new_data_limit);
19914 		if (new_data_limit != RLIM_INFINITY) {
19915 			vm_map_set_data_limit_count++;
19916 		}
19917 		map->data_limit = new_data_limit;
19918 		kr = KERN_SUCCESS;
19919 	}
19920 	vm_map_unlock(map);
19921 	return kr;
19922 }
19923 
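/*
 * Illustrative sketch (not part of the original source): mirroring a BSD
 * RLIMIT_AS / RLIMIT_DATA update into the VM map, as described in the
 * comment preceding vm_map_set_size_limit().  The function name
 * "example_mirror_rlimits" and the "rlim_as_cur"/"rlim_data_cur" parameters
 * are hypothetical.
 */
#if 0 /* example only, not compiled */
static void
example_mirror_rlimits(vm_map_t map, uint64_t rlim_as_cur, uint64_t rlim_data_cur)
{
	/*
	 * Both calls return KERN_FAILURE if the new limit is below the map's
	 * current size; the caller decides how to report that.
	 */
	(void)vm_map_set_size_limit(map, rlim_as_cur);
	(void)vm_map_set_data_limit(map, rlim_data_cur);
}
#endif
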
19924 void
19925 vm_map_set_user_wire_limit(vm_map_t     map,
19926     vm_size_t    limit)
19927 {
19928 	vm_map_lock(map);
19929 	map->user_wire_limit = limit;
19930 	vm_map_unlock(map);
19931 }
19932 
19933 
19934 void
19935 vm_map_switch_protect(vm_map_t     map,
19936     boolean_t    val)
19937 {
19938 	vm_map_lock(map);
19939 	map->switch_protect = val;
19940 	vm_map_unlock(map);
19941 }
19942 
19943 extern int cs_process_enforcement_enable;
19944 boolean_t
19945 vm_map_cs_enforcement(
19946 	vm_map_t map)
19947 {
19948 	if (cs_process_enforcement_enable) {
19949 		return TRUE;
19950 	}
19951 	return map->cs_enforcement;
19952 }
19953 
19954 kern_return_t
19955 vm_map_cs_wx_enable(
19956 	vm_map_t map)
19957 {
19958 	return pmap_cs_allow_invalid(vm_map_pmap(map));
19959 }
19960 
19961 void
19962 vm_map_cs_debugged_set(
19963 	vm_map_t map,
19964 	boolean_t val)
19965 {
19966 	vm_map_lock(map);
19967 	map->cs_debugged = val;
19968 	vm_map_unlock(map);
19969 }
19970 
19971 void
19972 vm_map_cs_enforcement_set(
19973 	vm_map_t map,
19974 	boolean_t val)
19975 {
19976 	vm_map_lock(map);
19977 	map->cs_enforcement = val;
19978 	pmap_set_vm_map_cs_enforced(map->pmap, val);
19979 	vm_map_unlock(map);
19980 }
19981 
19982 /*
19983  * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
19984  * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
19985  * bump both counters.
19986  */
19987 void
19988 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
19989 {
19990 	pmap_t pmap = vm_map_pmap(map);
19991 
19992 	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
19993 	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
19994 }
19995 
19996 void
19997 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
19998 {
19999 	pmap_t pmap = vm_map_pmap(map);
20000 
20001 	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
20002 	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
20003 }
20004 
20005 /* Add (generate) code signature for memory range */
20006 #if CONFIG_DYNAMIC_CODE_SIGNING
20007 kern_return_t
20008 vm_map_sign(vm_map_t map,
20009     vm_map_offset_t start,
20010     vm_map_offset_t end)
20011 {
20012 	vm_map_entry_t entry;
20013 	vm_page_t m;
20014 	vm_object_t object;
20015 
20016 	/*
20017 	 * Vet all the input parameters and current type and state of the
20018 	 * underlying object.  Return with an error if anything is amiss.
20019 	 */
20020 	if (map == VM_MAP_NULL) {
20021 		return KERN_INVALID_ARGUMENT;
20022 	}
20023 
20024 	vm_map_lock_read(map);
20025 
20026 	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
20027 		/*
20028 		 * Must pass a valid non-submap address.
20029 		 */
20030 		vm_map_unlock_read(map);
20031 		return KERN_INVALID_ADDRESS;
20032 	}
20033 
20034 	if ((entry->vme_start > start) || (entry->vme_end < end)) {
20035 		/*
20036 		 * Map entry doesn't cover the requested range. Not handling
20037 		 * this situation currently.
20038 		 */
20039 		vm_map_unlock_read(map);
20040 		return KERN_INVALID_ARGUMENT;
20041 	}
20042 
20043 	object = VME_OBJECT(entry);
20044 	if (object == VM_OBJECT_NULL) {
20045 		/*
20046 		 * Object must already be present or we can't sign.
20047 		 */
20048 		vm_map_unlock_read(map);
20049 		return KERN_INVALID_ARGUMENT;
20050 	}
20051 
20052 	vm_object_lock(object);
20053 	vm_map_unlock_read(map);
20054 
20055 	while (start < end) {
20056 		uint32_t refmod;
20057 
20058 		m = vm_page_lookup(object,
20059 		    start - entry->vme_start + VME_OFFSET(entry));
20060 		if (m == VM_PAGE_NULL) {
20061 		/* should we try to fault a page here? we can probably
20062 			 * demand it exists and is locked for this request */
20063 			vm_object_unlock(object);
20064 			return KERN_FAILURE;
20065 		}
20066 		/* deal with special page status */
20067 		if (m->vmp_busy ||
20068 		    (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
20069 			vm_object_unlock(object);
20070 			return KERN_FAILURE;
20071 		}
20072 
20073 		/* Page is OK... now "validate" it */
20074 		/* This is the place where we'll call out to create a code
20075 		 * directory, later */
20076 		/* XXX TODO4K: deal with 4k subpages individually? */
20077 		m->vmp_cs_validated = VMP_CS_ALL_TRUE;
20078 
20079 		/* The page is now "clean" for codesigning purposes. That means
20080 		 * we don't consider it as modified (wpmapped) anymore. But
20081 		 * we'll disconnect the page so we note any future modification
20082 		 * attempts. */
20083 		m->vmp_wpmapped = FALSE;
20084 		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
20085 
20086 		/* Pull the dirty status from the pmap, since we cleared the
20087 		 * wpmapped bit */
20088 		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
20089 			SET_PAGE_DIRTY(m, FALSE);
20090 		}
20091 
20092 		/* On to the next page */
20093 		start += PAGE_SIZE;
20094 	}
20095 	vm_object_unlock(object);
20096 
20097 	return KERN_SUCCESS;
20098 }
20099 #endif
20100 
20101 kern_return_t
20102 vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
20103 {
20104 	vm_map_entry_t  entry = VM_MAP_ENTRY_NULL;
20105 	vm_map_entry_t  next_entry;
20106 	kern_return_t   kr = KERN_SUCCESS;
20107 	VM_MAP_ZAP_DECLARE(zap_list);
20108 
20109 	vm_map_lock(map);
20110 
20111 	for (entry = vm_map_first_entry(map);
20112 	    entry != vm_map_to_entry(map);
20113 	    entry = next_entry) {
20114 		next_entry = entry->vme_next;
20115 
20116 		if (!entry->is_sub_map &&
20117 		    VME_OBJECT(entry) &&
20118 		    (VME_OBJECT(entry)->internal == TRUE) &&
20119 		    (VME_OBJECT(entry)->ref_count == 1)) {
20120 			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
20121 			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);
20122 
20123 			(void)vm_map_delete(map, entry->vme_start,
20124 			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
20125 			    KMEM_GUARD_NONE, &zap_list);
20126 		}
20127 	}
20128 
20129 	vm_map_unlock(map);
20130 
20131 	vm_map_zap_dispose(&zap_list);
20132 
20133 	return kr;
20134 }
20135 
20136 
20137 #if DEVELOPMENT || DEBUG
20138 
20139 int
20140 vm_map_disconnect_page_mappings(
20141 	vm_map_t map,
20142 	boolean_t do_unnest)
20143 {
20144 	vm_map_entry_t entry;
20145 	ledger_amount_t byte_count = 0;
20146 
20147 	if (do_unnest == TRUE) {
20148 #ifndef NO_NESTED_PMAP
20149 		vm_map_lock(map);
20150 
20151 		for (entry = vm_map_first_entry(map);
20152 		    entry != vm_map_to_entry(map);
20153 		    entry = entry->vme_next) {
20154 			if (entry->is_sub_map && entry->use_pmap) {
20155 				/*
20156 				 * Make sure the range between the start of this entry and
20157 				 * the end of this entry is no longer nested, so that
20158 				 * we will only remove mappings from the pmap in use by
20159 				 * this task.
20160 				 */
20161 				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
20162 			}
20163 		}
20164 		vm_map_unlock(map);
20165 #endif
20166 	}
20167 	vm_map_lock_read(map);
20168 
20169 	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);
20170 
20171 	for (entry = vm_map_first_entry(map);
20172 	    entry != vm_map_to_entry(map);
20173 	    entry = entry->vme_next) {
20174 		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
20175 		    (VME_OBJECT(entry)->phys_contiguous))) {
20176 			continue;
20177 		}
20178 		if (entry->is_sub_map) {
20179 			assert(!entry->use_pmap);
20180 		}
20181 
20182 		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
20183 	}
20184 	vm_map_unlock_read(map);
20185 
20186 	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
20187 }
20188 
20189 kern_return_t
20190 vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
20191 {
20192 	vm_object_t object = NULL;
20193 	vm_object_offset_t offset;
20194 	vm_prot_t prot;
20195 	boolean_t wired;
20196 	vm_map_version_t version;
20197 	vm_map_t real_map;
20198 	int result = KERN_FAILURE;
20199 
20200 	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
20201 	vm_map_lock(map);
20202 
20203 	result = vm_map_lookup_locked(&map, vaddr, VM_PROT_READ,
20204 	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
20205 	    NULL, &real_map, NULL);
20206 	if (object == NULL) {
20207 		result = KERN_MEMORY_ERROR;
20208 	} else if (object->pager) {
20209 		result = vm_compressor_pager_inject_error(object->pager,
20210 		    offset);
20211 	} else {
20212 		result = KERN_MEMORY_PRESENT;
20213 	}
20214 
20215 	if (object != NULL) {
20216 		vm_object_unlock(object);
20217 	}
20218 
20219 	if (real_map != map) {
20220 		vm_map_unlock(real_map);
20221 	}
20222 	vm_map_unlock(map);
20223 
20224 	return result;
20225 }
20226 
20227 #endif
20228 
20229 
20230 #if CONFIG_FREEZE
20231 
20232 
20233 extern struct freezer_context freezer_context_global;
20234 AbsoluteTime c_freezer_last_yield_ts = 0;
20235 
20236 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
20237 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
20238 
20239 kern_return_t
20240 vm_map_freeze(
20241 	task_t       task,
20242 	unsigned int *purgeable_count,
20243 	unsigned int *wired_count,
20244 	unsigned int *clean_count,
20245 	unsigned int *dirty_count,
20246 	unsigned int dirty_budget,
20247 	unsigned int *shared_count,
20248 	int          *freezer_error_code,
20249 	boolean_t    eval_only)
20250 {
20251 	vm_map_entry_t  entry2 = VM_MAP_ENTRY_NULL;
20252 	kern_return_t   kr = KERN_SUCCESS;
20253 	boolean_t       evaluation_phase = TRUE;
20254 	vm_object_t     cur_shared_object = NULL;
20255 	int             cur_shared_obj_ref_cnt = 0;
20256 	unsigned int    dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
20257 
20258 	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
20259 
20260 	/*
20261 	 * We need the exclusive lock here so that we can
20262 	 * block any page faults or lookups while we are
20263 	 * in the middle of freezing this vm map.
20264 	 */
20265 	vm_map_t map = task->map;
20266 
20267 	vm_map_lock(map);
20268 
20269 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
20270 
20271 	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
20272 		if (vm_compressor_low_on_space()) {
20273 			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
20274 		}
20275 
20276 		if (vm_swap_low_on_space()) {
20277 			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
20278 		}
20279 
20280 		kr = KERN_NO_SPACE;
20281 		goto done;
20282 	}
20283 
20284 	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
20285 		/*
20286 		 * In-memory compressor backing the freezer. No disk.
20287 		 * So no need to do the evaluation phase.
20288 		 */
20289 		evaluation_phase = FALSE;
20290 
20291 		if (eval_only == TRUE) {
20292 			/*
20293 			 * We don't support 'eval_only' mode
20294 			 * in this non-swap config.
20295 			 */
20296 			*freezer_error_code = FREEZER_ERROR_GENERIC;
20297 			kr = KERN_INVALID_ARGUMENT;
20298 			goto done;
20299 		}
20300 
20301 		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
20302 		clock_get_uptime(&c_freezer_last_yield_ts);
20303 	}
20304 again:
20305 
20306 	for (entry2 = vm_map_first_entry(map);
20307 	    entry2 != vm_map_to_entry(map);
20308 	    entry2 = entry2->vme_next) {
20309 		vm_object_t src_object;
20310 
20311 		if (entry2->is_sub_map) {
20312 			continue;
20313 		}
20314 
20315 		src_object = VME_OBJECT(entry2);
20316 		if (!src_object ||
20317 		    src_object->phys_contiguous ||
20318 		    !src_object->internal) {
20319 			continue;
20320 		}
20321 
20322 		/* If eligible, scan the entry, moving eligible pages over to our parent object */
20323 
20324 		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
20325 			/*
20326 			 * We skip purgeable objects during evaluation phase only.
20327 			 * If we decide to freeze this process, we'll explicitly
20328 			 * purge these objects before we go around again with
20329 			 * 'evaluation_phase' set to FALSE.
20330 			 */
20331 
20332 			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
20333 				/*
20334 				 * We want to purge objects that may not belong to this task but are mapped
20335 				 * in this task alone. Since we already purged this task's purgeable memory
20336 				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
20337 				 * on this task's purgeable objects. Hence the check for only volatile objects.
20338 				 */
20339 				if (evaluation_phase == FALSE &&
20340 				    (src_object->purgable == VM_PURGABLE_VOLATILE) &&
20341 				    (src_object->ref_count == 1)) {
20342 					vm_object_lock(src_object);
20343 					vm_object_purge(src_object, 0);
20344 					vm_object_unlock(src_object);
20345 				}
20346 				continue;
20347 			}
20348 
20349 			/*
20350 			 * Pages belonging to this object could be swapped to disk.
20351 			 * Make sure it's not a shared object because we could end
20352 			 * up just bringing it back in again.
20353 			 *
20354 			 * We try to optimize somewhat by checking for objects that are mapped
20355 			 * more than once within our own map. But we don't do full searches,
20356 			 * we just look at the entries following our current entry.
20357 			 */
20358 
20359 			if (src_object->ref_count > 1) {
20360 				if (src_object != cur_shared_object) {
20361 					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
20362 					dirty_shared_count += obj_pages_snapshot;
20363 
20364 					cur_shared_object = src_object;
20365 					cur_shared_obj_ref_cnt = 1;
20366 					continue;
20367 				} else {
20368 					cur_shared_obj_ref_cnt++;
20369 					if (src_object->ref_count == cur_shared_obj_ref_cnt) {
20370 						/*
20371 						 * Fall through to below and treat this object as private.
20372 						 * So deduct its pages from our shared total and add it to the
20373 						 * private total.
20374 						 */
20375 
20376 						dirty_shared_count -= obj_pages_snapshot;
20377 						dirty_private_count += obj_pages_snapshot;
20378 					} else {
20379 						continue;
20380 					}
20381 				}
20382 			}
20383 
20384 
20385 			if (src_object->ref_count == 1) {
20386 				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
20387 			}
20388 
20389 			if (evaluation_phase == TRUE) {
20390 				continue;
20391 			}
20392 		}
20393 
20394 		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
20395 		*wired_count += src_object->wired_page_count;
20396 
20397 		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
20398 			if (vm_compressor_low_on_space()) {
20399 				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
20400 			}
20401 
20402 			if (vm_swap_low_on_space()) {
20403 				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
20404 			}
20405 
20406 			kr = KERN_NO_SPACE;
20407 			break;
20408 		}
20409 		if (paged_out_count >= dirty_budget) {
20410 			break;
20411 		}
20412 		dirty_budget -= paged_out_count;
20413 	}
20414 
20415 	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
20416 	if (evaluation_phase) {
20417 		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
20418 
20419 		if (dirty_shared_count > shared_pages_threshold) {
20420 			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
20421 			kr = KERN_FAILURE;
20422 			goto done;
20423 		}
20424 
20425 		if (dirty_shared_count &&
20426 		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
20427 			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
20428 			kr = KERN_FAILURE;
20429 			goto done;
20430 		}
20431 
20432 		evaluation_phase = FALSE;
20433 		dirty_shared_count = dirty_private_count = 0;
20434 
20435 		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
20436 		clock_get_uptime(&c_freezer_last_yield_ts);
20437 
20438 		if (eval_only) {
20439 			kr = KERN_SUCCESS;
20440 			goto done;
20441 		}
20442 
20443 		vm_purgeable_purge_task_owned(task);
20444 
20445 		goto again;
20446 	} else {
20447 		kr = KERN_SUCCESS;
20448 	}
20449 
20450 done:
20451 	vm_map_unlock(map);
20452 
20453 	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
20454 		vm_object_compressed_freezer_done();
20455 	}
20456 	return kr;
20457 }
20458 
20459 #endif
20460 
20461 /*
20462  * vm_map_entry_should_cow_for_true_share:
20463  *
20464  * Determines if the map entry should be clipped and setup for copy-on-write
20465  * to avoid applying "true_share" to a large VM object when only a subset is
20466  * targeted.
20467  *
20468  * For now, we target only the map entries created for the Objective C
20469  * Garbage Collector, which initially have the following properties:
20470  *	- alias == VM_MEMORY_MALLOC
20471  *      - wired_count == 0
20472  *      - !needs_copy
20473  * and a VM object with:
20474  *      - internal
20475  *      - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
20476  *      - !true_share
20477  *      - vo_size == ANON_CHUNK_SIZE
20478  *
20479  * Only non-kernel map entries.
20480  */
20481 boolean_t
20482 vm_map_entry_should_cow_for_true_share(
20483 	vm_map_entry_t  entry)
20484 {
20485 	vm_object_t     object;
20486 
20487 	if (entry->is_sub_map) {
20488 		/* entry does not point at a VM object */
20489 		return FALSE;
20490 	}
20491 
20492 	if (entry->needs_copy) {
20493 		/* already set for copy_on_write: done! */
20494 		return FALSE;
20495 	}
20496 
20497 	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
20498 	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
20499 		/* not a malloc heap or Obj-C Garbage Collector heap */
20500 		return FALSE;
20501 	}
20502 
20503 	if (entry->wired_count) {
20504 		/* wired: can't change the map entry... */
20505 		vm_counters.should_cow_but_wired++;
20506 		return FALSE;
20507 	}
20508 
20509 	object = VME_OBJECT(entry);
20510 
20511 	if (object == VM_OBJECT_NULL) {
20512 		/* no object yet... */
20513 		return FALSE;
20514 	}
20515 
20516 	if (!object->internal) {
20517 		/* not an internal object */
20518 		return FALSE;
20519 	}
20520 
20521 	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
20522 		/* not the default copy strategy */
20523 		return FALSE;
20524 	}
20525 
20526 	if (object->true_share) {
20527 		/* already true_share: too late to avoid it */
20528 		return FALSE;
20529 	}
20530 
20531 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
20532 	    object->vo_size != ANON_CHUNK_SIZE) {
20533 		/* ... not an object created for the ObjC Garbage Collector */
20534 		return FALSE;
20535 	}
20536 
20537 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
20538 	    object->vo_size != 2048 * 4096) {
20539 		/* ... not a "MALLOC_SMALL" heap */
20540 		return FALSE;
20541 	}
20542 
20543 	/*
20544 	 * All the criteria match: we have a large object being targeted for "true_share".
20545 	 * To limit the adverse side-effects linked with "true_share", tell the caller to
20546 	 * try and avoid setting up the entire object for "true_share" by clipping the
20547 	 * targeted range and setting it up for copy-on-write.
20548 	 */
20549 	return TRUE;
20550 }
20551 
20552 vm_map_offset_t
20553 vm_map_round_page_mask(
20554 	vm_map_offset_t offset,
20555 	vm_map_offset_t mask)
20556 {
20557 	return VM_MAP_ROUND_PAGE(offset, mask);
20558 }
20559 
20560 vm_map_offset_t
20561 vm_map_trunc_page_mask(
20562 	vm_map_offset_t offset,
20563 	vm_map_offset_t mask)
20564 {
20565 	return VM_MAP_TRUNC_PAGE(offset, mask);
20566 }
20567 
20568 boolean_t
20569 vm_map_page_aligned(
20570 	vm_map_offset_t offset,
20571 	vm_map_offset_t mask)
20572 {
20573 	return ((offset) & mask) == 0;
20574 }
20575 
20576 int
20577 vm_map_page_shift(
20578 	vm_map_t map)
20579 {
20580 	return VM_MAP_PAGE_SHIFT(map);
20581 }
20582 
20583 int
20584 vm_map_page_size(
20585 	vm_map_t map)
20586 {
20587 	return VM_MAP_PAGE_SIZE(map);
20588 }
20589 
20590 vm_map_offset_t
20591 vm_map_page_mask(
20592 	vm_map_t map)
20593 {
20594 	return VM_MAP_PAGE_MASK(map);
20595 }
20596 
20597 kern_return_t
20598 vm_map_set_page_shift(
20599 	vm_map_t        map,
20600 	int             pageshift)
20601 {
20602 	if (map->hdr.nentries != 0) {
20603 		/* too late to change page size */
20604 		return KERN_FAILURE;
20605 	}
20606 
20607 	map->hdr.page_shift = (uint16_t)pageshift;
20608 
20609 	return KERN_SUCCESS;
20610 }
20611 
20612 kern_return_t
20613 vm_map_query_volatile(
20614 	vm_map_t        map,
20615 	mach_vm_size_t  *volatile_virtual_size_p,
20616 	mach_vm_size_t  *volatile_resident_size_p,
20617 	mach_vm_size_t  *volatile_compressed_size_p,
20618 	mach_vm_size_t  *volatile_pmap_size_p,
20619 	mach_vm_size_t  *volatile_compressed_pmap_size_p)
20620 {
20621 	mach_vm_size_t  volatile_virtual_size;
20622 	mach_vm_size_t  volatile_resident_count;
20623 	mach_vm_size_t  volatile_compressed_count;
20624 	mach_vm_size_t  volatile_pmap_count;
20625 	mach_vm_size_t  volatile_compressed_pmap_count;
20626 	mach_vm_size_t  resident_count;
20627 	vm_map_entry_t  entry;
20628 	vm_object_t     object;
20629 
20630 	/* map should be locked by caller */
20631 
20632 	volatile_virtual_size = 0;
20633 	volatile_resident_count = 0;
20634 	volatile_compressed_count = 0;
20635 	volatile_pmap_count = 0;
20636 	volatile_compressed_pmap_count = 0;
20637 
20638 	for (entry = vm_map_first_entry(map);
20639 	    entry != vm_map_to_entry(map);
20640 	    entry = entry->vme_next) {
20641 		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;
20642 
20643 		if (entry->is_sub_map) {
20644 			continue;
20645 		}
20646 		if (!(entry->protection & VM_PROT_WRITE)) {
20647 			continue;
20648 		}
20649 		object = VME_OBJECT(entry);
20650 		if (object == VM_OBJECT_NULL) {
20651 			continue;
20652 		}
20653 		if (object->purgable != VM_PURGABLE_VOLATILE &&
20654 		    object->purgable != VM_PURGABLE_EMPTY) {
20655 			continue;
20656 		}
20657 		if (VME_OFFSET(entry)) {
20658 			/*
20659 			 * If the map entry has been split and the object now
20660 			 * appears several times in the VM map, we don't want
20661 			 * to count the object's resident_page_count more than
20662 			 * once.  We count it only for the first one, starting
20663 			 * at offset 0 and ignore the other VM map entries.
20664 			 */
20665 			continue;
20666 		}
20667 		resident_count = object->resident_page_count;
20668 		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
20669 			resident_count = 0;
20670 		} else {
20671 			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
20672 		}
20673 
20674 		volatile_virtual_size += entry->vme_end - entry->vme_start;
20675 		volatile_resident_count += resident_count;
20676 		if (object->pager) {
20677 			volatile_compressed_count +=
20678 			    vm_compressor_pager_get_count(object->pager);
20679 		}
20680 		pmap_compressed_bytes = 0;
20681 		pmap_resident_bytes =
20682 		    pmap_query_resident(map->pmap,
20683 		    entry->vme_start,
20684 		    entry->vme_end,
20685 		    &pmap_compressed_bytes);
20686 		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
20687 		volatile_compressed_pmap_count += (pmap_compressed_bytes
20688 		    / PAGE_SIZE);
20689 	}
20690 
20691 	/* map is still locked on return */
20692 
20693 	*volatile_virtual_size_p = volatile_virtual_size;
20694 	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
20695 	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
20696 	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
20697 	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;
20698 
20699 	return KERN_SUCCESS;
20700 }
20701 
20702 void
20703 vm_map_sizes(vm_map_t map,
20704     vm_map_size_t * psize,
20705     vm_map_size_t * pfree,
20706     vm_map_size_t * plargest_free)
20707 {
20708 	vm_map_entry_t  entry;
20709 	vm_map_offset_t prev;
20710 	vm_map_size_t   free, total_free, largest_free;
20711 	boolean_t       end;
20712 
20713 	if (!map) {
20714 		*psize = *pfree = *plargest_free = 0;
20715 		return;
20716 	}
20717 	total_free = largest_free = 0;
20718 
20719 	vm_map_lock_read(map);
20720 	if (psize) {
20721 		*psize = map->max_offset - map->min_offset;
20722 	}
20723 
20724 	prev = map->min_offset;
20725 	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
20726 		end = (entry == vm_map_to_entry(map));
20727 
20728 		if (end) {
20729 			free = entry->vme_end   - prev;
20730 		} else {
20731 			free = entry->vme_start - prev;
20732 		}
20733 
20734 		total_free += free;
20735 		if (free > largest_free) {
20736 			largest_free = free;
20737 		}
20738 
20739 		if (end) {
20740 			break;
20741 		}
20742 		prev = entry->vme_end;
20743 	}
20744 	vm_map_unlock_read(map);
20745 	if (pfree) {
20746 		*pfree = total_free;
20747 	}
20748 	if (plargest_free) {
20749 		*plargest_free = largest_free;
20750 	}
20751 }
20752 
20753 #if VM_SCAN_FOR_SHADOW_CHAIN
20754 int vm_map_shadow_max(vm_map_t map);
20755 int
20756 vm_map_shadow_max(
20757 	vm_map_t map)
20758 {
20759 	int             shadows, shadows_max;
20760 	vm_map_entry_t  entry;
20761 	vm_object_t     object, next_object;
20762 
20763 	if (map == NULL) {
20764 		return 0;
20765 	}
20766 
20767 	shadows_max = 0;
20768 
20769 	vm_map_lock_read(map);
20770 
20771 	for (entry = vm_map_first_entry(map);
20772 	    entry != vm_map_to_entry(map);
20773 	    entry = entry->vme_next) {
20774 		if (entry->is_sub_map) {
20775 			continue;
20776 		}
20777 		object = VME_OBJECT(entry);
20778 		if (object == NULL) {
20779 			continue;
20780 		}
20781 		vm_object_lock_shared(object);
20782 		for (shadows = 0;
20783 		    object->shadow != NULL;
20784 		    shadows++, object = next_object) {
20785 			next_object = object->shadow;
20786 			vm_object_lock_shared(next_object);
20787 			vm_object_unlock(object);
20788 		}
20789 		vm_object_unlock(object);
20790 		if (shadows > shadows_max) {
20791 			shadows_max = shadows;
20792 		}
20793 	}
20794 
20795 	vm_map_unlock_read(map);
20796 
20797 	return shadows_max;
20798 }
20799 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
20800 
20801 void
20802 vm_commit_pagezero_status(vm_map_t lmap)
20803 {
20804 	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
20805 }
20806 
20807 #if XNU_TARGET_OS_OSX
20808 void
20809 vm_map_set_high_start(
20810 	vm_map_t        map,
20811 	vm_map_offset_t high_start)
20812 {
20813 	map->vmmap_high_start = high_start;
20814 }
20815 #endif /* XNU_TARGET_OS_OSX */
20816 
20817 
20818 /*
20819  * FORKED CORPSE FOOTPRINT
20820  *
20821  * A forked corpse gets a copy of the original VM map but its pmap is mostly
20822  * empty since it never ran and never got to fault in any pages.
20823  * Collecting footprint info (via "sysctl vm.self_region_footprint") for
20824  * a forked corpse would therefore return very little information.
20825  *
20826  * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
20827  * to vm_map_fork() to collect footprint information from the original VM map
20828  * and its pmap, and store it in the forked corpse's VM map.  That information
20829  * is stored in place of the VM map's "hole list" since we'll never need to
20830  * lookup for holes in the corpse's map.
20831  *
20832  * The corpse's footprint info looks like this:
20833  *
20834  * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
20835  * as follows:
20836  *                     +---------------------------------------+
20837  *            header-> | cf_size                               |
20838  *                     +-------------------+-------------------+
20839  *                     | cf_last_region    | cf_last_zeroes    |
20840  *                     +-------------------+-------------------+
20841  *           region1-> | cfr_vaddr                             |
20842  *                     +-------------------+-------------------+
20843  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
20844  *                     +---------------------------------------+
20845  *                     | d4 | d5 | ...                         |
20846  *                     +---------------------------------------+
20847  *                     | ...                                   |
20848  *                     +-------------------+-------------------+
20849  *                     | dy | dz | na | na | cfr_vaddr...      | <-region2
20850  *                     +-------------------+-------------------+
20851  *                     | cfr_vaddr (ctd)   | cfr_num_pages     |
20852  *                     +---------------------------------------+
20853  *                     | d0 | d1 ...                           |
20854  *                     +---------------------------------------+
20855  *                       ...
20856  *                     +---------------------------------------+
20857  *       last region-> | cfr_vaddr                             |
20858  *                     +---------------------------------------+
20859  *                     + cfr_num_pages     | d0 | d1 | d2 | d3 |
20860  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
20861  *                       ...
20862  *                     +---------------------------------------+
20863  *                     | dx | dy | dz | na | na | na | na | na |
20864  *                     +---------------------------------------+
20865  *
20866  * where:
20867  *      cf_size:	total size of the buffer (rounded to page size)
20868  *      cf_last_region:	offset in the buffer of the last "region" sub-header
20869  *	cf_last_zeroes: number of trailing "zero" dispositions at the end
20870  *			of last region
20871  *	cfr_vaddr:	virtual address of the start of the covered "region"
20872  *	cfr_num_pages:	number of pages in the covered "region"
20873  *	d*:		disposition of the page at that virtual address
20874  * Regions in the buffer are word-aligned.
20875  *
20876  * We estimate the size of the buffer based on the number of memory regions
20877  * and the virtual size of the address space.  While copying each memory region
20878  * during vm_map_fork(), we also collect the footprint info for that region
20879  * and store it in the buffer, packing it as much as possible (coalescing
20880  * contiguous memory regions to avoid having too many region headers and
20881  * avoiding long streaks of "zero" page dispositions by splitting footprint
20882  * "regions"), so the number of regions in the footprint buffer might not match
20883  * the number of memory regions in the address space.
20884  *
20885  * We also have to copy the original task's "nonvolatile" ledgers since that's
20886  * part of the footprint and will need to be reported to any tool asking for
20887  * the footprint information of the forked corpse.
20888  */
20889 
20890 uint64_t vm_map_corpse_footprint_count = 0;
20891 uint64_t vm_map_corpse_footprint_size_avg = 0;
20892 uint64_t vm_map_corpse_footprint_size_max = 0;
20893 uint64_t vm_map_corpse_footprint_full = 0;
20894 uint64_t vm_map_corpse_footprint_no_buf = 0;
20895 
20896 struct vm_map_corpse_footprint_header {
20897 	vm_size_t       cf_size;        /* allocated buffer size */
20898 	uint32_t        cf_last_region; /* offset of last region in buffer */
20899 	union {
20900 		uint32_t cfu_last_zeroes; /* during creation:
20901 		                           * number of "zero" dispositions at
20902 		                           * end of last region */
20903 		uint32_t cfu_hint_region; /* during lookup:
20904 		                           * offset of last looked up region */
20905 #define cf_last_zeroes cfu.cfu_last_zeroes
20906 #define cf_hint_region cfu.cfu_hint_region
20907 	} cfu;
20908 };
20909 typedef uint8_t cf_disp_t;
20910 struct vm_map_corpse_footprint_region {
20911 	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
20912 	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
20913 	cf_disp_t   cfr_disposition[0]; /* disposition of each page */
20914 } __attribute__((packed));
20915 
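/*
 * Illustrative sketch (not compiled, and not part of the original file):
 * how a consumer could walk the corpse footprint buffer laid out as in the
 * FORKED CORPSE FOOTPRINT comment above.  Each region header is word-aligned
 * and followed by "cfr_num_pages" one-byte dispositions, so advancing to the
 * next region is plain offset arithmetic.  The helper name is hypothetical
 * and it assumes the map's effective page size is PAGE_SIZE; the real lookup
 * logic lives in vm_map_corpse_footprint_query_page_info().
 */
#if 0
static void
vm_map_corpse_footprint_walk_example(
	struct vm_map_corpse_footprint_header *hdr)
{
	uint32_t offset = sizeof(*hdr);

	for (;;) {
		struct vm_map_corpse_footprint_region *region;
		uint32_t i;

		region = (struct vm_map_corpse_footprint_region *)
		    ((char *)hdr + offset);
		/* dump the disposition of every page covered by this region */
		for (i = 0; i < region->cfr_num_pages; i++) {
			printf("va 0x%llx disp 0x%x\n",
			    (uint64_t)(region->cfr_vaddr +
			    ((vm_map_offset_t)i * PAGE_SIZE)),
			    region->cfr_disposition[i]);
		}
		if (offset >= hdr->cf_last_region) {
			/* that was the last region in the buffer */
			break;
		}
		/* skip this region's header and dispositions, then word-align */
		offset += sizeof(*region);
		offset += region->cfr_num_pages * sizeof(cf_disp_t);
		offset = roundup(offset, sizeof(int));
	}
}
#endif
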
20916 static cf_disp_t
20917 vm_page_disposition_to_cf_disp(
20918 	int disposition)
20919 {
20920 	assert(sizeof(cf_disp_t) == 1);
20921 	/* relocate bits that don't fit in a "uint8_t" */
20922 	if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
20923 		disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20924 	}
20925 	/* cast gets rid of extra bits */
20926 	return (cf_disp_t) disposition;
20927 }
20928 
20929 static int
20930 vm_page_cf_disp_to_disposition(
20931 	cf_disp_t cf_disp)
20932 {
20933 	int disposition;
20934 
20935 	assert(sizeof(cf_disp_t) == 1);
20936 	disposition = (int) cf_disp;
20937 	/* move relocated bits back in place */
20938 	if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
20939 		disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20940 		disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
20941 	}
20942 	return disposition;
20943 }
20944 
20945 /*
20946  * vm_map_corpse_footprint_new_region:
20947  *      closes the current footprint "region" and creates a new one
20948  *
20949  * Returns NULL if there's not enough space in the buffer for a new region.
20950  */
20951 static struct vm_map_corpse_footprint_region *
20952 vm_map_corpse_footprint_new_region(
20953 	struct vm_map_corpse_footprint_header *footprint_header)
20954 {
20955 	uintptr_t       footprint_edge;
20956 	uint32_t        new_region_offset;
20957 	struct vm_map_corpse_footprint_region *footprint_region;
20958 	struct vm_map_corpse_footprint_region *new_footprint_region;
20959 
20960 	footprint_edge = ((uintptr_t)footprint_header +
20961 	    footprint_header->cf_size);
20962 	footprint_region = ((struct vm_map_corpse_footprint_region *)
20963 	    ((char *)footprint_header +
20964 	    footprint_header->cf_last_region));
20965 	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
20966 	    footprint_edge);
20967 
20968 	/* get rid of trailing zeroes in the last region */
20969 	assert(footprint_region->cfr_num_pages >=
20970 	    footprint_header->cf_last_zeroes);
20971 	footprint_region->cfr_num_pages -=
20972 	    footprint_header->cf_last_zeroes;
20973 	footprint_header->cf_last_zeroes = 0;
20974 
20975 	/* reuse this region if it's now empty */
20976 	if (footprint_region->cfr_num_pages == 0) {
20977 		return footprint_region;
20978 	}
20979 
20980 	/* compute offset of new region */
20981 	new_region_offset = footprint_header->cf_last_region;
20982 	new_region_offset += sizeof(*footprint_region);
20983 	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
20984 	new_region_offset = roundup(new_region_offset, sizeof(int));
20985 
20986 	/* check if we're going over the edge */
20987 	if (((uintptr_t)footprint_header +
20988 	    new_region_offset +
20989 	    sizeof(*footprint_region)) >=
20990 	    footprint_edge) {
20991 		/* over the edge: no new region */
20992 		return NULL;
20993 	}
20994 
20995 	/* adjust offset of last region in header */
20996 	footprint_header->cf_last_region = new_region_offset;
20997 
20998 	new_footprint_region = (struct vm_map_corpse_footprint_region *)
20999 	    ((char *)footprint_header +
21000 	    footprint_header->cf_last_region);
21001 	new_footprint_region->cfr_vaddr = 0;
21002 	new_footprint_region->cfr_num_pages = 0;
21003 	/* caller needs to initialize new region */
21004 
21005 	return new_footprint_region;
21006 }
21007 
21008 /*
21009  * vm_map_corpse_footprint_collect:
21010  *	collect footprint information for "old_entry" in "old_map" and
21011  *	stores it in "new_map"'s vmmap_footprint_info.
21012  */
21013 kern_return_t
21014 vm_map_corpse_footprint_collect(
21015 	vm_map_t        old_map,
21016 	vm_map_entry_t  old_entry,
21017 	vm_map_t        new_map)
21018 {
21019 	vm_map_offset_t va;
21020 	kern_return_t   kr;
21021 	struct vm_map_corpse_footprint_header *footprint_header;
21022 	struct vm_map_corpse_footprint_region *footprint_region;
21023 	struct vm_map_corpse_footprint_region *new_footprint_region;
21024 	cf_disp_t       *next_disp_p;
21025 	uintptr_t       footprint_edge;
21026 	uint32_t        num_pages_tmp;
21027 	int             effective_page_size;
21028 
21029 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
21030 
21031 	va = old_entry->vme_start;
21032 
21033 	vm_map_lock_assert_exclusive(old_map);
21034 	vm_map_lock_assert_exclusive(new_map);
21035 
21036 	assert(new_map->has_corpse_footprint);
21037 	assert(!old_map->has_corpse_footprint);
21038 	if (!new_map->has_corpse_footprint ||
21039 	    old_map->has_corpse_footprint) {
21040 		/*
21041 		 * This can only transfer footprint info from a
21042 		 * map with a live pmap to a map with a corpse footprint.
21043 		 */
21044 		return KERN_NOT_SUPPORTED;
21045 	}
21046 
21047 	if (new_map->vmmap_corpse_footprint == NULL) {
21048 		vm_offset_t     buf;
21049 		vm_size_t       buf_size;
21050 
21051 		buf = 0;
21052 		buf_size = (sizeof(*footprint_header) +
21053 		    (old_map->hdr.nentries
21054 		    *
21055 		    (sizeof(*footprint_region) +
21056 		    +3))            /* potential alignment for each region */
21057 		    +
21058 		    ((old_map->size / effective_page_size)
21059 		    *
21060 		    sizeof(cf_disp_t)));      /* disposition for each page */
21061 //		printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
21062 		buf_size = round_page(buf_size);
21063 
21064 		/* limit buffer to 1 page to validate overflow detection */
21065 //		buf_size = PAGE_SIZE;
21066 
21067 		/* limit size to a somewhat sane amount */
21068 #if XNU_TARGET_OS_OSX
21069 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
21070 #else /* XNU_TARGET_OS_OSX */
21071 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
21072 #endif /* XNU_TARGET_OS_OSX */
21073 		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
21074 			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
21075 		}
21076 
21077 		/*
21078 		 * Allocate the pageable buffer (with a trailing guard page).
21079 		 * It will be zero-filled on demand.
21080 		 */
21081 		kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
21082 		    KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
21083 		    VM_KERN_MEMORY_DIAG);
21084 		if (kr != KERN_SUCCESS) {
21085 			vm_map_corpse_footprint_no_buf++;
21086 			return kr;
21087 		}
21088 
21089 		/* initialize header and 1st region */
21090 		footprint_header = (struct vm_map_corpse_footprint_header *)buf;
21091 		new_map->vmmap_corpse_footprint = footprint_header;
21092 
21093 		footprint_header->cf_size = buf_size;
21094 		footprint_header->cf_last_region =
21095 		    sizeof(*footprint_header);
21096 		footprint_header->cf_last_zeroes = 0;
21097 
21098 		footprint_region = (struct vm_map_corpse_footprint_region *)
21099 		    ((char *)footprint_header +
21100 		    footprint_header->cf_last_region);
21101 		footprint_region->cfr_vaddr = 0;
21102 		footprint_region->cfr_num_pages = 0;
21103 	} else {
21104 		/* retrieve header and last region */
21105 		footprint_header = (struct vm_map_corpse_footprint_header *)
21106 		    new_map->vmmap_corpse_footprint;
21107 		footprint_region = (struct vm_map_corpse_footprint_region *)
21108 		    ((char *)footprint_header +
21109 		    footprint_header->cf_last_region);
21110 	}
21111 	footprint_edge = ((uintptr_t)footprint_header +
21112 	    footprint_header->cf_size);
21113 
21114 	if ((footprint_region->cfr_vaddr +
21115 	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
21116 	    effective_page_size))
21117 	    != old_entry->vme_start) {
21118 		uint64_t num_pages_delta, num_pages_delta_size;
21119 		uint32_t region_offset_delta_size;
21120 
21121 		/*
21122 		 * Not the next contiguous virtual address:
21123 		 * start a new region or store "zero" dispositions for
21124 		 * the missing pages?
21125 		 */
21126 		/* size of gap in actual page dispositions */
21127 		num_pages_delta = ((old_entry->vme_start -
21128 		    footprint_region->cfr_vaddr) / effective_page_size)
21129 		    - footprint_region->cfr_num_pages;
21130 		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
21131 		/* size of gap as a new footprint region header */
21132 		region_offset_delta_size =
21133 		    (sizeof(*footprint_region) +
21134 		    roundup(((footprint_region->cfr_num_pages -
21135 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
21136 		    sizeof(int)) -
21137 		    ((footprint_region->cfr_num_pages -
21138 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
21139 //		printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
21140 		if (region_offset_delta_size < num_pages_delta_size ||
21141 		    os_add3_overflow(footprint_region->cfr_num_pages,
21142 		    (uint32_t) num_pages_delta,
21143 		    1,
21144 		    &num_pages_tmp)) {
21145 			/*
21146 			 * Storing data for this gap would take more space
21147 			 * than inserting a new footprint region header:
21148 			 * let's start a new region and save space. If it's a
21149 			 * tie, let's avoid using a new region, since that
21150 			 * would require more region hops to find the right
21151 			 * range during lookups.
21152 			 *
21153 			 * If the current region's cfr_num_pages would overflow
21154 			 * if we added "zero" page dispositions for the gap,
21155 			 * no choice but to start a new region.
21156 			 */
21157 //			printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
21158 			new_footprint_region =
21159 			    vm_map_corpse_footprint_new_region(footprint_header);
21160 			/* check that we're not going over the edge */
21161 			if (new_footprint_region == NULL) {
21162 				goto over_the_edge;
21163 			}
21164 			footprint_region = new_footprint_region;
21165 			/* initialize new region as empty */
21166 			footprint_region->cfr_vaddr = old_entry->vme_start;
21167 			footprint_region->cfr_num_pages = 0;
21168 		} else {
21169 			/*
21170 			 * Store "zero" page dispositions for the missing
21171 			 * pages.
21172 			 */
21173 //			printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
21174 			for (; num_pages_delta > 0; num_pages_delta--) {
21175 				next_disp_p = (cf_disp_t *)
21176 				    ((uintptr_t) footprint_region +
21177 				    sizeof(*footprint_region));
21178 				next_disp_p += footprint_region->cfr_num_pages;
21179 				/* check that we're not going over the edge */
21180 				if ((uintptr_t)next_disp_p >= footprint_edge) {
21181 					goto over_the_edge;
21182 				}
21183 				/* store "zero" disposition for this gap page */
21184 				footprint_region->cfr_num_pages++;
21185 				*next_disp_p = (cf_disp_t) 0;
21186 				footprint_header->cf_last_zeroes++;
21187 			}
21188 		}
21189 	}
21190 
21191 	for (va = old_entry->vme_start;
21192 	    va < old_entry->vme_end;
21193 	    va += effective_page_size) {
21194 		int             disposition;
21195 		cf_disp_t       cf_disp;
21196 
21197 		vm_map_footprint_query_page_info(old_map,
21198 		    old_entry,
21199 		    va,
21200 		    &disposition);
21201 		cf_disp = vm_page_disposition_to_cf_disp(disposition);
21202 
21203 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
21204 
21205 		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
21206 			/*
21207 			 * Ignore "zero" dispositions at start of
21208 			 * region: just move start of region.
21209 			 */
21210 			footprint_region->cfr_vaddr += effective_page_size;
21211 			continue;
21212 		}
21213 
21214 		/* would region's cfr_num_pages overflow? */
21215 		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
21216 		    &num_pages_tmp)) {
21217 			/* overflow: create a new region */
21218 			new_footprint_region =
21219 			    vm_map_corpse_footprint_new_region(
21220 				footprint_header);
21221 			if (new_footprint_region == NULL) {
21222 				goto over_the_edge;
21223 			}
21224 			footprint_region = new_footprint_region;
21225 			footprint_region->cfr_vaddr = va;
21226 			footprint_region->cfr_num_pages = 0;
21227 		}
21228 
21229 		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
21230 		    sizeof(*footprint_region));
21231 		next_disp_p += footprint_region->cfr_num_pages;
21232 		/* check that we're not going over the edge */
21233 		if ((uintptr_t)next_disp_p >= footprint_edge) {
21234 			goto over_the_edge;
21235 		}
21236 		/* store this disposition */
21237 		*next_disp_p = cf_disp;
21238 		footprint_region->cfr_num_pages++;
21239 
21240 		if (cf_disp != 0) {
21241 			/* non-zero disp: break the current zero streak */
21242 			footprint_header->cf_last_zeroes = 0;
21243 			/* done */
21244 			continue;
21245 		}
21246 
21247 		/* zero disp: add to the current streak of zeroes */
21248 		footprint_header->cf_last_zeroes++;
21249 		if ((footprint_header->cf_last_zeroes +
21250 		    roundup(((footprint_region->cfr_num_pages -
21251 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
21252 		    (sizeof(int) - 1),
21253 		    sizeof(int))) <
21254 		    (sizeof(*footprint_header))) {
21255 			/*
21256 			 * There are not enough trailing "zero" dispositions
21257 			 * (+ the extra padding we would need for the previous
21258 			 * region); creating a new region would not save space
21259 			 * at this point, so let's keep this "zero" disposition
21260 			 * in this region and reconsider later.
21261 			 */
21262 			continue;
21263 		}
21264 		/*
21265 		 * Create a new region to avoid having too many consecutive
21266 		 * "zero" dispositions.
21267 		 */
21268 		new_footprint_region =
21269 		    vm_map_corpse_footprint_new_region(footprint_header);
21270 		if (new_footprint_region == NULL) {
21271 			goto over_the_edge;
21272 		}
21273 		footprint_region = new_footprint_region;
21274 		/* initialize the new region as empty ... */
21275 		footprint_region->cfr_num_pages = 0;
21276 		/* ... and skip this "zero" disp */
21277 		footprint_region->cfr_vaddr = va + effective_page_size;
21278 	}
21279 
21280 	return KERN_SUCCESS;
21281 
21282 over_the_edge:
21283 //	printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
21284 	vm_map_corpse_footprint_full++;
21285 	return KERN_RESOURCE_SHORTAGE;
21286 }
21287 
21288 /*
21289  * vm_map_corpse_footprint_collect_done:
21290  *	completes the footprint collection by getting rid of any remaining
21291  *	trailing "zero" dispositions and trimming the unused part of the
21292  *	kernel buffer
21293  */
21294 void
21295 vm_map_corpse_footprint_collect_done(
21296 	vm_map_t        new_map)
21297 {
21298 	struct vm_map_corpse_footprint_header *footprint_header;
21299 	struct vm_map_corpse_footprint_region *footprint_region;
21300 	vm_size_t       buf_size, actual_size;
21301 	kern_return_t   kr;
21302 
21303 	assert(new_map->has_corpse_footprint);
21304 	if (!new_map->has_corpse_footprint ||
21305 	    new_map->vmmap_corpse_footprint == NULL) {
21306 		return;
21307 	}
21308 
21309 	footprint_header = (struct vm_map_corpse_footprint_header *)
21310 	    new_map->vmmap_corpse_footprint;
21311 	buf_size = footprint_header->cf_size;
21312 
21313 	footprint_region = (struct vm_map_corpse_footprint_region *)
21314 	    ((char *)footprint_header +
21315 	    footprint_header->cf_last_region);
21316 
21317 	/* get rid of trailing zeroes in last region */
21318 	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
21319 	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
21320 	footprint_header->cf_last_zeroes = 0;
21321 
21322 	actual_size = (vm_size_t)(footprint_header->cf_last_region +
21323 	    sizeof(*footprint_region) +
21324 	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
21325 
21326 //	printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
21327 	vm_map_corpse_footprint_size_avg =
21328 	    (((vm_map_corpse_footprint_size_avg *
21329 	    vm_map_corpse_footprint_count) +
21330 	    actual_size) /
21331 	    (vm_map_corpse_footprint_count + 1));
21332 	vm_map_corpse_footprint_count++;
21333 	if (actual_size > vm_map_corpse_footprint_size_max) {
21334 		vm_map_corpse_footprint_size_max = actual_size;
21335 	}
21336 
21337 	actual_size = round_page(actual_size);
21338 	if (buf_size > actual_size) {
21339 		kr = vm_deallocate(kernel_map,
21340 		    ((vm_address_t)footprint_header +
21341 		    actual_size +
21342 		    PAGE_SIZE),                 /* trailing guard page */
21343 		    (buf_size - actual_size));
21344 		assertf(kr == KERN_SUCCESS,
21345 		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
21346 		    footprint_header,
21347 		    (uint64_t) buf_size,
21348 		    (uint64_t) actual_size,
21349 		    kr);
21350 		kr = vm_protect(kernel_map,
21351 		    ((vm_address_t)footprint_header +
21352 		    actual_size),
21353 		    PAGE_SIZE,
21354 		    FALSE,             /* set_maximum */
21355 		    VM_PROT_NONE);
21356 		assertf(kr == KERN_SUCCESS,
21357 		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
21358 		    footprint_header,
21359 		    (uint64_t) buf_size,
21360 		    (uint64_t) actual_size,
21361 		    kr);
21362 	}
21363 
21364 	footprint_header->cf_size = actual_size;
21365 }
21366 
21367 /*
21368  * vm_map_corpse_footprint_query_page_info:
21369  *	retrieves the disposition of the page at virtual address "vaddr"
21370  *	in the forked corpse's VM map
21371  *
21372  * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
21373  */
21374 kern_return_t
21375 vm_map_corpse_footprint_query_page_info(
21376 	vm_map_t        map,
21377 	vm_map_offset_t va,
21378 	int             *disposition_p)
21379 {
21380 	struct vm_map_corpse_footprint_header *footprint_header;
21381 	struct vm_map_corpse_footprint_region *footprint_region;
21382 	uint32_t        footprint_region_offset;
21383 	vm_map_offset_t region_start, region_end;
21384 	int             disp_idx;
21385 	kern_return_t   kr;
21386 	int             effective_page_size;
21387 	cf_disp_t       cf_disp;
21388 
21389 	if (!map->has_corpse_footprint) {
21390 		*disposition_p = 0;
21391 		kr = KERN_INVALID_ARGUMENT;
21392 		goto done;
21393 	}
21394 
21395 	footprint_header = map->vmmap_corpse_footprint;
21396 	if (footprint_header == NULL) {
21397 		*disposition_p = 0;
21398 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
21399 		kr = KERN_INVALID_ARGUMENT;
21400 		goto done;
21401 	}
21402 
21403 	/* start looking at the hint ("cf_hint_region") */
21404 	footprint_region_offset = footprint_header->cf_hint_region;
21405 
21406 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
21407 
21408 lookup_again:
21409 	if (footprint_region_offset < sizeof(*footprint_header)) {
21410 		/* hint too low: start from 1st region */
21411 		footprint_region_offset = sizeof(*footprint_header);
21412 	}
21413 	if (footprint_region_offset >= footprint_header->cf_last_region) {
21414 		/* hint too high: re-start from 1st region */
21415 		footprint_region_offset = sizeof(*footprint_header);
21416 	}
21417 	footprint_region = (struct vm_map_corpse_footprint_region *)
21418 	    ((char *)footprint_header + footprint_region_offset);
21419 	region_start = footprint_region->cfr_vaddr;
21420 	region_end = (region_start +
21421 	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
21422 	    effective_page_size));
21423 	if (va < region_start &&
21424 	    footprint_region_offset != sizeof(*footprint_header)) {
21425 		/* our range starts before the hint region */
21426 
21427 		/* reset the hint (in a racy way...) */
21428 		footprint_header->cf_hint_region = sizeof(*footprint_header);
21429 		/* lookup "va" again from 1st region */
21430 		footprint_region_offset = sizeof(*footprint_header);
21431 		goto lookup_again;
21432 	}
21433 
21434 	while (va >= region_end) {
21435 		if (footprint_region_offset >= footprint_header->cf_last_region) {
21436 			break;
21437 		}
21438 		/* skip the region's header */
21439 		footprint_region_offset += sizeof(*footprint_region);
21440 		/* skip the region's page dispositions */
21441 		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
21442 		/* align to next word boundary */
21443 		footprint_region_offset =
21444 		    roundup(footprint_region_offset,
21445 		    sizeof(int));
21446 		footprint_region = (struct vm_map_corpse_footprint_region *)
21447 		    ((char *)footprint_header + footprint_region_offset);
21448 		region_start = footprint_region->cfr_vaddr;
21449 		region_end = (region_start +
21450 		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
21451 		    effective_page_size));
21452 	}
21453 	if (va < region_start || va >= region_end) {
21454 		/* page not found */
21455 		*disposition_p = 0;
21456 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
21457 		kr = KERN_SUCCESS;
21458 		goto done;
21459 	}
21460 
21461 	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
21462 	footprint_header->cf_hint_region = footprint_region_offset;
21463 
21464 	/* get page disposition for "va" in this region */
21465 	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
21466 	cf_disp = footprint_region->cfr_disposition[disp_idx];
21467 	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
21468 	kr = KERN_SUCCESS;
21469 done:
21470 //	if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
21471 	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
21472 	DTRACE_VM4(footprint_query_page_info,
21473 	    vm_map_t, map,
21474 	    vm_map_offset_t, va,
21475 	    int, *disposition_p,
21476 	    kern_return_t, kr);
21477 
21478 	return kr;
21479 }
21480 
21481 void
21482 vm_map_corpse_footprint_destroy(
21483 	vm_map_t        map)
21484 {
21485 	if (map->has_corpse_footprint &&
21486 	    map->vmmap_corpse_footprint != 0) {
21487 		struct vm_map_corpse_footprint_header *footprint_header;
21488 		vm_size_t buf_size;
21489 		kern_return_t kr;
21490 
21491 		footprint_header = map->vmmap_corpse_footprint;
21492 		buf_size = footprint_header->cf_size;
21493 		kr = vm_deallocate(kernel_map,
21494 		    (vm_offset_t) map->vmmap_corpse_footprint,
21495 		    ((vm_size_t) buf_size
21496 		    + PAGE_SIZE));                 /* trailing guard page */
21497 		assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
21498 		map->vmmap_corpse_footprint = 0;
21499 		map->has_corpse_footprint = FALSE;
21500 	}
21501 }
21502 
21503 /*
21504  * vm_map_copy_footprint_ledgers:
21505  *	copies any ledger that's relevant to the memory footprint of "old_task"
21506  *	into the forked corpse's task ("new_task")
21507  */
21508 void
21509 vm_map_copy_footprint_ledgers(
21510 	task_t  old_task,
21511 	task_t  new_task)
21512 {
21513 	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
21514 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
21515 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
21516 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
21517 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
21518 	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
21519 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
21520 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
21521 	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
21522 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
21523 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
21524 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
21525 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
21526 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
21527 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
21528 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
21529 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
21530 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
21531 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
21532 	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
21533 }
21534 
21535 /*
21536  * vm_map_copy_ledger:
21537  *	copy a single ledger from "old_task" to "new_task"
21538  */
21539 void
21540 vm_map_copy_ledger(
21541 	task_t  old_task,
21542 	task_t  new_task,
21543 	int     ledger_entry)
21544 {
21545 	ledger_amount_t old_balance, new_balance, delta;
21546 
21547 	assert(new_task->map->has_corpse_footprint);
21548 	if (!new_task->map->has_corpse_footprint) {
21549 		return;
21550 	}
21551 
21552 	/* turn off sanity checks for the ledger we're about to mess with */
21553 	ledger_disable_panic_on_negative(new_task->ledger,
21554 	    ledger_entry);
21555 
21556 	/* adjust "new_task" to match "old_task" */
21557 	ledger_get_balance(old_task->ledger,
21558 	    ledger_entry,
21559 	    &old_balance);
21560 	ledger_get_balance(new_task->ledger,
21561 	    ledger_entry,
21562 	    &new_balance);
21563 	if (new_balance == old_balance) {
21564 		/* new == old: done */
21565 	} else if (new_balance > old_balance) {
21566 		/* new > old ==> new -= new - old */
21567 		delta = new_balance - old_balance;
21568 		ledger_debit(new_task->ledger,
21569 		    ledger_entry,
21570 		    delta);
21571 	} else {
21572 		/* new < old ==> new += old - new */
21573 		delta = old_balance - new_balance;
21574 		ledger_credit(new_task->ledger,
21575 		    ledger_entry,
21576 		    delta);
21577 	}
21578 }
21579 
21580 /*
21581  * vm_map_get_pmap:
21582  * returns the pmap associated with the vm_map
21583  */
21584 pmap_t
21585 vm_map_get_pmap(vm_map_t map)
21586 {
21587 	return vm_map_pmap(map);
21588 }
21589 
21590 #if MACH_ASSERT
21591 
21592 extern int pmap_ledgers_panic;
21593 extern int pmap_ledgers_panic_leeway;
21594 
21595 #define LEDGER_DRIFT(__LEDGER)                    \
21596 	int             __LEDGER##_over;          \
21597 	ledger_amount_t __LEDGER##_over_total;    \
21598 	ledger_amount_t __LEDGER##_over_max;      \
21599 	int             __LEDGER##_under;         \
21600 	ledger_amount_t __LEDGER##_under_total;   \
21601 	ledger_amount_t __LEDGER##_under_max
21602 
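/*
 * Drift statistics accumulated by vm_map_pmap_check_ledgers(): for each
 * ledger, count the pmaps that were torn down with a positive ("over") or
 * negative ("under") balance, along with the total and worst-case drift
 * observed in each direction.
 */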
21603 struct {
21604 	uint64_t        num_pmaps_checked;
21605 
21606 	LEDGER_DRIFT(phys_footprint);
21607 	LEDGER_DRIFT(internal);
21608 	LEDGER_DRIFT(internal_compressed);
21609 	LEDGER_DRIFT(external);
21610 	LEDGER_DRIFT(reusable);
21611 	LEDGER_DRIFT(iokit_mapped);
21612 	LEDGER_DRIFT(alternate_accounting);
21613 	LEDGER_DRIFT(alternate_accounting_compressed);
21614 	LEDGER_DRIFT(page_table);
21615 	LEDGER_DRIFT(purgeable_volatile);
21616 	LEDGER_DRIFT(purgeable_nonvolatile);
21617 	LEDGER_DRIFT(purgeable_volatile_compressed);
21618 	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
21619 	LEDGER_DRIFT(tagged_nofootprint);
21620 	LEDGER_DRIFT(tagged_footprint);
21621 	LEDGER_DRIFT(tagged_nofootprint_compressed);
21622 	LEDGER_DRIFT(tagged_footprint_compressed);
21623 	LEDGER_DRIFT(network_volatile);
21624 	LEDGER_DRIFT(network_nonvolatile);
21625 	LEDGER_DRIFT(network_volatile_compressed);
21626 	LEDGER_DRIFT(network_nonvolatile_compressed);
21627 	LEDGER_DRIFT(media_nofootprint);
21628 	LEDGER_DRIFT(media_footprint);
21629 	LEDGER_DRIFT(media_nofootprint_compressed);
21630 	LEDGER_DRIFT(media_footprint_compressed);
21631 	LEDGER_DRIFT(graphics_nofootprint);
21632 	LEDGER_DRIFT(graphics_footprint);
21633 	LEDGER_DRIFT(graphics_nofootprint_compressed);
21634 	LEDGER_DRIFT(graphics_footprint_compressed);
21635 	LEDGER_DRIFT(neural_nofootprint);
21636 	LEDGER_DRIFT(neural_footprint);
21637 	LEDGER_DRIFT(neural_nofootprint_compressed);
21638 	LEDGER_DRIFT(neural_footprint_compressed);
21639 } pmap_ledgers_drift;
21640 
21641 void
21642 vm_map_pmap_check_ledgers(
21643 	pmap_t          pmap,
21644 	ledger_t        ledger,
21645 	int             pid,
21646 	char            *procname)
21647 {
21648 	ledger_amount_t bal;
21649 	boolean_t       do_panic;
21650 
21651 	do_panic = FALSE;
21652 
21653 	pmap_ledgers_drift.num_pmaps_checked++;
21654 
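/*
 * LEDGER_CHECK_BALANCE: report any non-zero balance for the given task
 * ledger and record it in pmap_ledgers_drift.  Request a panic if the
 * ledger is flagged panic-on-negative, or if pmap_ledgers_panic is set
 * with a positive leeway and the imbalance exceeds
 * pmap_ledgers_panic_leeway pages.
 */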
21655 #define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
21656 MACRO_BEGIN                                                             \
21657 	int panic_on_negative = TRUE;                                   \
21658 	ledger_get_balance(ledger,                                      \
21659 	                   task_ledgers.__LEDGER,                       \
21660 	                   &bal);                                       \
21661 	ledger_get_panic_on_negative(ledger,                            \
21662 	                             task_ledgers.__LEDGER,             \
21663 	                             &panic_on_negative);               \
21664 	if (bal != 0) {                                                 \
21665 	        if (panic_on_negative ||                                \
21666 	            (pmap_ledgers_panic &&                              \
21667 	             pmap_ledgers_panic_leeway > 0 &&                   \
21668 	             (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
21669 	              bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
21670 	                do_panic = TRUE;                                \
21671 	        }                                                       \
21672 	        printf("LEDGER BALANCE proc %d (%s) "                   \
21673 	               "\"%s\" = %lld\n",                               \
21674 	               pid, procname, #__LEDGER, bal);                  \
21675 	        if (bal > 0) {                                          \
21676 	                pmap_ledgers_drift.__LEDGER##_over++;           \
21677 	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
21678 	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
21679 	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
21680 	                }                                               \
21681 	        } else if (bal < 0) {                                   \
21682 	                pmap_ledgers_drift.__LEDGER##_under++;          \
21683 	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
21684 	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
21685 	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
21686 	                }                                               \
21687 	        }                                                       \
21688 	}                                                               \
21689 MACRO_END
21690 
21691 	LEDGER_CHECK_BALANCE(phys_footprint);
21692 	LEDGER_CHECK_BALANCE(internal);
21693 	LEDGER_CHECK_BALANCE(internal_compressed);
21694 	LEDGER_CHECK_BALANCE(external);
21695 	LEDGER_CHECK_BALANCE(reusable);
21696 	LEDGER_CHECK_BALANCE(iokit_mapped);
21697 	LEDGER_CHECK_BALANCE(alternate_accounting);
21698 	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
21699 	LEDGER_CHECK_BALANCE(page_table);
21700 	LEDGER_CHECK_BALANCE(purgeable_volatile);
21701 	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
21702 	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
21703 	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
21704 	LEDGER_CHECK_BALANCE(tagged_nofootprint);
21705 	LEDGER_CHECK_BALANCE(tagged_footprint);
21706 	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
21707 	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
21708 	LEDGER_CHECK_BALANCE(network_volatile);
21709 	LEDGER_CHECK_BALANCE(network_nonvolatile);
21710 	LEDGER_CHECK_BALANCE(network_volatile_compressed);
21711 	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
21712 	LEDGER_CHECK_BALANCE(media_nofootprint);
21713 	LEDGER_CHECK_BALANCE(media_footprint);
21714 	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
21715 	LEDGER_CHECK_BALANCE(media_footprint_compressed);
21716 	LEDGER_CHECK_BALANCE(graphics_nofootprint);
21717 	LEDGER_CHECK_BALANCE(graphics_footprint);
21718 	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
21719 	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
21720 	LEDGER_CHECK_BALANCE(neural_nofootprint);
21721 	LEDGER_CHECK_BALANCE(neural_footprint);
21722 	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
21723 	LEDGER_CHECK_BALANCE(neural_footprint_compressed);
21724 
21725 	if (do_panic) {
21726 		if (pmap_ledgers_panic) {
21727 			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
21728 			    pmap, pid, procname);
21729 		} else {
21730 			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
21731 			    pmap, pid, procname);
21732 		}
21733 	}
21734 }
21735 #endif /* MACH_ASSERT */
21736