xref: /xnu-8020.121.3/osfmk/vm/vm_map.c (revision fdd8201d7b966f0c3ea610489d29bd841d358941)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_map.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Virtual memory mapping module.
64  */
65 
66 #include <mach_assert.h>
67 
68 #include <vm/vm_options.h>
69 
70 #include <libkern/OSAtomic.h>
71 
72 #include <mach/kern_return.h>
73 #include <mach/port.h>
74 #include <mach/vm_attributes.h>
75 #include <mach/vm_param.h>
76 #include <mach/vm_behavior.h>
77 #include <mach/vm_statistics.h>
78 #include <mach/memory_object.h>
79 #include <mach/mach_vm.h>
80 #include <machine/cpu_capabilities.h>
81 #include <mach/sdt.h>
82 
83 #include <kern/assert.h>
84 #include <kern/backtrace.h>
85 #include <kern/counter.h>
86 #include <kern/exc_guard.h>
87 #include <kern/kalloc.h>
88 #include <kern/zalloc_internal.h>
89 
90 #include <vm/cpm.h>
91 #include <vm/vm_compressor.h>
92 #include <vm/vm_compressor_pager.h>
93 #include <vm/vm_init.h>
94 #include <vm/vm_fault.h>
95 #include <vm/vm_map_internal.h>
96 #include <vm/vm_object.h>
97 #include <vm/vm_page.h>
98 #include <vm/vm_pageout.h>
99 #include <vm/pmap.h>
100 #include <vm/vm_kern.h>
101 #include <ipc/ipc_port.h>
102 #include <kern/sched_prim.h>
103 #include <kern/misc_protos.h>
104 
105 #include <mach/vm_map_server.h>
106 #include <mach/mach_host_server.h>
107 #include <vm/vm_protos.h>
108 #include <vm/vm_purgeable_internal.h>
109 
110 #include <vm/vm_protos.h>
111 #include <vm/vm_shared_region.h>
112 #include <vm/vm_map_store.h>
113 
114 #include <san/kasan.h>
115 
116 #include <sys/resource.h>
117 #include <sys/codesign.h>
118 #include <sys/mman.h>
119 #include <sys/reboot.h>
120 #include <sys/kdebug_triage.h>
121 
122 #if __LP64__
123 #define HAVE_VM_MAP_RESERVED_ENTRY_ZONE 0
124 #else
125 #define HAVE_VM_MAP_RESERVED_ENTRY_ZONE 1
126 #endif
127 
128 #include <libkern/section_keywords.h>
129 #if DEVELOPMENT || DEBUG
130 extern int proc_selfcsflags(void);
131 int panic_on_unsigned_execute = 0;
132 int panic_on_mlock_failure = 0;
133 #endif /* DEVELOPMENT || DEBUG */
134 
135 #if MACH_ASSERT
136 int debug4k_filter = 0;
137 char debug4k_proc_name[1024] = "";
138 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
139 int debug4k_panic_on_misaligned_sharing = 0;
140 const char *debug4k_category_name[] = {
141 	"error",        /* 0 */
142 	"life",         /* 1 */
143 	"load",         /* 2 */
144 	"fault",        /* 3 */
145 	"copy",         /* 4 */
146 	"share",        /* 5 */
147 	"adjust",       /* 6 */
148 	"pmap",         /* 7 */
149 	"mementry",     /* 8 */
150 	"iokit",        /* 9 */
151 	"upl",          /* 10 */
152 	"exc",          /* 11 */
153 	"vfs"           /* 12 */
154 };
155 #endif /* MACH_ASSERT */
156 int debug4k_no_cow_copyin = 0;
157 
158 
159 #if __arm64__
160 extern const int fourk_binary_compatibility_unsafe;
161 extern const int fourk_binary_compatibility_allow_wx;
162 #endif /* __arm64__ */
163 extern int proc_selfpid(void);
164 extern char *proc_name_address(void *p);
165 
166 #if VM_MAP_DEBUG_APPLE_PROTECT
167 int vm_map_debug_apple_protect = 0;
168 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
169 #if VM_MAP_DEBUG_FOURK
170 int vm_map_debug_fourk = 0;
171 #endif /* VM_MAP_DEBUG_FOURK */
172 
173 #if DEBUG || DEVELOPMENT
174 static TUNABLE(bool, vm_map_executable_immutable,
175     "vm_map_executable_immutable", true);
176 #else
177 #define vm_map_executable_immutable true
178 #endif
179 
180 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
181 
182 extern u_int32_t random(void);  /* from <libkern/libkern.h> */
183 /* Internal prototypes
184  */
185 
186 typedef struct vm_map_zap {
187 	vm_map_entry_t          vmz_head;
188 	vm_map_entry_t         *vmz_tail;
189 } *vm_map_zap_t;
190 
191 #define VM_MAP_ZAP_DECLARE(zap) \
192 	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
193 
194 static vm_map_entry_t   vm_map_entry_insert(
195 	vm_map_t                map,
196 	vm_map_entry_t          insp_entry,
197 	vm_map_offset_t         start,
198 	vm_map_offset_t         end,
199 	vm_object_t             object,
200 	vm_object_offset_t      offset,
201 	vm_map_kernel_flags_t   vmk_flags,
202 	boolean_t               needs_copy,
203 	vm_prot_t               cur_protection,
204 	vm_prot_t               max_protection,
205 	vm_inherit_t            inheritance,
206 	boolean_t               no_cache,
207 	boolean_t               permanent,
208 	boolean_t               no_copy_on_read,
209 	unsigned int            superpage_size,
210 	boolean_t               clear_map_aligned,
211 	boolean_t               is_submap,
212 	boolean_t               used_for_jit,
213 	int                     alias,
214 	boolean_t               translated_allow_execute);
215 
216 static void vm_map_simplify_range(
217 	vm_map_t        map,
218 	vm_map_offset_t start,
219 	vm_map_offset_t end);   /* forward */
220 
221 static boolean_t        vm_map_range_check(
222 	vm_map_t        map,
223 	vm_map_offset_t start,
224 	vm_map_offset_t end,
225 	vm_map_entry_t  *entry);
226 
227 static void vm_map_submap_pmap_clean(
228 	vm_map_t        map,
229 	vm_map_offset_t start,
230 	vm_map_offset_t end,
231 	vm_map_t        sub_map,
232 	vm_map_offset_t offset);
233 
234 static void             vm_map_pmap_enter(
235 	vm_map_t                map,
236 	vm_map_offset_t         addr,
237 	vm_map_offset_t         end_addr,
238 	vm_object_t             object,
239 	vm_object_offset_t      offset,
240 	vm_prot_t               protection);
241 
242 static void             _vm_map_clip_end(
243 	struct vm_map_header    *map_header,
244 	vm_map_entry_t          entry,
245 	vm_map_offset_t         end);
246 
247 static void             _vm_map_clip_start(
248 	struct vm_map_header    *map_header,
249 	vm_map_entry_t          entry,
250 	vm_map_offset_t         start);
251 
252 static kern_return_t    vm_map_delete(
253 	vm_map_t        map,
254 	vm_map_offset_t start,
255 	vm_map_offset_t end,
256 	vmr_flags_t     flags,
257 	vm_map_zap_t    zap);
258 
259 static void             vm_map_copy_insert(
260 	vm_map_t        map,
261 	vm_map_entry_t  after_where,
262 	vm_map_copy_t   copy);
263 
264 static kern_return_t    vm_map_copy_overwrite_unaligned(
265 	vm_map_t        dst_map,
266 	vm_map_entry_t  entry,
267 	vm_map_copy_t   copy,
268 	vm_map_address_t start,
269 	boolean_t       discard_on_success);
270 
271 static kern_return_t    vm_map_copy_overwrite_aligned(
272 	vm_map_t        dst_map,
273 	vm_map_entry_t  tmp_entry,
274 	vm_map_copy_t   copy,
275 	vm_map_offset_t start,
276 	pmap_t          pmap);
277 
278 static kern_return_t    vm_map_copyin_kernel_buffer(
279 	vm_map_t        src_map,
280 	vm_map_address_t src_addr,
281 	vm_map_size_t   len,
282 	boolean_t       src_destroy,
283 	vm_map_copy_t   *copy_result);  /* OUT */
284 
285 static kern_return_t    vm_map_copyout_kernel_buffer(
286 	vm_map_t        map,
287 	vm_map_address_t *addr, /* IN/OUT */
288 	vm_map_copy_t   copy,
289 	vm_map_size_t   copy_size,
290 	boolean_t       overwrite,
291 	boolean_t       consume_on_success);
292 
293 static void             vm_map_fork_share(
294 	vm_map_t        old_map,
295 	vm_map_entry_t  old_entry,
296 	vm_map_t        new_map);
297 
298 static boolean_t        vm_map_fork_copy(
299 	vm_map_t        old_map,
300 	vm_map_entry_t  *old_entry_p,
301 	vm_map_t        new_map,
302 	int             vm_map_copyin_flags);
303 
304 static kern_return_t    vm_map_wire_nested(
305 	vm_map_t                   map,
306 	vm_map_offset_t            start,
307 	vm_map_offset_t            end,
308 	vm_prot_t                  caller_prot,
309 	vm_tag_t                   tag,
310 	boolean_t                  user_wire,
311 	pmap_t                     map_pmap,
312 	vm_map_offset_t            pmap_addr,
313 	ppnum_t                    *physpage_p);
314 
315 static kern_return_t    vm_map_unwire_nested(
316 	vm_map_t                   map,
317 	vm_map_offset_t            start,
318 	vm_map_offset_t            end,
319 	boolean_t                  user_wire,
320 	pmap_t                     map_pmap,
321 	vm_map_offset_t            pmap_addr);
322 
323 static kern_return_t    vm_map_overwrite_submap_recurse(
324 	vm_map_t                   dst_map,
325 	vm_map_offset_t            dst_addr,
326 	vm_map_size_t              dst_size);
327 
328 static kern_return_t    vm_map_copy_overwrite_nested(
329 	vm_map_t                   dst_map,
330 	vm_map_offset_t            dst_addr,
331 	vm_map_copy_t              copy,
332 	boolean_t                  interruptible,
333 	pmap_t                     pmap,
334 	boolean_t                  discard_on_success);
335 
336 static kern_return_t    vm_map_remap_extract(
337 	vm_map_t                map,
338 	vm_map_offset_t         addr,
339 	vm_map_size_t           size,
340 	boolean_t               copy,
341 	struct vm_map_header    *map_header,
342 	vm_prot_t               *cur_protection,
343 	vm_prot_t               *max_protection,
344 	vm_inherit_t            inheritance,
345 	vm_map_kernel_flags_t   vmk_flags);
346 
347 static kern_return_t    vm_map_remap_range_allocate(
348 	vm_map_t                map,
349 	vm_map_address_t        *address,
350 	vm_map_size_t           size,
351 	vm_map_offset_t         mask,
352 	int                     flags,
353 	vm_map_kernel_flags_t   vmk_flags,
354 	vm_tag_t                tag,
355 	vm_map_entry_t          *map_entry,
356 	vm_map_zap_t            zap_list);
357 
358 static void             vm_map_region_look_for_page(
359 	vm_map_t                   map,
360 	vm_map_offset_t            va,
361 	vm_object_t                object,
362 	vm_object_offset_t         offset,
363 	int                        max_refcnt,
364 	unsigned short             depth,
365 	vm_region_extended_info_t  extended,
366 	mach_msg_type_number_t count);
367 
368 static int              vm_map_region_count_obj_refs(
369 	vm_map_entry_t             entry,
370 	vm_object_t                object);
371 
372 
373 static kern_return_t    vm_map_willneed(
374 	vm_map_t        map,
375 	vm_map_offset_t start,
376 	vm_map_offset_t end);
377 
378 static kern_return_t    vm_map_reuse_pages(
379 	vm_map_t        map,
380 	vm_map_offset_t start,
381 	vm_map_offset_t end);
382 
383 static kern_return_t    vm_map_reusable_pages(
384 	vm_map_t        map,
385 	vm_map_offset_t start,
386 	vm_map_offset_t end);
387 
388 static kern_return_t    vm_map_can_reuse(
389 	vm_map_t        map,
390 	vm_map_offset_t start,
391 	vm_map_offset_t end);
392 
393 #if MACH_ASSERT
394 static kern_return_t    vm_map_pageout(
395 	vm_map_t        map,
396 	vm_map_offset_t start,
397 	vm_map_offset_t end);
398 #endif /* MACH_ASSERT */
399 
400 kern_return_t vm_map_corpse_footprint_collect(
401 	vm_map_t        old_map,
402 	vm_map_entry_t  old_entry,
403 	vm_map_t        new_map);
404 void vm_map_corpse_footprint_collect_done(
405 	vm_map_t        new_map);
406 void vm_map_corpse_footprint_destroy(
407 	vm_map_t        map);
408 kern_return_t vm_map_corpse_footprint_query_page_info(
409 	vm_map_t        map,
410 	vm_map_offset_t va,
411 	int             *disposition_p);
412 void vm_map_footprint_query_page_info(
413 	vm_map_t        map,
414 	vm_map_entry_t  map_entry,
415 	vm_map_offset_t curr_s_offset,
416 	int             *disposition_p);
417 
418 pid_t find_largest_process_vm_map_entries(void);
419 
420 extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code,
421     mach_exception_data_type_t subcode);
422 
423 /*
424  * Macros to copy a vm_map_entry. We must be careful to correctly
425  * manage the wired page count. vm_map_entry_copy() creates a new
426  * map entry to the same memory - the wired count in the new entry
427  * must be set to zero. vm_map_entry_copy_full() creates a new
428  * entry that is identical to the old entry.  This preserves the
429  * wire count; it's used for map splitting and zone changing in
430  * vm_map_copyout.
431  */
432 
433 static inline void
vm_map_entry_copy_pmap_cs_assoc(vm_map_t map __unused,vm_map_entry_t new __unused,vm_map_entry_t old __unused)434 vm_map_entry_copy_pmap_cs_assoc(
435 	vm_map_t map __unused,
436 	vm_map_entry_t new __unused,
437 	vm_map_entry_t old __unused)
438 {
439 	/* when pmap_cs is not enabled, assert as a sanity check */
440 	assert(new->pmap_cs_associated == FALSE);
441 }
442 
443 /*
444  * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
445  * But for security reasons on some platforms, we don't want the
446  * new mapping to be "used for jit", so we reset the flag here.
447  */
448 static inline void
vm_map_entry_copy_code_signing(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old __unused)449 vm_map_entry_copy_code_signing(
450 	vm_map_t map,
451 	vm_map_entry_t new,
452 	vm_map_entry_t old __unused)
453 {
454 	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
455 		assert(new->used_for_jit == old->used_for_jit);
456 	} else {
457 		new->used_for_jit = FALSE;
458 	}
459 }
460 
/*
 * Copy every field of "old" into "new", preserving the wired count
 * (contrast with vm_map_entry_copy(), which resets wiring state).
 */
static inline void
vm_map_entry_copy_full(
	vm_map_entry_t new,
	vm_map_entry_t old)
{
#if MAP_ENTRY_CREATION_DEBUG
	/*
	 * The struct copy below duplicates old's btref into new, so:
	 * drop the reference new currently holds, and take an extra
	 * reference on old's before it gets duplicated.
	 */
	btref_put(new->vme_creation_bt);
	btref_retain(old->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	/* same reference juggling for the insertion backtrace */
	btref_put(new->vme_insertion_bt);
	btref_retain(old->vme_insertion_bt);
#endif
	*new = *old;
}
476 
477 static inline void
vm_map_entry_copy(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old)478 vm_map_entry_copy(
479 	vm_map_t map,
480 	vm_map_entry_t new,
481 	vm_map_entry_t old)
482 {
483 	vm_map_entry_copy_full(new, old);
484 
485 	new->is_shared = FALSE;
486 	new->needs_wakeup = FALSE;
487 	new->in_transition = FALSE;
488 	new->wired_count = 0;
489 	new->user_wired_count = 0;
490 	new->permanent = FALSE;
491 	vm_map_entry_copy_code_signing(map, new, old);
492 	vm_map_entry_copy_pmap_cs_assoc(map, new, old);
493 	if (new->iokit_acct) {
494 		assertf(!new->use_pmap, "old %p new %p\n", old, new);
495 		new->iokit_acct = FALSE;
496 		new->use_pmap = TRUE;
497 	}
498 	new->vme_resilient_codesign = FALSE;
499 	new->vme_resilient_media = FALSE;
500 	new->vme_atomic = FALSE;
501 	new->vme_no_copy_on_read = FALSE;
502 }
503 
504 /*
505  * Normal lock_read_to_write() returns FALSE/0 on failure.
506  * These functions evaluate to zero on success and non-zero value on failure.
507  */
508 __attribute__((always_inline))
509 int
vm_map_lock_read_to_write(vm_map_t map)510 vm_map_lock_read_to_write(vm_map_t map)
511 {
512 	if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
513 		DTRACE_VM(vm_map_lock_upgrade);
514 		return 0;
515 	}
516 	return 1;
517 }
518 
519 __attribute__((always_inline))
520 boolean_t
vm_map_try_lock(vm_map_t map)521 vm_map_try_lock(vm_map_t map)
522 {
523 	if (lck_rw_try_lock_exclusive(&(map)->lock)) {
524 		DTRACE_VM(vm_map_lock_w);
525 		return TRUE;
526 	}
527 	return FALSE;
528 }
529 
530 __attribute__((always_inline))
531 boolean_t
vm_map_try_lock_read(vm_map_t map)532 vm_map_try_lock_read(vm_map_t map)
533 {
534 	if (lck_rw_try_lock_shared(&(map)->lock)) {
535 		DTRACE_VM(vm_map_lock_r);
536 		return TRUE;
537 	}
538 	return FALSE;
539 }
540 
541 /*
542  * Routines to get the page size the caller should
543  * use while inspecting the target address space.
544  * Use the "_safely" variant if the caller is dealing with a user-provided
545  * array whose size depends on the page size, to avoid any overflow or
546  * underflow of a user-allocated buffer.
547  */
548 int
vm_self_region_page_shift_safely(vm_map_t target_map)549 vm_self_region_page_shift_safely(
550 	vm_map_t target_map)
551 {
552 	int effective_page_shift = 0;
553 
554 	if (PAGE_SIZE == (4096)) {
555 		/* x86_64 and 4k watches: always use 4k */
556 		return PAGE_SHIFT;
557 	}
558 	/* did caller provide an explicit page size for this thread to use? */
559 	effective_page_shift = thread_self_region_page_shift();
560 	if (effective_page_shift) {
561 		/* use the explicitly-provided page size */
562 		return effective_page_shift;
563 	}
564 	/* no explicit page size: use the caller's page size... */
565 	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
566 	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
567 		/* page size match: safe to use */
568 		return effective_page_shift;
569 	}
570 	/* page size mismatch */
571 	return -1;
572 }
573 int
vm_self_region_page_shift(vm_map_t target_map)574 vm_self_region_page_shift(
575 	vm_map_t target_map)
576 {
577 	int effective_page_shift;
578 
579 	effective_page_shift = vm_self_region_page_shift_safely(target_map);
580 	if (effective_page_shift == -1) {
581 		/* no safe value but OK to guess for caller */
582 		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
583 		    VM_MAP_PAGE_SHIFT(target_map));
584 	}
585 	return effective_page_shift;
586 }
587 
588 
589 /*
590  *	Decide if we want to allow processes to execute from their data or stack areas.
591  *	override_nx() returns true if we do.  Data/stack execution can be enabled independently
592  *	for 32 and 64 bit processes.  Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
593  *	or allow_stack_exec to enable data execution for that type of data area for that particular
594  *	ABI (or both by or'ing the flags together).  These are initialized in the architecture
595  *	specific pmap files since the default behavior varies according to architecture.  The
596  *	main reason it varies is because of the need to provide binary compatibility with old
597  *	applications that were written before these restrictions came into being.  In the old
598  *	days, an app could execute anything it could read, but this has slowly been tightened
599  *	up over time.  The default behavior is:
600  *
601  *	32-bit PPC apps		may execute from both stack and data areas
602  *	32-bit Intel apps	may execute from data areas but not stack
603  *	64-bit PPC/Intel apps	may not execute from either data or stack
604  *
605  *	An application on any architecture may override these defaults by explicitly
606  *	adding PROT_EXEC permission to the page in question with the mprotect(2)
607  *	system call.  This code here just determines what happens when an app tries to
608  *      execute from a page that lacks execute permission.
609  *
610  *	Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
611  *	default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
612  *	a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
613  *	execution from data areas for a particular binary even if the arch normally permits it. As
614  *	a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
615  *	to support some complicated use cases, notably browsers with out-of-process plugins that
616  *	are not all NX-safe.
617  */
618 
619 extern int allow_data_exec, allow_stack_exec;
620 
621 int
override_nx(vm_map_t map,uint32_t user_tag)622 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
623 {
624 	int current_abi;
625 
626 	if (map->pmap == kernel_pmap) {
627 		return FALSE;
628 	}
629 
630 	/*
631 	 * Determine if the app is running in 32 or 64 bit mode.
632 	 */
633 
634 	if (vm_map_is_64bit(map)) {
635 		current_abi = VM_ABI_64;
636 	} else {
637 		current_abi = VM_ABI_32;
638 	}
639 
640 	/*
641 	 * Determine if we should allow the execution based on whether it's a
642 	 * stack or data area and the current architecture.
643 	 */
644 
645 	if (user_tag == VM_MEMORY_STACK) {
646 		return allow_stack_exec & current_abi;
647 	}
648 
649 	return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
650 }
651 
652 
653 /*
654  *	Virtual memory maps provide for the mapping, protection,
655  *	and sharing of virtual memory objects.  In addition,
656  *	this module provides for an efficient virtual copy of
657  *	memory from one map to another.
658  *
659  *	Synchronization is required prior to most operations.
660  *
661  *	Maps consist of an ordered doubly-linked list of simple
662  *	entries; a single hint is used to speed up lookups.
663  *
664  *	Sharing maps have been deleted from this version of Mach.
665  *	All shared objects are now mapped directly into the respective
666  *	maps.  This requires a change in the copy on write strategy;
667  *	the asymmetric (delayed) strategy is used for shared temporary
668  *	objects instead of the symmetric (shadow) strategy.  All maps
669  *	are now "top level" maps (either task map, kernel map or submap
670  *	of the kernel map).
671  *
672  *	Since portions of maps are specified by start/end addresses,
673  *	which may not align with existing map entries, all
674  *	routines merely "clip" entries to these start/end values.
675  *	[That is, an entry is split into two, bordering at a
676  *	start or end value.]  Note that these clippings may not
677  *	always be necessary (as the two resulting entries are then
678  *	not changed); however, the clipping is done for convenience.
679  *	No attempt is currently made to "glue back together" two
680  *	abutting entries.
681  *
682  *	The symmetric (shadow) copy strategy implements virtual copy
683  *	by copying VM object references from one map to
684  *	another, and then marking both regions as copy-on-write.
685  *	It is important to note that only one writeable reference
686  *	to a VM object region exists in any map when this strategy
687  *	is used -- this means that shadow object creation can be
688  *	delayed until a write operation occurs.  The symmetric (delayed)
689  *	strategy allows multiple maps to have writeable references to
690  *	the same region of a vm object, and hence cannot delay creating
691  *	its copy objects.  See vm_object_copy_quickly() in vm_object.c.
692  *	Copying of permanent objects is completely different; see
693  *	vm_object_copy_strategically() in vm_object.c.
694  */
695 
696 static SECURITY_READ_ONLY_LATE(zone_t) vm_map_zone;       /* zone for vm_map structures */
697 static SECURITY_READ_ONLY_LATE(zone_t) vm_map_copy_zone;  /* zone for vm_map_copy structures */
698 
699 SECURITY_READ_ONLY_LATE(zone_t)        vm_map_entry_zone; /* zone for vm_map_entry structures */
700 SECURITY_READ_ONLY_LATE(zone_t)        vm_map_holes_zone; /* zone for vm map holes (vm_map_links) structures */
701 #if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
702 SECURITY_READ_ONLY_LATE(zone_t)        vm_map_entry_reserved_zone;
703 #endif /* HAVE_VM_MAP_RESERVED_ENTRY_ZONE */
704 
705 #define VM_MAP_ZONE_NAME "maps"
706 #define VM_MAP_ZFLAGS ( \
707 	ZC_NOENCRYPT | \
708 	ZC_NOGZALLOC | \
709 	ZC_VM_LP64)
710 
711 #define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
712 #define VM_MAP_ENTRY_ZFLAGS ( \
713 	ZC_NOENCRYPT | \
714 	ZC_CACHING | \
715 	ZC_NOGZALLOC | \
716 	ZC_KASAN_NOQUARANTINE | \
717 	ZC_VM_LP64)
718 
719 #if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
720 #define VM_MAP_ENTRY_RESERVED_ZONE_NAME "Reserved VM map entries"
721 #define VM_MAP_ENTRY_RESERVED_ZFLAGS ( \
722 	ZC_NOENCRYPT | \
723 	ZC_NOCACHING | \
724 	ZC_NOGZALLOC | \
725 	ZC_KASAN_NOQUARANTINE | \
726 	ZC_VM)
727 #endif /* HAVE_VM_MAP_RESERVED_ENTRY_ZONE */
728 
729 #define VM_MAP_HOLES_ZONE_NAME "VM map holes"
730 #define VM_MAP_HOLES_ZFLAGS ( \
731 	ZC_NOENCRYPT | \
732 	ZC_CACHING | \
733 	ZC_NOGZALLOC | \
734 	ZC_KASAN_NOQUARANTINE | \
735 	ZC_VM_LP64)
736 
737 /*
738  * Asserts that a vm_map_copy object is coming from the
739  * vm_map_copy_zone to ensure that it isn't a fake constructed
740  * anywhere else.
741  */
void
vm_map_copy_require(struct vm_map_copy *copy)
{
	/* rejects "copy" objects not allocated from vm_map_copy_zone */
	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
}
747 
/*
 *	vm_map_require:
 *
 *	Ensures that the argument is memory allocated from the genuine
 *	vm map zone. (See zone_id_require.)
 */
void
vm_map_require(vm_map_t map)
{
	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
}
759 
760 #define VM_MAP_EARLY_COUNT_MAX         16
761 static __startup_data vm_offset_t      map_data;
762 static __startup_data vm_size_t        map_data_size;
763 static __startup_data vm_offset_t      kentry_data;
764 static __startup_data vm_size_t        kentry_data_size;
765 static __startup_data vm_offset_t      map_holes_data;
766 static __startup_data vm_size_t        map_holes_data_size;
767 static __startup_data vm_map_t        *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
768 static __startup_data uint32_t         early_map_count;
769 
770 #if XNU_TARGET_OS_OSX
771 #define         NO_COALESCE_LIMIT  ((1024 * 128) - 1)
772 #else /* XNU_TARGET_OS_OSX */
773 #define         NO_COALESCE_LIMIT  0
774 #endif /* XNU_TARGET_OS_OSX */
775 
776 /* Skip acquiring locks if we're in the midst of a kernel core dump */
777 unsigned int not_in_kdp = 1;
778 
779 unsigned int vm_map_set_cache_attr_count = 0;
780 
781 kern_return_t
vm_map_set_cache_attr(vm_map_t map,vm_map_offset_t va)782 vm_map_set_cache_attr(
783 	vm_map_t        map,
784 	vm_map_offset_t va)
785 {
786 	vm_map_entry_t  map_entry;
787 	vm_object_t     object;
788 	kern_return_t   kr = KERN_SUCCESS;
789 
790 	vm_map_lock_read(map);
791 
792 	if (!vm_map_lookup_entry(map, va, &map_entry) ||
793 	    map_entry->is_sub_map) {
794 		/*
795 		 * that memory is not properly mapped
796 		 */
797 		kr = KERN_INVALID_ARGUMENT;
798 		goto done;
799 	}
800 	object = VME_OBJECT(map_entry);
801 
802 	if (object == VM_OBJECT_NULL) {
803 		/*
804 		 * there should be a VM object here at this point
805 		 */
806 		kr = KERN_INVALID_ARGUMENT;
807 		goto done;
808 	}
809 	vm_object_lock(object);
810 	object->set_cache_attr = TRUE;
811 	vm_object_unlock(object);
812 
813 	vm_map_set_cache_attr_count++;
814 done:
815 	vm_map_unlock_read(map);
816 
817 	return kr;
818 }
819 
820 
821 #if CONFIG_CODE_DECRYPTION
822 /*
823  * vm_map_apple_protected:
824  * This remaps the requested part of the object with an object backed by
825  * the decrypting pager.
826  * crypt_info contains entry points and session data for the crypt module.
827  * The crypt_info block will be copied by vm_map_apple_protected. The data structures
828  * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
829  */
830 kern_return_t
vm_map_apple_protected(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_object_offset_t crypto_backing_offset,struct pager_crypt_info * crypt_info,uint32_t cryptid)831 vm_map_apple_protected(
832 	vm_map_t                map,
833 	vm_map_offset_t         start,
834 	vm_map_offset_t         end,
835 	vm_object_offset_t      crypto_backing_offset,
836 	struct pager_crypt_info *crypt_info,
837 	uint32_t                cryptid)
838 {
839 	boolean_t       map_locked;
840 	kern_return_t   kr;
841 	vm_map_entry_t  map_entry;
842 	struct vm_map_entry tmp_entry;
843 	memory_object_t unprotected_mem_obj;
844 	vm_object_t     protected_object;
845 	vm_map_offset_t map_addr;
846 	vm_map_offset_t start_aligned, end_aligned;
847 	vm_object_offset_t      crypto_start, crypto_end;
848 	int             vm_flags;
849 	vm_map_kernel_flags_t vmk_flags;
850 	boolean_t       cache_pager;
851 
852 	vm_flags = 0;
853 	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
854 
855 	map_locked = FALSE;
856 	unprotected_mem_obj = MEMORY_OBJECT_NULL;
857 
858 	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
859 	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
860 	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
861 	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
862 
863 #if __arm64__
864 	/*
865 	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
866 	 * so we might have to loop and establish up to 3 mappings:
867 	 *
868 	 * + the first 16K-page, which might overlap with the previous
869 	 *   4K-aligned mapping,
870 	 * + the center,
871 	 * + the last 16K-page, which might overlap with the next
872 	 *   4K-aligned mapping.
873 	 * Each of these mapping might be backed by a vnode pager (if
874 	 * properly page-aligned) or a "fourk_pager", itself backed by a
875 	 * vnode pager (if 4K-aligned but not page-aligned).
876 	 */
877 #endif /* __arm64__ */
878 
879 	map_addr = start_aligned;
880 	for (map_addr = start_aligned;
881 	    map_addr < end;
882 	    map_addr = tmp_entry.vme_end) {
883 		vm_map_lock(map);
884 		map_locked = TRUE;
885 
886 		/* lookup the protected VM object */
887 		if (!vm_map_lookup_entry(map,
888 		    map_addr,
889 		    &map_entry) ||
890 		    map_entry->is_sub_map ||
891 		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
892 			/* that memory is not properly mapped */
893 			kr = KERN_INVALID_ARGUMENT;
894 			goto done;
895 		}
896 
897 		/* ensure mapped memory is mapped as executable, except
898 		 * for the model decryption flow */
899 		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
900 		    !(map_entry->protection & VM_PROT_EXECUTE)) {
901 			kr = KERN_INVALID_ARGUMENT;
902 			goto done;
903 		}
904 
905 		/* get the protected object to be decrypted */
906 		protected_object = VME_OBJECT(map_entry);
907 		if (protected_object == VM_OBJECT_NULL) {
908 			/* there should be a VM object here at this point */
909 			kr = KERN_INVALID_ARGUMENT;
910 			goto done;
911 		}
912 		/* ensure protected object stays alive while map is unlocked */
913 		vm_object_reference(protected_object);
914 
915 		/* limit the map entry to the area we want to cover */
916 		vm_map_clip_start(map, map_entry, start_aligned);
917 		vm_map_clip_end(map, map_entry, end_aligned);
918 
919 		tmp_entry = *map_entry;
920 		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
921 		vm_map_unlock(map);
922 		map_locked = FALSE;
923 
924 		/*
925 		 * This map entry might be only partially encrypted
926 		 * (if not fully "page-aligned").
927 		 */
928 		crypto_start = 0;
929 		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
930 		if (tmp_entry.vme_start < start) {
931 			if (tmp_entry.vme_start != start_aligned) {
932 				kr = KERN_INVALID_ADDRESS;
933 			}
934 			crypto_start += (start - tmp_entry.vme_start);
935 		}
936 		if (tmp_entry.vme_end > end) {
937 			if (tmp_entry.vme_end != end_aligned) {
938 				kr = KERN_INVALID_ADDRESS;
939 			}
940 			crypto_end -= (tmp_entry.vme_end - end);
941 		}
942 
943 		/*
944 		 * This "extra backing offset" is needed to get the decryption
945 		 * routine to use the right key.  It adjusts for the possibly
946 		 * relative offset of an interposed "4K" pager...
947 		 */
948 		if (crypto_backing_offset == (vm_object_offset_t) -1) {
949 			crypto_backing_offset = VME_OFFSET(&tmp_entry);
950 		}
951 
952 		cache_pager = TRUE;
953 #if XNU_TARGET_OS_OSX
954 		if (vm_map_is_alien(map)) {
955 			cache_pager = FALSE;
956 		}
957 #endif /* XNU_TARGET_OS_OSX */
958 
959 		/*
960 		 * Lookup (and create if necessary) the protected memory object
961 		 * matching that VM object.
962 		 * If successful, this also grabs a reference on the memory object,
963 		 * to guarantee that it doesn't go away before we get a chance to map
964 		 * it.
965 		 */
966 		unprotected_mem_obj = apple_protect_pager_setup(
967 			protected_object,
968 			VME_OFFSET(&tmp_entry),
969 			crypto_backing_offset,
970 			crypt_info,
971 			crypto_start,
972 			crypto_end,
973 			cache_pager);
974 
975 		/* release extra ref on protected object */
976 		vm_object_deallocate(protected_object);
977 
978 		if (unprotected_mem_obj == NULL) {
979 			kr = KERN_FAILURE;
980 			goto done;
981 		}
982 
983 		vm_flags = VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE;
984 		/* can overwrite an immutable mapping */
985 		vmk_flags.vmkf_overwrite_immutable = TRUE;
986 #if __arm64__
987 		if (tmp_entry.used_for_jit &&
988 		    (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT ||
989 		    PAGE_SHIFT != FOURK_PAGE_SHIFT) &&
990 		    fourk_binary_compatibility_unsafe &&
991 		    fourk_binary_compatibility_allow_wx) {
992 			printf("** FOURK_COMPAT [%d]: "
993 			    "allowing write+execute at 0x%llx\n",
994 			    proc_selfpid(), tmp_entry.vme_start);
995 			vmk_flags.vmkf_map_jit = TRUE;
996 		}
997 #endif /* __arm64__ */
998 
999 		/* map this memory object in place of the current one */
1000 		map_addr = tmp_entry.vme_start;
1001 		kr = vm_map_enter_mem_object(map,
1002 		    &map_addr,
1003 		    (tmp_entry.vme_end -
1004 		    tmp_entry.vme_start),
1005 		    (mach_vm_offset_t) 0,
1006 		    vm_flags,
1007 		    vmk_flags,
1008 		    VM_KERN_MEMORY_NONE,
1009 		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1010 		    0,
1011 		    TRUE,
1012 		    tmp_entry.protection,
1013 		    tmp_entry.max_protection,
1014 		    tmp_entry.inheritance);
1015 		assertf(kr == KERN_SUCCESS,
1016 		    "kr = 0x%x\n", kr);
1017 		assertf(map_addr == tmp_entry.vme_start,
1018 		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1019 		    (uint64_t)map_addr,
1020 		    (uint64_t) tmp_entry.vme_start,
1021 		    &tmp_entry);
1022 
1023 #if VM_MAP_DEBUG_APPLE_PROTECT
1024 		if (vm_map_debug_apple_protect) {
1025 			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1026 			    " backing:[object:%p,offset:0x%llx,"
1027 			    "crypto_backing_offset:0x%llx,"
1028 			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1029 			    map,
1030 			    (uint64_t) map_addr,
1031 			    (uint64_t) (map_addr + (tmp_entry.vme_end -
1032 			    tmp_entry.vme_start)),
1033 			    unprotected_mem_obj,
1034 			    protected_object,
1035 			    VME_OFFSET(&tmp_entry),
1036 			    crypto_backing_offset,
1037 			    crypto_start,
1038 			    crypto_end);
1039 		}
1040 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1041 
1042 		/*
1043 		 * Release the reference obtained by
1044 		 * apple_protect_pager_setup().
1045 		 * The mapping (if it succeeded) is now holding a reference on
1046 		 * the memory object.
1047 		 */
1048 		memory_object_deallocate(unprotected_mem_obj);
1049 		unprotected_mem_obj = MEMORY_OBJECT_NULL;
1050 
1051 		/* continue with next map entry */
1052 		crypto_backing_offset += (tmp_entry.vme_end -
1053 		    tmp_entry.vme_start);
1054 		crypto_backing_offset -= crypto_start;
1055 	}
1056 	kr = KERN_SUCCESS;
1057 
1058 done:
1059 	if (map_locked) {
1060 		vm_map_unlock(map);
1061 	}
1062 	return kr;
1063 }
1064 #endif  /* CONFIG_CODE_DECRYPTION */
1065 
1066 
/* Lock group and attributes shared by all vm_map locks. */
LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);

/*
 * Whether malloc-tagged anonymous memory should be mapped no-copy-on-write.
 * Defaults off on macOS and on elsewhere; overridable with the
 * "malloc_no_cow" boot-arg (parsed in vm_map_init()).
 */
#if XNU_TARGET_OS_OSX
int malloc_no_cow = 0;
#else /* XNU_TARGET_OS_OSX */
int malloc_no_cow = 1;
#endif /* XNU_TARGET_OS_OSX */
/* Bitmask of VM_MEMORY_MALLOC* tags subject to no-COW; built in vm_map_init(). */
uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
#if DEBUG
/* Set via the "vm_check_map_sanity" boot-arg (see vm_map_init()). */
int vm_check_map_sanity = 0;
#endif
1080 
1081 /*
1082  *	vm_map_init:
1083  *
1084  *	Initialize the vm_map module.  Must be called before
1085  *	any other vm_map routines.
1086  *
1087  *	Map and entry structures are allocated from zones -- we must
1088  *	initialize those zones.
1089  *
1090  *	There are three zones of interest:
1091  *
1092  *	vm_map_zone:		used to allocate maps.
1093  *	vm_map_entry_zone:	used to allocate map entries.
1094  *
1095  *	LP32:
1096  *	vm_map_entry_reserved_zone:     fallback zone for kernel map entries
1097  *
1098  *	The kernel allocates map entries from a special zone that is initially
1099  *	"crammed" with memory.  It would be difficult (perhaps impossible) for
1100  *	the kernel to allocate more memory to a entry zone when it became
1101  *	empty since the very act of allocating memory implies the creation
1102  *	of a new entry.
1103  */
1104 __startup_func
1105 void
vm_map_init(void)1106 vm_map_init(void)
1107 {
1108 
1109 #if MACH_ASSERT
1110 	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1111 	    sizeof(debug4k_filter));
1112 #endif /* MACH_ASSERT */
1113 
1114 	vm_map_zone = zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1115 	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1116 
1117 	/*
1118 	 * Don't quarantine because we always need elements available
1119 	 * Disallow GC on this zone... to aid the GC.
1120 	 */
1121 	vm_map_entry_zone = zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1122 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1123 	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1124 		z->z_elems_rsv = (uint16_t)(32 *
1125 		(ml_early_cpu_max_number() + 1));
1126 	});
1127 #if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
1128 	vm_map_entry_reserved_zone = zone_create(VM_MAP_ENTRY_RESERVED_ZONE_NAME,
1129 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_RESERVED_ZFLAGS);
1130 #endif /* HAVE_VM_MAP_RESERVED_ENTRY_ZONE */
1131 
1132 	vm_map_holes_zone = zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1133 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1134 	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1135 		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_size(z));
1136 	});
1137 
1138 	vm_map_copy_zone = zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1139 	    ZC_NOENCRYPT | ZC_CACHING, ZONE_ID_VM_MAP_COPY, NULL);
1140 
1141 	/*
1142 	 * Add the stolen memory to zones, adjust zone size and stolen counts.
1143 	 */
1144 	zone_cram_early(vm_map_zone, map_data, map_data_size);
1145 	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1146 	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1147 	printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1148 	    vm_map_zone->z_elems_free,
1149 	    vm_map_entry_zone->z_elems_free,
1150 	    vm_map_holes_zone->z_elems_free);
1151 
1152 	/*
1153 	 * Since these are covered by zones, remove them from stolen page accounting.
1154 	 */
1155 	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1156 
1157 #if VM_MAP_DEBUG_APPLE_PROTECT
1158 	PE_parse_boot_argn("vm_map_debug_apple_protect",
1159 	    &vm_map_debug_apple_protect,
1160 	    sizeof(vm_map_debug_apple_protect));
1161 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1162 #if VM_MAP_DEBUG_APPLE_FOURK
1163 	PE_parse_boot_argn("vm_map_debug_fourk",
1164 	    &vm_map_debug_fourk,
1165 	    sizeof(vm_map_debug_fourk));
1166 #endif /* VM_MAP_DEBUG_FOURK */
1167 
1168 	PE_parse_boot_argn("malloc_no_cow",
1169 	    &malloc_no_cow,
1170 	    sizeof(malloc_no_cow));
1171 	if (malloc_no_cow) {
1172 		vm_memory_malloc_no_cow_mask = 0ULL;
1173 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1174 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1175 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1176 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1177 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1178 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1179 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1180 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1181 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1182 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1183 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1184 		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1185 		    &vm_memory_malloc_no_cow_mask,
1186 		    sizeof(vm_memory_malloc_no_cow_mask));
1187 	}
1188 
1189 #if DEBUG
1190 	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1191 	if (vm_check_map_sanity) {
1192 		kprintf("VM sanity checking enabled\n");
1193 	} else {
1194 		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1195 	}
1196 #endif /* DEBUG */
1197 
1198 #if DEVELOPMENT || DEBUG
1199 	PE_parse_boot_argn("panic_on_unsigned_execute",
1200 	    &panic_on_unsigned_execute,
1201 	    sizeof(panic_on_unsigned_execute));
1202 	PE_parse_boot_argn("panic_on_mlock_failure",
1203 	    &panic_on_mlock_failure,
1204 	    sizeof(panic_on_mlock_failure));
1205 #endif /* DEVELOPMENT || DEBUG */
1206 }
1207 
__startup_func
static void
vm_map_steal_memory(void)
{
	/*
	 * We need to reserve enough memory to support bootstrapping VM maps
	 * and the zone subsystem.
	 *
	 * The VM Maps that need to function before zones can support them
	 * are the ones registered with vm_map_will_allocate_early_map(),
	 * which are:
	 * - the kernel map
	 * - the various submaps used by zones (pgz, meta, ...)
	 *
	 * We also need enough entries and holes to support them
	 * until zone_metadata_init() is called, which is when
	 * the zone allocator becomes capable of expanding dynamically.
	 *
	 * We need:
	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
	 * - To allow for 3-4 entries per map, but the kernel map
	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
	 *   to describe the submaps, so double it (and make it 8x too)
	 * - To allow for holes between entries,
	 *   hence needs the same budget as entries
	 */
	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
	    VM_MAP_EARLY_COUNT_MAX);

	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	/*
	 * Steal a contiguous range of memory so that a simple range check
	 * can validate early addresses being freed/crammed to these
	 * zones.
	 *
	 * The three regions are laid out back-to-back:
	 * [ maps | entries | holes ]
	 */
	map_data       = zone_early_mem_init(map_data_size + kentry_data_size +
	    map_holes_data_size);
	kentry_data    = map_data + map_data_size;
	map_holes_data = kentry_data + kentry_data_size;
}
STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1257 
/*
 * Log how much of the early "crammed" zone memory remains once the
 * zone allocator is fully bootstrapped.
 * (Name kept as-is — sic "boostraped" — it is referenced by STARTUP below.)
 */
__startup_func
static void
vm_kernel_boostraped(void)
{
	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
	    vm_map_zone->z_elems_free,
	    vm_map_entry_zone->z_elems_free,
	    vm_map_holes_zone->z_elems_free);
}
STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1268 
1269 void
vm_map_disable_hole_optimization(vm_map_t map)1270 vm_map_disable_hole_optimization(vm_map_t map)
1271 {
1272 	vm_map_entry_t  head_entry, hole_entry, next_hole_entry;
1273 
1274 	if (map->holelistenabled) {
1275 		head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1276 
1277 		while (hole_entry != NULL) {
1278 			next_hole_entry = hole_entry->vme_next;
1279 
1280 			hole_entry->vme_next = NULL;
1281 			hole_entry->vme_prev = NULL;
1282 			zfree(vm_map_holes_zone, hole_entry);
1283 
1284 			if (next_hole_entry == head_entry) {
1285 				hole_entry = NULL;
1286 			} else {
1287 				hole_entry = next_hole_entry;
1288 			}
1289 		}
1290 
1291 		map->holes_list = NULL;
1292 		map->holelistenabled = FALSE;
1293 
1294 		map->first_free = vm_map_first_entry(map);
1295 		SAVE_HINT_HOLE_WRITE(map, NULL);
1296 	}
1297 }
1298 
1299 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1300 vm_kernel_map_is_kernel(vm_map_t map)
1301 {
1302 	return map->pmap == kernel_pmap;
1303 }
1304 
1305 /*
1306  *	vm_map_create:
1307  *
1308  *	Creates and returns a new empty VM map with
1309  *	the given physical map structure, and having
1310  *	the given lower and upper address bounds.
1311  */
1312 
1313 extern vm_map_t vm_map_create_external(
1314 	pmap_t                  pmap,
1315 	vm_map_offset_t         min_off,
1316 	vm_map_offset_t         max_off,
1317 	boolean_t               pageable);
1318 
1319 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1320 vm_map_create_external(
1321 	pmap_t                  pmap,
1322 	vm_map_offset_t         min,
1323 	vm_map_offset_t         max,
1324 	boolean_t               pageable)
1325 {
1326 	vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1327 
1328 	if (pageable) {
1329 		options |= VM_MAP_CREATE_PAGEABLE;
1330 	}
1331 	return vm_map_create_options(pmap, min, max, options);
1332 }
1333 
1334 __startup_func
1335 void
vm_map_will_allocate_early_map(vm_map_t * owner)1336 vm_map_will_allocate_early_map(vm_map_t *owner)
1337 {
1338 	if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1339 		panic("VM_MAP_EARLY_COUNT_MAX is too low");
1340 	}
1341 
1342 	early_map_owners[early_map_count++] = owner;
1343 }
1344 
1345 __startup_func
1346 void
vm_map_relocate_early_maps(vm_offset_t delta)1347 vm_map_relocate_early_maps(vm_offset_t delta)
1348 {
1349 	for (uint32_t i = 0; i < early_map_count; i++) {
1350 		vm_address_t addr = (vm_address_t)*early_map_owners[i];
1351 
1352 		*early_map_owners[i] = (vm_map_t)(addr + delta);
1353 	}
1354 
1355 	early_map_count = ~0u;
1356 }
1357 
/*
 *	Routine:	vm_map_relocate_early_elem
 *
 *	Purpose:
 *		Early zone elements are allocated in a temporary part
 *		of the address space.
 *
 *		Once the zones live in their final place, the early
 *		VM maps, map entries and map holes need to be relocated.
 *
 *		It involves rewriting any vm_map_t, vm_map_entry_t or
 *		pointers to vm_map_links. Other pointers to other types
 *		are fine.
 *
 *		Fortunately, pointers to those types are self-contained
 *		in those zones, _except_ for pointers to VM maps,
 *		which are tracked during early boot and fixed with
 *		vm_map_relocate_early_maps().
 */
__startup_func
void
vm_map_relocate_early_elem(
	uint32_t                zone_id,
	vm_offset_t             new_addr,
	vm_offset_t             delta)
{
/*
 * Slide the pointer stored in `field` of the element at new_addr by
 * `delta`, leaving NULL pointers untouched.
 */
#define relocate(type_t, field)  ({ \
	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field;   \
	if (*__field) {                                                        \
	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta);  \
	}                                                                      \
})

	/* only the three early VM zones may contain relocatable elements */
	switch (zone_id) {
	case ZONE_ID_VM_MAP:
	case ZONE_ID_VM_MAP_ENTRY:
	case ZONE_ID_VM_MAP_HOLES:
		break;

	default:
		panic("Unexpected zone ID %d", zone_id);
	}

	if (zone_id == ZONE_ID_VM_MAP) {
		/* fix the map header's entry links and lookup hints */
		relocate(vm_map_t, hdr.links.prev);
		relocate(vm_map_t, hdr.links.next);
		/* early maps are all kernel maps; pin the pmap explicitly */
		((vm_map_t)new_addr)->pmap = kernel_pmap;
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
#endif /* VM_MAP_STORE_USE_RB */
		relocate(vm_map_t, hint);
		relocate(vm_map_t, hole_hint);
		relocate(vm_map_t, first_free);
		return;
	}

	/* entries and holes both start with a struct vm_map_links */
	relocate(struct vm_map_links *, prev);
	relocate(struct vm_map_links *, next);

	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_entry_t, store.entry.rbe_left);
		relocate(vm_map_entry_t, store.entry.rbe_right);
		relocate(vm_map_entry_t, store.entry.rbe_parent);
#endif /* VM_MAP_STORE_USE_RB */
		if (((vm_map_entry_t)new_addr)->is_sub_map) {
			/* no object to relocate because we haven't made any */
			relocate(vm_map_entry_t, vme_object.vmo_submap);
		}
#if MAP_ENTRY_CREATION_DEBUG
		relocate(vm_map_entry_t, vme_creation_maphdr);
#endif /* MAP_ENTRY_CREATION_DEBUG */
	}

#undef relocate
}
1434 
/*
 * Allocate and initialize a new vm_map covering [min, max) on top of
 * "pmap", configured per "options".  Returns the map with one reference.
 */
vm_map_t
vm_map_create_options(
	pmap_t                  pmap,
	vm_map_offset_t         min,
	vm_map_offset_t         max,
	vm_map_create_options_t options)
{
	vm_map_t result;

#if DEBUG || DEVELOPMENT
	/*
	 * Before zalloc is fully up, every map allocated here must have been
	 * pre-registered via vm_map_will_allocate_early_map() and must use
	 * the kernel pmap (see vm_map_relocate_early_elem()).
	 */
	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
		if (early_map_count != ~0u && early_map_count !=
		    zone_count_allocated(vm_map_zone) + 1) {
			panic("allocating %dth early map, owner not known",
			    zone_count_allocated(vm_map_zone) + 1);
		}
		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
			panic("allocating %dth early map for non kernel pmap",
			    early_map_count);
		}
	}
#endif /* DEBUG || DEVELOPMENT */

	result = zalloc_flags(vm_map_zone, Z_WAITOK | Z_NOFAIL | Z_ZERO);

	/* empty map: entry list is just the header pointing at itself */
	vm_map_first_entry(result) = vm_map_to_entry(result);
	vm_map_last_entry(result)  = vm_map_to_entry(result);

	vm_map_store_init(&result->hdr);
	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
	vm_map_set_page_shift(result, PAGE_SHIFT);

	result->size_limit = RLIM_INFINITY;             /* default unlimited */
	result->data_limit = RLIM_INFINITY;             /* default unlimited */
	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
	result->pmap = pmap;
	result->min_offset = min;
	result->max_offset = max;
	result->first_free = vm_map_to_entry(result);
	result->hint = vm_map_to_entry(result);

	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
		assert(pmap == kernel_pmap);
		result->never_faults = true;
	}

	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
		result->has_corpse_footprint = true;
	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
		/* seed the hole list with one hole spanning the whole map */
		struct vm_map_links *hole_entry = zalloc(vm_map_holes_zone);

		hole_entry->start = min;
#if defined(__arm__) || defined(__arm64__)
		hole_entry->end = result->max_offset;
#else
		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
#endif
		result->holes_list = result->hole_hint = hole_entry;
		/* circular list of one element */
		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
		result->holelistenabled = true;
	}

	vm_map_lock_init(result);

	return result;
}
1503 
1504 /*
1505  * Adjusts a submap that was made by kmem_suballoc()
1506  * before it knew where it would be mapped,
1507  * so that it has the right min/max offsets.
1508  *
1509  * We do not need to hold any locks:
1510  * only the caller knows about this map,
1511  * and it is not published on any entry yet.
1512  */
1513 static void
vm_map_adjust_offsets(vm_map_t map,vm_map_offset_t min_off,vm_map_offset_t max_off)1514 vm_map_adjust_offsets(
1515 	vm_map_t                map,
1516 	vm_map_offset_t         min_off,
1517 	vm_map_offset_t         max_off)
1518 {
1519 	assert(map->min_offset == 0);
1520 	assert(map->max_offset == max_off - min_off);
1521 	assert(map->hdr.nentries == 0);
1522 	assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1523 
1524 	map->min_offset = min_off;
1525 	map->max_offset = max_off;
1526 
1527 	if (map->holelistenabled) {
1528 		struct vm_map_links *hole = map->holes_list;
1529 
1530 		hole->start = min_off;
1531 #if defined(__arm__) || defined(__arm64__)
1532 		hole->end = max_off;
1533 #else
1534 		hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1535 #endif
1536 	}
1537 }
1538 
1539 
1540 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1541 vm_map_adjusted_size(vm_map_t map)
1542 {
1543 	struct vm_reserved_region *regions = NULL;
1544 	size_t num_regions = 0;
1545 	mach_vm_size_t  reserved_size = 0, map_size = 0;
1546 
1547 	if (map == NULL || (map->size == 0)) {
1548 		return 0;
1549 	}
1550 
1551 	map_size = map->size;
1552 
1553 	if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1554 		/*
1555 		 * No special reserved regions or not an exotic map or the task
1556 		 * is terminating and these special regions might have already
1557 		 * been deallocated.
1558 		 */
1559 		return map_size;
1560 	}
1561 
1562 	num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1563 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1564 
1565 	while (num_regions) {
1566 		reserved_size += regions[--num_regions].vmrr_size;
1567 	}
1568 
1569 	/*
1570 	 * There are a few places where the map is being switched out due to
1571 	 * 'termination' without that bit being set (e.g. exec and corpse purging).
1572 	 * In those cases, we could have the map's regions being deallocated on
1573 	 * a core while some accounting process is trying to get the map's size.
1574 	 * So this assert can't be enabled till all those places are uniform in
1575 	 * their use of the 'map->terminated' bit.
1576 	 *
1577 	 * assert(map_size >= reserved_size);
1578 	 */
1579 
1580 	return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1581 }
1582 
/*
 *	vm_map_entry_create:	[ internal use only ]
 *
 *	Allocates a VM map entry for insertion in the
 *	given map (or map copy).  No fields are filled.
 *
 *	The VM entry will be zero initialized, except for:
 *	- behavior set to VM_BEHAVIOR_DEFAULT
 *	- inheritance set to VM_INHERIT_DEFAULT
 */
#define vm_map_entry_create(map)    _vm_map_entry_create(&(map)->hdr)

/* Same allocator, but for entries destined for a vm_map_copy. */
#define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1596 
/*
 * Allocate a zeroed vm_map_entry, falling back to the reserved zone
 * when allocating on behalf of the map-entry zone's own submap (which
 * could otherwise deadlock on the map lock).  See vm_map_entry_create.
 */
static vm_map_entry_t
_vm_map_entry_create(
	struct vm_map_header    *map_header __unused)
{
	vm_map_entry_t  entry = NULL;
	zone_t zone = vm_map_entry_zone;

#if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
	zone_security_flags_t zsflags = zone_security_array[ZONE_ID_VM_MAP_ENTRY];
	if (map_header == &zone_submap(zsflags)->hdr) {
		/*
		 * If we are trying to allocate an entry for the submap
		 * of the vm_map_entry_zone, then this can cause recursive
		 * locking of this map.
		 *
		 * Try to allocate _without blocking_ from this zone,
		 * but if it is depleted, we need to go to the
		 * vm_map_entry_reserved_zone which is in the zalloc
		 * "VM" submap, which can grow without taking any map lock.
		 *
		 * Note: the vm_map_entry_zone has a rather high "reserve"
		 * setup in order to minimize usage of the reserved one.
		 */
		entry = zalloc_flags(vm_map_entry_zone, Z_NOWAIT | Z_ZERO);
		/* if the non-blocking attempt failed, fall back below */
		zone = vm_map_entry_reserved_zone;
	}
#endif
	if (entry == NULL) {
		entry = zalloc_flags(zone, Z_WAITOK | Z_ZERO);
	}

	/*
	 * Help the compiler with what we know to be true,
	 * so that the further bitfields inits have good codegen.
	 *
	 * See rdar://87041299
	 */
	__builtin_assume(entry->vme_object.vmo_object == NULL);
#if __LP64__
	__builtin_assume(*(uint64_t *)(&entry->vme_object + 1) == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object + 2) == 0);
#else
	__builtin_assume(*(uint32_t *)(&entry->vme_object + 1) == 0);
	__builtin_assume(*(uint32_t *)(&entry->vme_object + 2) == 0);
	__builtin_assume(*(uint32_t *)(&entry->vme_object + 3) == 0);
	__builtin_assume(*(uint32_t *)(&entry->vme_object + 4) == 0);
#endif

	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
	    "VME_ALIAS_MASK covers tags");

	static_assert(VM_BEHAVIOR_DEFAULT == 0,
	    "can skip zeroing of the behavior field");
	entry->inheritance = VM_INHERIT_DEFAULT;

	vm_map_store_update((vm_map_t) NULL, entry, VM_MAP_ENTRY_CREATE);

#if MAP_ENTRY_CREATION_DEBUG
	/* record who created this entry, for debugging */
	entry->vme_creation_maphdr = map_header;
	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
	    BTREF_GET_NOWAIT);
#endif
	return entry;
}
1661 
/*
 *	vm_map_entry_dispose:	[ internal use only ]
 *
 *	Inverse of vm_map_entry_create.
 *
 *      write map lock held so no need to
 *	do anything special to insure correctness
 *      of the stores
 */
static void
vm_map_entry_dispose(
	vm_map_entry_t          entry)
{
#if MAP_ENTRY_CREATION_DEBUG
	btref_put(entry->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(entry->vme_insertion_bt);
#endif
#if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
	/*
	 * The entry may have come from either vm_map_entry_zone or the
	 * reserved fallback zone (see _vm_map_entry_create); identify the
	 * owner by the element's address and free to the matching zone.
	 */
	if (zone_id_for_element(entry, sizeof(*entry)) != ZONE_ID_VM_MAP_ENTRY) {
		zfree(vm_map_entry_reserved_zone, entry);
		return;
	}
#endif /* HAVE_VM_MAP_RESERVED_ENTRY_ZONE */
	zfree(vm_map_entry_zone, entry);
}

/* Copy entries are allocated from the same zones; same disposal path. */
#define vm_map_copy_entry_dispose(copy_entry) \
	vm_map_entry_dispose(copy_entry)
1692 
1693 static vm_map_entry_t
vm_map_zap_first_entry(vm_map_zap_t list)1694 vm_map_zap_first_entry(
1695 	vm_map_zap_t            list)
1696 {
1697 	return list->vmz_head;
1698 }
1699 
/*
 * Last entry appended to the zap list.  Requires a non-empty list
 * (asserted): vmz_tail points at the final entry's vme_next field,
 * so the entry itself is recovered with __container_of.
 */
static vm_map_entry_t
vm_map_zap_last_entry(
	vm_map_zap_t            list)
{
	assert(vm_map_zap_first_entry(list));
	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
}
1707 
/*
 * Append "entry" at the tail of the zap list.  The list is singly
 * linked through vme_next, with vmz_tail pointing at the last entry's
 * vme_next field.
 */
static void
vm_map_zap_append(
	vm_map_zap_t            list,
	vm_map_entry_t          entry)
{
	/* terminate the new entry, splice it at the tail, advance the tail */
	entry->vme_next = VM_MAP_ENTRY_NULL;
	*list->vmz_tail = entry;
	list->vmz_tail = &entry->vme_next;
}
1717 
1718 static vm_map_entry_t
vm_map_zap_pop(vm_map_zap_t list)1719 vm_map_zap_pop(
1720 	vm_map_zap_t            list)
1721 {
1722 	vm_map_entry_t head = list->vmz_head;
1723 
1724 	if (head != VM_MAP_ENTRY_NULL &&
1725 	    (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1726 		list->vmz_tail = &list->vmz_head;
1727 	}
1728 
1729 	return head;
1730 }
1731 
1732 static void
vm_map_zap_dispose(vm_map_zap_t list)1733 vm_map_zap_dispose(
1734 	vm_map_zap_t            list)
1735 {
1736 	vm_map_entry_t          entry;
1737 
1738 	while ((entry = vm_map_zap_pop(list))) {
1739 		if (entry->is_sub_map) {
1740 			vm_map_deallocate(VME_SUBMAP(entry));
1741 		} else {
1742 			vm_object_deallocate(VME_OBJECT(entry));
1743 		}
1744 
1745 		vm_map_entry_dispose(entry);
1746 	}
1747 }
1748 
#if MACH_ASSERT
/* Gate for the (expensive) first_free validation below; off by default. */
static boolean_t first_free_check = FALSE;

/* Validate the map's first_free hint, unless checking is disabled. */
boolean_t
first_free_is_valid(
	vm_map_t        map)
{
	if (first_free_check) {
		return first_free_is_valid_store(map);
	}

	return TRUE;
}
#endif /* MACH_ASSERT */
1762 
1763 
/* Link/unlink an entry into a vm_map_copy's header via the store layer. */
#define vm_map_copy_entry_link(copy, after_where, entry)                \
	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))

#define vm_map_copy_entry_unlink(copy, entry)                           \
	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry))
1769 
/*
 *	vm_map_destroy:
 *
 *	Actually destroy a map.
 *	Deletes all entries (collecting them on a zap list so object
 *	references are dropped outside the map lock), tears down the
 *	hole list and corpse footprint, destroys the pmap, and frees
 *	the map structure.
 */
void
vm_map_destroy(
	vm_map_t        map)
{
	/* final cleanup: this is not allowed to fail */
	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;

	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);

	map->terminated = true;
	/* clean up regular map entries */
	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags, &zap);
	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
#if     !defined(__arm__)
	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags, &zap);
#endif /* !__arm__ */

	vm_map_disable_hole_optimization(map);
	vm_map_corpse_footprint_destroy(map);

	vm_map_unlock(map);

	/* drop object/submap references for everything collected above */
	vm_map_zap_dispose(&zap);

	assert(map->hdr.nentries == 0);

	if (map->pmap) {
		pmap_destroy(map->pmap);
	}

#if LOCKS_INDIRECT_ALLOW
	if (vm_map_lck_attr.lck_attr_val & LCK_ATTR_DEBUG) {
		/*
		 * If lock debugging is enabled the mutexes get tagged as LCK_MTX_TAG_INDIRECT.
		 * And this is regardless of whether the lck_mtx_ext_t is embedded in the
		 * structure or kalloc'ed via lck_mtx_init.
		 * An example is s_lock_ext within struct _vm_map.
		 *
		 * A lck_mtx_destroy on such a mutex will attempt a kfree and panic. We
		 * can add another tag to detect embedded vs alloc'ed indirect external
		 * mutexes but that'll be additional checks in the lock path and require
		 * updating dependencies for the old vs new tag.
		 *
		 * Since the kfree() is for LCK_MTX_TAG_INDIRECT mutexes and that tag is applied
		 * just when lock debugging is ON, we choose to forego explicitly destroying
		 * the vm_map mutex and rw lock. Because the vm_map_lck_grp is
		 * permanent, this has no serious side-effect.
		 */
	} else
#endif /* LOCKS_INDIRECT_ALLOW */
	{
		lck_rw_destroy(&(map)->lock, &vm_map_lck_grp);
	}

	zfree(vm_map_zone, map);
}
1833 
1834 /*
1835  * Returns pid of the task with the largest number of VM map entries.
1836  * Used in the zone-map-exhaustion jetsam path.
1837  */
1838 pid_t
find_largest_process_vm_map_entries(void)1839 find_largest_process_vm_map_entries(void)
1840 {
1841 	pid_t victim_pid = -1;
1842 	int max_vm_map_entries = 0;
1843 	task_t task = TASK_NULL;
1844 	queue_head_t *task_list = &tasks;
1845 
1846 	lck_mtx_lock(&tasks_threads_lock);
1847 	queue_iterate(task_list, task, task_t, tasks) {
1848 		if (task == kernel_task || !task->active) {
1849 			continue;
1850 		}
1851 
1852 		vm_map_t task_map = task->map;
1853 		if (task_map != VM_MAP_NULL) {
1854 			int task_vm_map_entries = task_map->hdr.nentries;
1855 			if (task_vm_map_entries > max_vm_map_entries) {
1856 				max_vm_map_entries = task_vm_map_entries;
1857 				victim_pid = pid_from_task(task);
1858 			}
1859 		}
1860 	}
1861 	lck_mtx_unlock(&tasks_threads_lock);
1862 
1863 	printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1864 	return victim_pid;
1865 }
1866 
1867 
/*
 *	vm_map_lookup_entry:	[ internal use only ]
 *
 *	Calls into the vm map store layer to find the map
 *	entry containing (or immediately preceding) the
 *	specified address in the given map; the entry is returned
 *	in the "entry" parameter.  The boolean
 *	result indicates whether the address is
 *	actually contained in the map.
 */
boolean_t
vm_map_lookup_entry(
	vm_map_t        map,
	vm_map_offset_t address,
	vm_map_entry_t  *entry)         /* OUT */
{
#if CONFIG_KERNEL_TBI
	/* strip top-byte tag bits from kernel pointers before the lookup */
	if (VM_KERNEL_ADDRESS(address)) {
		address = VM_KERNEL_STRIP_UPTR(address);
	}
#endif /* CONFIG_KERNEL_TBI */
#if CONFIG_PROB_GZALLOC
	/*
	 * PGZ-guarded addresses would not match their backing map entry;
	 * callers must translate them back before looking them up.
	 */
	if (map->pmap == kernel_pmap) {
		assertf(!pgz_owned(address),
		    "it is the responsibility of callers to unguard PGZ addresses");
	}
#endif /* CONFIG_PROB_GZALLOC */
	return vm_map_store_lookup_entry( map, address, entry );
}
1897 
1898 boolean_t
vm_map_lookup_entry_or_next(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)1899 vm_map_lookup_entry_or_next(
1900 	vm_map_t        map,
1901 	vm_map_offset_t address,
1902 	vm_map_entry_t  *entry)         /* OUT */
1903 {
1904 	if (vm_map_lookup_entry(map, address, entry)) {
1905 		return true;
1906 	}
1907 
1908 	*entry = (*entry)->vme_next;
1909 	return false;
1910 }
1911 
#if CONFIG_PROB_GZALLOC
/*
 *	vm_map_lookup_entry_allow_pgz:	[ internal use only ]
 *
 *	Variant of vm_map_lookup_entry() that performs the same store-layer
 *	lookup but without asserting that the address is not PGZ-owned,
 *	for callers that may legitimately pass a PGZ-guarded address.
 */
boolean_t
vm_map_lookup_entry_allow_pgz(
	vm_map_t        map,
	vm_map_offset_t address,
	vm_map_entry_t  *entry)         /* OUT */
{
#if CONFIG_KERNEL_TBI
	/* strip top-byte tag bits from kernel pointers before the lookup */
	if (VM_KERNEL_ADDRESS(address)) {
		address = VM_KERNEL_STRIP_UPTR(address);
	}
#endif /* CONFIG_KERNEL_TBI */
	return vm_map_store_lookup_entry( map, address, entry );
}
#endif /* CONFIG_PROB_GZALLOC */
1927 
1928 #if !ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
1929 /*
1930  *	Routine:	vm_map_adjust_direction
1931  *	Purpose:
1932  *			Overrides direction to reduce fragmentation. Allocate small
1933  *			allocations from the end and large allocations from the right.
1934  */
1935 static void
vm_map_adjust_direction(vm_map_kernel_flags_t * vmk_flags,vm_map_size_t size)1936 vm_map_adjust_direction(
1937 	vm_map_kernel_flags_t *vmk_flags,
1938 	vm_map_size_t          size)
1939 {
1940 	if (size < KMEM_SMALLMAP_THRESHOLD) {
1941 		vmk_flags->vmkf_last_free = true;
1942 	} else {
1943 		vmk_flags->vmkf_last_free = false;
1944 	}
1945 }
1946 #endif /* !ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) || !ZSECURITY_CONFIG(KERNEL_PTR_SPLIT) */
1947 
/*
 *	Routine:	vm_map_get_range
 *	Purpose:
 *			Adjust bounds based on security policy.
 *
 *	For kernel_map: returns the kmem range selected by
 *	vmk_flags->vmkf_range_id.  Once kmem startup is past, the caller's
 *	address hint is zeroed (it may lie outside the restricted range) and,
 *	with KERNEL_DATA_SPLIT, large allocations are confined to the
 *	range's "large" sub-range.
 *	For all other maps: returns the map's own [min, max) bounds, with
 *	the minimum raised to at least one page so page 0 stays unmapped
 *	unless explicitly requested.
 */
static struct kmem_range
vm_map_get_range(
	vm_map_t                map,
	vm_map_offset_t        *address,
	vm_map_kernel_flags_t  *vmk_flags,
	vm_map_size_t           size)
{
	struct kmem_range effective_range = {};
	if (map == kernel_map) {
		kmem_range_id_t range_id = vmk_flags->vmkf_range_id;
		effective_range = kmem_ranges[range_id];

		if (startup_phase > STARTUP_SUB_KMEM) {
			/*
			 * Hint provided by caller is zeroed as the range is restricted to a
			 * subset of the entire kernel_map VA, which could put the hint outside
			 * the range, causing vm_map_store_find_space to fail.
			 */
			*address = 0ull;
#if ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
			/*
			 * Each allocation front looks like [ S | L ]
			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
			 * use the entire range.
			 */
			if (size >= KMEM_SMALLMAP_THRESHOLD) {
				effective_range = kmem_large_ranges[range_id];
			}
#else /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
			vm_map_adjust_direction(vmk_flags, size);
#endif /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
		}
	} else {
		/*
		 * If minimum is 0, bump it up by PAGE_SIZE.  We want to limit
		 * allocations of PAGEZERO to explicit requests since its
		 * normal use is to catch dereferences of NULL and many
		 * applications also treat pointers with a value of 0 as
		 * special and suddenly having address 0 contain useable
		 * memory would tend to confuse those applications.
		 */
		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
		effective_range.max_address = map->max_offset;
	}

	return effective_range;
}
2001 
/*
 *	Routine:	vm_map_locate_space
 *	Purpose:
 *		Finds a range in the specified virtual address map,
 *		returning the start of that range,
 *		as well as the entry right before it.
 *
 *	In/out conditions:
 *		The map must be locked by the caller (the wait_for_space
 *		path below drops and retakes the map lock while blocking).
 *		On entry, *start_inout is the caller's placement hint;
 *		on KERN_SUCCESS it holds the chosen start address and, if
 *		entry_out is non-NULL, *entry_out is the entry preceding
 *		the located range.
 *
 *	Returns:
 *		KERN_SUCCESS		  a suitable range was found
 *		KERN_NO_SPACE		  no hole large enough in the bounds
 *		KERN_INVALID_ARGUMENT	  second JIT entry when multiple JIT
 *					  mappings are not allowed
 *		other			  from vm_map_random_address_for_size
 */
kern_return_t
vm_map_locate_space(
	vm_map_t                map,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_offset_t        *start_inout,
	vm_map_entry_t         *entry_out)
{
	struct kmem_range effective_range = {};
	vm_map_size_t   guard_offset;
	vm_map_offset_t hint, limit;
	vm_map_entry_t  entry;

	/*
	 * Only supported by vm_map_enter() with a fixed address.
	 */
	assert(!vmk_flags.vmkf_beyond_max);

	if (__improbable(map->wait_for_space)) {
		/*
		 * support for "wait_for_space" is minimal,
		 * its only consumer is the ipc_kernel_copy_map.
		 */
		assert(!map->holelistenabled &&
		    !vmk_flags.vmkf_last_free &&
		    !vmk_flags.vmkf_keep_map_locked &&
		    !vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmkf_random_address &&
		    *start_inout <= map->min_offset);
	} else if (vmk_flags.vmkf_last_free) {
		assert(!vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmkf_random_address);
	}

	if (vmk_flags.vmkf_guard_before) {
		/* reserve one map page of guard space in front of the mapping */
		guard_offset = VM_MAP_PAGE_SIZE(map);
		assert(size > guard_offset);
		size -= guard_offset;
	} else {
		assert(size != 0);
		guard_offset = 0;
	}

	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size);
#if XNU_TARGET_OS_OSX
	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
		/* caller wants a 32-bit address in a 64-bit user map */
		assert(map != kernel_map);
		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
	}
#endif /* XNU_TARGET_OS_OSX */

again:
	if (vmk_flags.vmkf_last_free) {
		/* top-down search: scan from the hint (or range max) toward min */
		hint = *start_inout;

		if (hint == 0 || hint > effective_range.max_address) {
			hint = effective_range.max_address;
		}
		if (hint <= effective_range.min_address) {
			return KERN_NO_SPACE;
		}
		limit = effective_range.min_address;
	} else {
		/* bottom-up search: scan from the hint toward range max */
		hint = *start_inout;

		if (vmk_flags.vmkf_map_jit) {
			if (map->jit_entry_exists &&
			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
				return KERN_INVALID_ARGUMENT;
			}
			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
				vmk_flags.vmkf_random_address = true;
			}
		}

		if (vmk_flags.vmkf_random_address) {
			kern_return_t kr;

			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
			if (kr != KERN_SUCCESS) {
				return kr;
			}
		}
#if XNU_TARGET_OS_OSX
		else if ((hint == 0 || hint == vm_map_min(map)) &&
		    !map->disable_vmentry_reuse &&
		    map->vmmap_high_start != 0) {
			hint = map->vmmap_high_start;
		}
#endif /* XNU_TARGET_OS_OSX */

		if (hint < effective_range.min_address) {
			hint = effective_range.min_address;
		}
		if (effective_range.max_address <= hint) {
			return KERN_NO_SPACE;
		}

		limit = effective_range.max_address;
	}
	entry = vm_map_store_find_space(map,
	    hint, limit, vmk_flags.vmkf_last_free,
	    guard_offset, size, mask,
	    start_inout);

	if (__improbable(entry == NULL)) {
		if (map->wait_for_space &&
		    guard_offset + size <=
		    effective_range.max_address - effective_range.min_address) {
			/*
			 * The request could fit the map in principle:
			 * block until space is freed, then retry the search.
			 */
			assert_wait((event_t)map, THREAD_ABORTSAFE);
			vm_map_unlock(map);
			thread_block(THREAD_CONTINUE_NULL);
			vm_map_lock(map);
			goto again;
		}
		return KERN_NO_SPACE;
	}

	if (entry_out) {
		*entry_out = entry;
	}
	return KERN_SUCCESS;
}
2133 
2134 
/*
 *	Routine:	vm_map_find_space
 *	Purpose:
 *		Allocate a range in the specified virtual address map,
 *		returning the entry allocated for that range.
 *		Used by kmem_alloc, etc.
 *
 *	The map must be NOT be locked. It will be returned locked
 *	on KERN_SUCCESS, unlocked on failure.
 *
 *	If an entry is allocated, the object/offset fields
 *	are initialized to zero.
 *
 *	Parameters:
 *		hint_address	placement hint (consumed by vm_map_locate_space)
 *		size		size of the range, must be non-zero
 *		mask		alignment mask for the start address
 *		vmk_flags	kernel flags (atomic/permanent entry, etc.)
 *		o_entry		OUT: the newly linked map entry
 */
kern_return_t
vm_map_find_space(
	vm_map_t                map,
	vm_map_offset_t         hint_address,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_entry_t          *o_entry)       /* OUT */
{
	vm_map_entry_t          new_entry, entry;
	kern_return_t           kr;

	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	/* create the entry up front, before taking the map lock */
	new_entry = vm_map_entry_create(map);
	new_entry->use_pmap = true;
	new_entry->protection = VM_PROT_DEFAULT;
	new_entry->max_protection = VM_PROT_ALL;

	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
		new_entry->map_aligned = true;
	}
	if (vmk_flags.vmkf_atomic_entry) {
		new_entry->vme_atomic = true;
	}
	if (vmk_flags.vmkf_permanent) {
		new_entry->permanent = true;
	}

	vm_map_lock(map);

	kr = vm_map_locate_space(map, size, mask, vmk_flags,
	    &hint_address, &entry);
	if (kr != KERN_SUCCESS) {
		/* failure path: unlock and dispose of the unused entry */
		vm_map_unlock(map);
		vm_map_entry_dispose(new_entry);
		return kr;
	}
	new_entry->vme_start = hint_address;
	new_entry->vme_end = hint_address + size;

	/*
	 *	At this point,
	 *
	 *	- new_entry's "vme_start" and "vme_end" should define
	 *	  the endpoints of the available new range,
	 *
	 *	- and "entry" should refer to the region before
	 *	  the new range,
	 *
	 *	- and the map should still be locked.
	 */

	assert(page_aligned(new_entry->vme_start));
	assert(page_aligned(new_entry->vme_end));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));

	/*
	 *	Insert the new entry into the list
	 */

	vm_map_store_entry_link(map, entry, new_entry, VM_MAP_KERNEL_FLAGS_NONE);
	map->size += size;

	/*
	 *	Update the lookup hint
	 */
	SAVE_HINT_MAP_WRITE(map, new_entry);

	/* success: map is returned LOCKED, as documented above */
	*o_entry = new_entry;
	return KERN_SUCCESS;
}
2223 
/* debug: when set, vm_map_pmap_enter() logs each page it enters */
int vm_map_pmap_enter_print = FALSE;
/* debug knob; not referenced in this chunk — presumably gates pre-faulting, verify at call sites */
int vm_map_pmap_enter_enable = FALSE;
2226 
/*
 *	Routine:	vm_map_pmap_enter [internal only]
 *
 *	Description:
 *		Force pages from the specified object to be entered into
 *		the pmap at the specified address if they are present.
 *		As soon as a page not found in the object the scan ends.
 *
 *	Returns:
 *		Nothing.
 *
 *	In/out conditions:
 *		The source map should not be locked on entry.
 *		The fault-enter return value is intentionally ignored;
 *		this is a best-effort pre-fill of the pmap.
 */
__unused static void
vm_map_pmap_enter(
	vm_map_t                map,
	vm_map_offset_t         addr,
	vm_map_offset_t         end_addr,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_prot_t               protection)
{
	int                     type_of_fault;
	kern_return_t           kr;
	struct vm_object_fault_info fault_info = {};

	/* nothing to enter into if the map has no pmap */
	if (map->pmap == 0) {
		return;
	}

	/* the PAGE_SIZE stepping below assumes native page size */
	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);

	while (addr < end_addr) {
		vm_page_t       m;


		/*
		 * TODO:
		 * From vm_map_enter(), we come into this function without the map
		 * lock held or the object lock held.
		 * We haven't taken a reference on the object either.
		 * We should do a proper lookup on the map to make sure
		 * that things are sane before we go locking objects that
		 * could have been deallocated from under us.
		 */

		vm_object_lock(object);

		m = vm_page_lookup(object, offset);

		/* stop at the first page that is absent or not safely mappable */
		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
		    (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_absent))) {
			vm_object_unlock(object);
			return;
		}

		if (vm_map_pmap_enter_print) {
			printf("vm_map_pmap_enter:");
			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
			    map, (unsigned long long)addr, object, (unsigned long long)offset);
		}
		type_of_fault = DBG_CACHE_HIT_FAULT;
		kr = vm_fault_enter(m, map->pmap,
		    addr,
		    PAGE_SIZE, 0,
		    protection, protection,
		    VM_PAGE_WIRED(m),
		    FALSE,                 /* change_wiring */
		    VM_KERN_MEMORY_NONE,                 /* tag - not wiring */
		    &fault_info,
		    NULL,                  /* need_retry */
		    &type_of_fault);

		vm_object_unlock(object);

		/* advance one page in both the object and the address space */
		offset += PAGE_SIZE_64;
		addr += PAGE_SIZE;
	}
}
2307 
#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
/*
 *	Routine:	vm_map_random_address_for_size
 *	Purpose:
 *		Pick a random, page-aligned start address within the map's
 *		effective range that has at least "size" bytes of free VA
 *		after it.  Tries up to MAX_TRIES_TO_GET_RANDOM_ADDRESS
 *		candidates before giving up with KERN_NO_SPACE.
 *
 *	On KERN_SUCCESS, *address holds the chosen start address.
 */
kern_return_t
vm_map_random_address_for_size(
	vm_map_t                map,
	vm_map_offset_t        *address,
	vm_map_size_t           size,
	vm_map_kernel_flags_t   vmk_flags)
{
	kern_return_t   kr = KERN_SUCCESS;
	int             tries = 0;
	vm_map_offset_t random_addr = 0;
	vm_map_offset_t hole_end;

	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
	vm_map_size_t   vm_hole_size = 0;
	vm_map_size_t   addr_space_size;
	struct kmem_range effective_range = vm_map_get_range(map, address, &vmk_flags, size);

	addr_space_size = effective_range.max_address - effective_range.min_address;
	if (size >= addr_space_size) {
		return KERN_NO_SPACE;
	}
	/* only consider start addresses that leave room for "size" bytes */
	addr_space_size -= size;

	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));

	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		/* early_random() is the only entropy source before zalloc is up */
		if (startup_phase < STARTUP_SUB_ZALLOC) {
			random_addr = (vm_map_offset_t)early_random();
		} else {
			random_addr = (vm_map_offset_t)random();
		}
		random_addr <<= VM_MAP_PAGE_SHIFT(map);
		random_addr = vm_map_trunc_page(
			effective_range.min_address + (random_addr % addr_space_size),
			VM_MAP_PAGE_MASK(map));

#if CONFIG_PROB_GZALLOC
		/* never land inside the PGZ guard region of the kernel map */
		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
			continue;
		}
#endif /* CONFIG_PROB_GZALLOC */

		/* candidate must fall in a hole; then check the hole is big enough */
		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
			if (prev_entry == vm_map_to_entry(map)) {
				next_entry = vm_map_first_entry(map);
			} else {
				next_entry = prev_entry->vme_next;
			}
			if (next_entry == vm_map_to_entry(map)) {
				hole_end = vm_map_max(map);
			} else {
				hole_end = next_entry->vme_start;
			}
			vm_hole_size = hole_end - random_addr;
			if (vm_hole_size >= size) {
				*address = random_addr;
				break;
			}
		}
		tries++;
	}

	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		kr = KERN_NO_SPACE;
	}
	return kr;
}
2377 
2378 static boolean_t
vm_memory_malloc_no_cow(int alias)2379 vm_memory_malloc_no_cow(
2380 	int alias)
2381 {
2382 	uint64_t alias_mask;
2383 
2384 	if (alias > 63) {
2385 		return FALSE;
2386 	}
2387 
2388 	alias_mask = 1ULL << alias;
2389 	if (alias_mask & vm_memory_malloc_no_cow_mask) {
2390 		return TRUE;
2391 	}
2392 	return FALSE;
2393 }
2394 
/* stats: mappings denied by RLIMIT_AS / RLIMIT_DATA (counters only here; incremented elsewhere — not visible in this chunk, verify at call sites) */
uint64_t vm_map_enter_RLIMIT_AS_count = 0;
uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2397 /*
2398  *	Routine:	vm_map_enter
2399  *
2400  *	Description:
2401  *		Allocate a range in the specified virtual address map.
2402  *		The resulting range will refer to memory defined by
2403  *		the given memory object and offset into that object.
2404  *
2405  *		Arguments are as defined in the vm_map call.
2406  */
/*
 * Stats for restoring previously zapped mappings when a new mapping
 * fails (presumably updated by vm_map_enter()'s overwrite path — the
 * updating code is not visible in this chunk; verify at call sites).
 */
static unsigned int vm_map_enter_restore_successes = 0;
static unsigned int vm_map_enter_restore_failures = 0;
2409 kern_return_t
vm_map_enter(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t alias,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)2410 vm_map_enter(
2411 	vm_map_t                map,
2412 	vm_map_offset_t         *address,       /* IN/OUT */
2413 	vm_map_size_t           size,
2414 	vm_map_offset_t         mask,
2415 	int                     flags,
2416 	vm_map_kernel_flags_t   vmk_flags,
2417 	vm_tag_t                alias,
2418 	vm_object_t             object,
2419 	vm_object_offset_t      offset,
2420 	boolean_t               needs_copy,
2421 	vm_prot_t               cur_protection,
2422 	vm_prot_t               max_protection,
2423 	vm_inherit_t            inheritance)
2424 {
2425 	vm_map_entry_t          entry, new_entry;
2426 	vm_map_offset_t         start, tmp_start, tmp_offset;
2427 	vm_map_offset_t         end, tmp_end;
2428 	vm_map_offset_t         tmp2_start, tmp2_end;
2429 	vm_map_offset_t         step;
2430 	kern_return_t           result = KERN_SUCCESS;
2431 	boolean_t               map_locked = FALSE;
2432 	boolean_t               pmap_empty = TRUE;
2433 	boolean_t               new_mapping_established = FALSE;
2434 	boolean_t               keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2435 	boolean_t               anywhere = ((flags & VM_FLAGS_ANYWHERE) != 0);
2436 	boolean_t               purgable = ((flags & VM_FLAGS_PURGABLE) != 0);
2437 	boolean_t               overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0);
2438 	boolean_t               no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0);
2439 	boolean_t               is_submap = vmk_flags.vmkf_submap;
2440 	boolean_t               permanent = (((flags & VM_FLAGS_PERMANENT) != 0) || vmk_flags.vmkf_permanent);
2441 	boolean_t               no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2442 	boolean_t               entry_for_jit = vmk_flags.vmkf_map_jit;
2443 	boolean_t               iokit_acct = vmk_flags.vmkf_iokit_acct;
2444 	boolean_t               translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
2445 	boolean_t               resilient_codesign = ((flags & VM_FLAGS_RESILIENT_CODESIGN) != 0);
2446 	boolean_t               resilient_media = ((flags & VM_FLAGS_RESILIENT_MEDIA) != 0);
2447 	unsigned int            superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT);
2448 	vm_tag_t                user_alias;
2449 	kern_return_t           kr;
2450 	boolean_t               clear_map_aligned = FALSE;
2451 	vm_map_size_t           chunk_size = 0;
2452 	vm_object_t             caller_object;
2453 	VM_MAP_ZAP_DECLARE(zap_old_list);
2454 	VM_MAP_ZAP_DECLARE(zap_new_list);
2455 
2456 	caller_object = object;
2457 
2458 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2459 
2460 	if (flags & VM_FLAGS_4GB_CHUNK) {
2461 #if defined(__LP64__)
2462 		chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2463 #else /* __LP64__ */
2464 		chunk_size = ANON_CHUNK_SIZE;
2465 #endif /* __LP64__ */
2466 	} else {
2467 		chunk_size = ANON_CHUNK_SIZE;
2468 	}
2469 
2470 	if (superpage_size) {
2471 		switch (superpage_size) {
2472 			/*
2473 			 * Note that the current implementation only supports
2474 			 * a single size for superpages, SUPERPAGE_SIZE, per
2475 			 * architecture. As soon as more sizes are supposed
2476 			 * to be supported, SUPERPAGE_SIZE has to be replaced
2477 			 * with a lookup of the size depending on superpage_size.
2478 			 */
2479 #ifdef __x86_64__
2480 		case SUPERPAGE_SIZE_ANY:
2481 			/* handle it like 2 MB and round up to page size */
2482 			size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2483 			OS_FALLTHROUGH;
2484 		case SUPERPAGE_SIZE_2MB:
2485 			break;
2486 #endif
2487 		default:
2488 			return KERN_INVALID_ARGUMENT;
2489 		}
2490 		mask = SUPERPAGE_SIZE - 1;
2491 		if (size & (SUPERPAGE_SIZE - 1)) {
2492 			return KERN_INVALID_ARGUMENT;
2493 		}
2494 		inheritance = VM_INHERIT_NONE;  /* fork() children won't inherit superpages */
2495 	}
2496 
2497 
2498 	if ((cur_protection & VM_PROT_WRITE) &&
2499 	    (cur_protection & VM_PROT_EXECUTE) &&
2500 #if XNU_TARGET_OS_OSX
2501 	    map->pmap != kernel_pmap &&
2502 	    (cs_process_global_enforcement() ||
2503 	    (vmk_flags.vmkf_cs_enforcement_override
2504 	    ? vmk_flags.vmkf_cs_enforcement
2505 	    : (vm_map_cs_enforcement(map)
2506 #if __arm64__
2507 	    || !VM_MAP_IS_EXOTIC(map)
2508 #endif /* __arm64__ */
2509 	    ))) &&
2510 #endif /* XNU_TARGET_OS_OSX */
2511 	    (VM_MAP_POLICY_WX_FAIL(map) ||
2512 	    VM_MAP_POLICY_WX_STRIP_X(map)) &&
2513 	    !entry_for_jit) {
2514 		boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2515 
2516 		DTRACE_VM3(cs_wx,
2517 		    uint64_t, 0,
2518 		    uint64_t, 0,
2519 		    vm_prot_t, cur_protection);
2520 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2521 		    proc_selfpid(),
2522 		    (current_task()->bsd_info
2523 		    ? proc_name_address(current_task()->bsd_info)
2524 		    : "?"),
2525 		    __FUNCTION__,
2526 		    (vm_protect_wx_fail ? "failing" : "turning off execute"));
2527 		cur_protection &= ~VM_PROT_EXECUTE;
2528 		if (vm_protect_wx_fail) {
2529 			return KERN_PROTECTION_FAILURE;
2530 		}
2531 	}
2532 
2533 	/*
2534 	 * If the task has requested executable lockdown,
2535 	 * deny any new executable mapping.
2536 	 */
2537 	if (map->map_disallow_new_exec == TRUE) {
2538 		if (cur_protection & VM_PROT_EXECUTE) {
2539 			return KERN_PROTECTION_FAILURE;
2540 		}
2541 	}
2542 
2543 	if (resilient_codesign) {
2544 		assert(!is_submap);
2545 		int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2546 		if ((cur_protection | max_protection) & reject_prot) {
2547 			return KERN_PROTECTION_FAILURE;
2548 		}
2549 	}
2550 
2551 	if (resilient_media) {
2552 		assert(!is_submap);
2553 //		assert(!needs_copy);
2554 		if (object != VM_OBJECT_NULL &&
2555 		    !object->internal) {
2556 			/*
2557 			 * This mapping is directly backed by an external
2558 			 * memory manager (e.g. a vnode pager for a file):
2559 			 * we would not have any safe place to inject
2560 			 * a zero-filled page if an actual page is not
2561 			 * available, without possibly impacting the actual
2562 			 * contents of the mapped object (e.g. the file),
2563 			 * so we can't provide any media resiliency here.
2564 			 */
2565 			return KERN_INVALID_ARGUMENT;
2566 		}
2567 	}
2568 
2569 	if (is_submap) {
2570 		if (purgable) {
2571 			/* submaps can not be purgeable */
2572 			return KERN_INVALID_ARGUMENT;
2573 		}
2574 		if (object == VM_OBJECT_NULL) {
2575 			/* submaps can not be created lazily */
2576 			return KERN_INVALID_ARGUMENT;
2577 		}
2578 	}
2579 	if (vmk_flags.vmkf_already) {
2580 		/*
2581 		 * VM_FLAGS_ALREADY says that it's OK if the same mapping
2582 		 * is already present.  For it to be meaningul, the requested
2583 		 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
2584 		 * we shouldn't try and remove what was mapped there first
2585 		 * (!VM_FLAGS_OVERWRITE).
2586 		 */
2587 		if ((flags & VM_FLAGS_ANYWHERE) ||
2588 		    (flags & VM_FLAGS_OVERWRITE)) {
2589 			return KERN_INVALID_ARGUMENT;
2590 		}
2591 	}
2592 
2593 	if (size == 0 ||
2594 	    (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
2595 		*address = 0;
2596 		return KERN_INVALID_ARGUMENT;
2597 	}
2598 
2599 	if (map->pmap == kernel_pmap) {
2600 		user_alias = VM_KERN_MEMORY_NONE;
2601 	} else {
2602 		user_alias = alias;
2603 	}
2604 
2605 	if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
2606 		chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
2607 	}
2608 
2609 #define RETURN(value)   { result = value; goto BailOut; }
2610 
2611 	assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
2612 	assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
2613 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
2614 		assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
2615 		assertf(page_aligned(size), "0x%llx", (uint64_t)size);
2616 	}
2617 
2618 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2619 	    !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
2620 		/*
2621 		 * In most cases, the caller rounds the size up to the
2622 		 * map's page size.
2623 		 * If we get a size that is explicitly not map-aligned here,
2624 		 * we'll have to respect the caller's wish and mark the
2625 		 * mapping as "not map-aligned" to avoid tripping the
2626 		 * map alignment checks later.
2627 		 */
2628 		clear_map_aligned = TRUE;
2629 	}
2630 	if (!anywhere &&
2631 	    VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2632 	    !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
2633 		/*
2634 		 * We've been asked to map at a fixed address and that
2635 		 * address is not aligned to the map's specific alignment.
2636 		 * The caller should know what it's doing (i.e. most likely
2637 		 * mapping some fragmented copy map, transferring memory from
2638 		 * a VM map with a different alignment), so clear map_aligned
2639 		 * for this new VM map entry and proceed.
2640 		 */
2641 		clear_map_aligned = TRUE;
2642 	}
2643 
2644 	/*
2645 	 * Only zero-fill objects are allowed to be purgable.
2646 	 * LP64todo - limit purgable objects to 32-bits for now
2647 	 */
2648 	if (purgable &&
2649 	    (offset != 0 ||
2650 	    (object != VM_OBJECT_NULL &&
2651 	    (object->vo_size != size ||
2652 	    object->purgable == VM_PURGABLE_DENY))
2653 	    || size > ANON_MAX_SIZE)) { /* LP64todo: remove when dp capable */
2654 		return KERN_INVALID_ARGUMENT;
2655 	}
2656 
2657 	start = *address;
2658 
2659 	if (anywhere) {
2660 		vm_map_lock(map);
2661 		map_locked = TRUE;
2662 
2663 		if (flags & VM_FLAGS_RANDOM_ADDR) {
2664 			vmk_flags.vmkf_random_address = true;
2665 		}
2666 
2667 		/*
2668 		 * Default to data range for kernel_map
2669 		 */
2670 		if (map == kernel_map) {
2671 			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
2672 		}
2673 
2674 		result = vm_map_locate_space(map, size, mask, vmk_flags,
2675 		    &start, &entry);
2676 		if (result != KERN_SUCCESS) {
2677 			goto BailOut;
2678 		}
2679 
2680 		*address = start;
2681 		end = start + size;
2682 		assert(VM_MAP_PAGE_ALIGNED(*address,
2683 		    VM_MAP_PAGE_MASK(map)));
2684 	} else {
2685 		vm_map_offset_t effective_min_offset, effective_max_offset;
2686 
2687 		effective_min_offset = map->min_offset;
2688 		effective_max_offset = map->max_offset;
2689 
2690 		if (vmk_flags.vmkf_beyond_max) {
2691 			/*
2692 			 * Allow an insertion beyond the map's max offset.
2693 			 */
2694 			effective_max_offset = 0x00000000FFFFF000ULL;
2695 #if !defined(__arm__)
2696 			if (vm_map_is_64bit(map)) {
2697 				effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2698 			}
2699 #endif  /* __arm__ */
2700 #if XNU_TARGET_OS_OSX
2701 		} else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2702 			effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2703 #endif /* XNU_TARGET_OS_OSX */
2704 		}
2705 
2706 		if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2707 		    !overwrite &&
2708 		    user_alias == VM_MEMORY_REALLOC) {
2709 			/*
2710 			 * Force realloc() to switch to a new allocation,
2711 			 * to prevent 4k-fragmented virtual ranges.
2712 			 */
2713 //			DEBUG4K_ERROR("no realloc in place");
2714 			return KERN_NO_SPACE;
2715 		}
2716 
2717 		/*
2718 		 *	Verify that:
2719 		 *		the address doesn't itself violate
2720 		 *		the mask requirement.
2721 		 */
2722 
2723 		vm_map_lock(map);
2724 		map_locked = TRUE;
2725 		if ((start & mask) != 0) {
2726 			RETURN(KERN_NO_SPACE);
2727 		}
2728 
2729 		/*
2730 		 *	...	the address is within bounds
2731 		 */
2732 
2733 		end = start + size;
2734 
2735 		if ((start < effective_min_offset) ||
2736 		    (end > effective_max_offset) ||
2737 		    (start >= end)) {
2738 			RETURN(KERN_INVALID_ADDRESS);
2739 		}
2740 
2741 		if (overwrite) {
2742 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
2743 
2744 			/*
2745 			 * Fixed mapping and "overwrite" flag: attempt to
2746 			 * remove all existing mappings in the specified
2747 			 * address range, saving them in our "zap_old_list".
2748 			 *
2749 			 * This avoids releasing the VM map lock in
2750 			 * vm_map_entry_delete() and allows atomicity
2751 			 * when we want to replace some mappings with a new one.
2752 			 * It also allows us to restore the old VM mappings if the
2753 			 * new mapping fails.
2754 			 */
2755 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2756 
2757 			if (vmk_flags.vmkf_overwrite_immutable) {
2758 				/* we can overwrite immutable mappings */
2759 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2760 			}
2761 			(void)vm_map_delete(map, start, end,
2762 			    remove_flags, &zap_old_list);
2763 		}
2764 
2765 		/*
2766 		 *	...	the starting address isn't allocated
2767 		 */
2768 
2769 		if (vm_map_lookup_entry(map, start, &entry)) {
2770 			if (!(vmk_flags.vmkf_already)) {
2771 				RETURN(KERN_NO_SPACE);
2772 			}
2773 			/*
2774 			 * Check if what's already there is what we want.
2775 			 */
2776 			tmp_start = start;
2777 			tmp_offset = offset;
2778 			if (entry->vme_start < start) {
2779 				tmp_start -= start - entry->vme_start;
2780 				tmp_offset -= start - entry->vme_start;
2781 			}
2782 			for (; entry->vme_start < end;
2783 			    entry = entry->vme_next) {
2784 				/*
2785 				 * Check if the mapping's attributes
2786 				 * match the existing map entry.
2787 				 */
2788 				if (entry == vm_map_to_entry(map) ||
2789 				    entry->vme_start != tmp_start ||
2790 				    entry->is_sub_map != is_submap ||
2791 				    VME_OFFSET(entry) != tmp_offset ||
2792 				    entry->needs_copy != needs_copy ||
2793 				    entry->protection != cur_protection ||
2794 				    entry->max_protection != max_protection ||
2795 				    entry->inheritance != inheritance ||
2796 				    entry->iokit_acct != iokit_acct ||
2797 				    VME_ALIAS(entry) != alias) {
2798 					/* not the same mapping ! */
2799 					RETURN(KERN_NO_SPACE);
2800 				}
2801 				/*
2802 				 * Check if the same object is being mapped.
2803 				 */
2804 				if (is_submap) {
2805 					if (VME_SUBMAP(entry) !=
2806 					    (vm_map_t) object) {
2807 						/* not the same submap */
2808 						RETURN(KERN_NO_SPACE);
2809 					}
2810 				} else {
2811 					if (VME_OBJECT(entry) != object) {
2812 						/* not the same VM object... */
2813 						vm_object_t obj2;
2814 
2815 						obj2 = VME_OBJECT(entry);
2816 						if ((obj2 == VM_OBJECT_NULL ||
2817 						    obj2->internal) &&
2818 						    (object == VM_OBJECT_NULL ||
2819 						    object->internal)) {
2820 							/*
2821 							 * ... but both are
2822 							 * anonymous memory,
2823 							 * so equivalent.
2824 							 */
2825 						} else {
2826 							RETURN(KERN_NO_SPACE);
2827 						}
2828 					}
2829 				}
2830 
2831 				tmp_offset += entry->vme_end - entry->vme_start;
2832 				tmp_start += entry->vme_end - entry->vme_start;
2833 				if (entry->vme_end >= end) {
2834 					/* reached the end of our mapping */
2835 					break;
2836 				}
2837 			}
2838 			/* it all matches:  let's use what's already there ! */
2839 			RETURN(KERN_MEMORY_PRESENT);
2840 		}
2841 
2842 		/*
2843 		 *	...	the next region doesn't overlap the
2844 		 *		end point.
2845 		 */
2846 
2847 		if ((entry->vme_next != vm_map_to_entry(map)) &&
2848 		    (entry->vme_next->vme_start < end)) {
2849 			RETURN(KERN_NO_SPACE);
2850 		}
2851 	}
2852 
2853 	/*
2854 	 *	At this point,
2855 	 *		"start" and "end" should define the endpoints of the
2856 	 *			available new range, and
2857 	 *		"entry" should refer to the region before the new
2858 	 *			range, and
2859 	 *
2860 	 *		the map should be locked.
2861 	 */
2862 
2863 	/*
2864 	 *	See whether we can avoid creating a new entry (and object) by
2865 	 *	extending one of our neighbors.  [So far, we only attempt to
2866 	 *	extend from below.]  Note that we can never extend/join
2867 	 *	purgable objects because they need to remain distinct
2868 	 *	entities in order to implement their "volatile object"
2869 	 *	semantics.
2870 	 */
2871 
2872 	if (purgable ||
2873 	    entry_for_jit ||
2874 	    vm_memory_malloc_no_cow(user_alias)) {
2875 		if (object == VM_OBJECT_NULL) {
2876 			object = vm_object_allocate(size);
2877 			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2878 			object->true_share = FALSE;
2879 			if (purgable) {
2880 				task_t owner;
2881 				object->purgable = VM_PURGABLE_NONVOLATILE;
2882 				if (map->pmap == kernel_pmap) {
2883 					/*
2884 					 * Purgeable mappings made in a kernel
2885 					 * map are "owned" by the kernel itself
2886 					 * rather than the current user task
2887 					 * because they're likely to be used by
2888 					 * more than this user task (see
2889 					 * execargs_purgeable_allocate(), for
2890 					 * example).
2891 					 */
2892 					owner = kernel_task;
2893 				} else {
2894 					owner = current_task();
2895 				}
2896 				assert(object->vo_owner == NULL);
2897 				assert(object->resident_page_count == 0);
2898 				assert(object->wired_page_count == 0);
2899 				vm_object_lock(object);
2900 				vm_purgeable_nonvolatile_enqueue(object, owner);
2901 				vm_object_unlock(object);
2902 			}
2903 			offset = (vm_object_offset_t)0;
2904 		}
2905 	} else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
2906 		/* no coalescing if address space uses sub-pages */
2907 	} else if ((is_submap == FALSE) &&
2908 	    (object == VM_OBJECT_NULL) &&
2909 	    (entry != vm_map_to_entry(map)) &&
2910 	    (entry->vme_end == start) &&
2911 	    (!entry->is_shared) &&
2912 	    (!entry->is_sub_map) &&
2913 	    (!entry->in_transition) &&
2914 	    (!entry->needs_wakeup) &&
2915 	    (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
2916 	    (entry->protection == cur_protection) &&
2917 	    (entry->max_protection == max_protection) &&
2918 	    (entry->inheritance == inheritance) &&
2919 	    ((user_alias == VM_MEMORY_REALLOC) ||
2920 	    (VME_ALIAS(entry) == alias)) &&
2921 	    (entry->no_cache == no_cache) &&
2922 	    (entry->permanent == permanent) &&
2923 	    /* no coalescing for immutable executable mappings */
2924 	    !((entry->protection & VM_PROT_EXECUTE) &&
2925 	    entry->permanent) &&
2926 	    (!entry->superpage_size && !superpage_size) &&
2927 	    /*
2928 	     * No coalescing if not map-aligned, to avoid propagating
2929 	     * that condition any further than needed:
2930 	     */
2931 	    (!entry->map_aligned || !clear_map_aligned) &&
2932 	    (!entry->zero_wired_pages) &&
2933 	    (!entry->used_for_jit && !entry_for_jit) &&
2934 	    (!entry->pmap_cs_associated) &&
2935 	    (entry->iokit_acct == iokit_acct) &&
2936 	    (!entry->vme_resilient_codesign) &&
2937 	    (!entry->vme_resilient_media) &&
2938 	    (!entry->vme_atomic) &&
2939 	    (entry->vme_no_copy_on_read == no_copy_on_read) &&
2940 
2941 	    ((entry->vme_end - entry->vme_start) + size <=
2942 	    (user_alias == VM_MEMORY_REALLOC ?
2943 	    ANON_CHUNK_SIZE :
2944 	    NO_COALESCE_LIMIT)) &&
2945 
2946 	    (entry->wired_count == 0)) {        /* implies user_wired_count == 0 */
2947 		if (vm_object_coalesce(VME_OBJECT(entry),
2948 		    VM_OBJECT_NULL,
2949 		    VME_OFFSET(entry),
2950 		    (vm_object_offset_t) 0,
2951 		    (vm_map_size_t)(entry->vme_end - entry->vme_start),
2952 		    (vm_map_size_t)(end - entry->vme_end))) {
2953 			/*
2954 			 *	Coalesced the two objects - can extend
2955 			 *	the previous map entry to include the
2956 			 *	new range.
2957 			 */
2958 			map->size += (end - entry->vme_end);
2959 			assert(entry->vme_start < end);
2960 			assert(VM_MAP_PAGE_ALIGNED(end,
2961 			    VM_MAP_PAGE_MASK(map)));
2962 			if (__improbable(vm_debug_events)) {
2963 				DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
2964 			}
2965 			entry->vme_end = end;
2966 			if (map->holelistenabled) {
2967 				vm_map_store_update_first_free(map, entry, TRUE);
2968 			} else {
2969 				vm_map_store_update_first_free(map, map->first_free, TRUE);
2970 			}
2971 			new_mapping_established = TRUE;
2972 			RETURN(KERN_SUCCESS);
2973 		}
2974 	}
2975 
2976 	step = superpage_size ? SUPERPAGE_SIZE : (end - start);
2977 	new_entry = NULL;
2978 
2979 	if (vmk_flags.vmkf_submap_adjust) {
2980 		vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
2981 		offset = start;
2982 	}
2983 
2984 	for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
2985 		tmp2_end = tmp2_start + step;
2986 		/*
2987 		 *	Create a new entry
2988 		 *
2989 		 * XXX FBDP
2990 		 * The reserved "page zero" in each process's address space can
2991 		 * be arbitrarily large.  Splitting it into separate objects and
2992 		 * therefore different VM map entries serves no purpose and just
2993 		 * slows down operations on the VM map, so let's not split the
2994 		 * allocation into chunks if the max protection is NONE.  That
2995 		 * memory should never be accessible, so it will never get to the
2996 		 * default pager.
2997 		 */
2998 		tmp_start = tmp2_start;
2999 		if (object == VM_OBJECT_NULL &&
3000 		    size > chunk_size &&
3001 		    max_protection != VM_PROT_NONE &&
3002 		    superpage_size == 0) {
3003 			tmp_end = tmp_start + chunk_size;
3004 		} else {
3005 			tmp_end = tmp2_end;
3006 		}
3007 		do {
3008 			if (!is_submap &&
3009 			    object != VM_OBJECT_NULL &&
3010 			    object->internal &&
3011 			    offset + (tmp_end - tmp_start) > object->vo_size) {
3012 //				printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3013 				DTRACE_VM5(vm_map_enter_overmap,
3014 				    vm_map_t, map,
3015 				    vm_map_address_t, tmp_start,
3016 				    vm_map_address_t, tmp_end,
3017 				    vm_object_offset_t, offset,
3018 				    vm_object_size_t, object->vo_size);
3019 			}
3020 			new_entry = vm_map_entry_insert(map,
3021 			    entry, tmp_start, tmp_end,
3022 			    object, offset, vmk_flags,
3023 			    needs_copy,
3024 			    cur_protection, max_protection,
3025 			    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3026 			    VM_INHERIT_NONE : inheritance),
3027 			    no_cache,
3028 			    permanent,
3029 			    no_copy_on_read,
3030 			    superpage_size,
3031 			    clear_map_aligned,
3032 			    is_submap,
3033 			    entry_for_jit,
3034 			    alias,
3035 			    translated_allow_execute);
3036 
3037 			assert((object != kernel_object) || (VM_KERN_MEMORY_NONE != alias));
3038 
3039 			if (resilient_codesign) {
3040 				int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3041 				if (!((cur_protection | max_protection) & reject_prot)) {
3042 					new_entry->vme_resilient_codesign = TRUE;
3043 				}
3044 			}
3045 
3046 			if (resilient_media &&
3047 			    (object == VM_OBJECT_NULL ||
3048 			    object->internal)) {
3049 				new_entry->vme_resilient_media = TRUE;
3050 			}
3051 
3052 			assert(!new_entry->iokit_acct);
3053 			if (!is_submap &&
3054 			    object != VM_OBJECT_NULL &&
3055 			    (object->purgable != VM_PURGABLE_DENY ||
3056 			    object->vo_ledger_tag)) {
3057 				assert(new_entry->use_pmap);
3058 				assert(!new_entry->iokit_acct);
3059 				/*
3060 				 * Turn off pmap accounting since
3061 				 * purgeable (or tagged) objects have their
3062 				 * own ledgers.
3063 				 */
3064 				new_entry->use_pmap = FALSE;
3065 			} else if (!is_submap &&
3066 			    iokit_acct &&
3067 			    object != VM_OBJECT_NULL &&
3068 			    object->internal) {
3069 				/* alternate accounting */
3070 				assert(!new_entry->iokit_acct);
3071 				assert(new_entry->use_pmap);
3072 				new_entry->iokit_acct = TRUE;
3073 				new_entry->use_pmap = FALSE;
3074 				DTRACE_VM4(
3075 					vm_map_iokit_mapped_region,
3076 					vm_map_t, map,
3077 					vm_map_offset_t, new_entry->vme_start,
3078 					vm_map_offset_t, new_entry->vme_end,
3079 					int, VME_ALIAS(new_entry));
3080 				vm_map_iokit_mapped_region(
3081 					map,
3082 					(new_entry->vme_end -
3083 					new_entry->vme_start));
3084 			} else if (!is_submap) {
3085 				assert(!new_entry->iokit_acct);
3086 				assert(new_entry->use_pmap);
3087 			}
3088 
3089 			if (is_submap) {
3090 				vm_map_t        submap;
3091 				boolean_t       submap_is_64bit;
3092 				boolean_t       use_pmap;
3093 
3094 				assert(new_entry->is_sub_map);
3095 				assert(!new_entry->use_pmap);
3096 				assert(!new_entry->iokit_acct);
3097 				submap = (vm_map_t) object;
3098 				submap_is_64bit = vm_map_is_64bit(submap);
3099 				use_pmap = vmk_flags.vmkf_nested_pmap;
3100 #ifndef NO_NESTED_PMAP
3101 				if (use_pmap && submap->pmap == NULL) {
3102 					ledger_t ledger = map->pmap->ledger;
3103 					/* we need a sub pmap to nest... */
3104 					submap->pmap = pmap_create_options(ledger, 0,
3105 					    submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3106 					if (submap->pmap == NULL) {
3107 						/* let's proceed without nesting... */
3108 					}
3109 #if     defined(__arm__) || defined(__arm64__)
3110 					else {
3111 						pmap_set_nested(submap->pmap);
3112 					}
3113 #endif
3114 				}
3115 				if (use_pmap && submap->pmap != NULL) {
3116 					if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3117 						DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3118 						kr = KERN_FAILURE;
3119 					} else {
3120 						kr = pmap_nest(map->pmap,
3121 						    submap->pmap,
3122 						    tmp_start,
3123 						    tmp_end - tmp_start);
3124 					}
3125 					if (kr != KERN_SUCCESS) {
3126 						printf("vm_map_enter: "
3127 						    "pmap_nest(0x%llx,0x%llx) "
3128 						    "error 0x%x\n",
3129 						    (long long)tmp_start,
3130 						    (long long)tmp_end,
3131 						    kr);
3132 					} else {
3133 						/* we're now nested ! */
3134 						new_entry->use_pmap = TRUE;
3135 						pmap_empty = FALSE;
3136 					}
3137 				}
3138 #endif /* NO_NESTED_PMAP */
3139 			}
3140 			entry = new_entry;
3141 
3142 			if (superpage_size) {
3143 				vm_page_t pages, m;
3144 				vm_object_t sp_object;
3145 				vm_object_offset_t sp_offset;
3146 
3147 				VME_OFFSET_SET(entry, 0);
3148 
3149 				/* allocate one superpage */
3150 				kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3151 				if (kr != KERN_SUCCESS) {
3152 					/* deallocate whole range... */
3153 					new_mapping_established = TRUE;
3154 					/* ... but only up to "tmp_end" */
3155 					size -= end - tmp_end;
3156 					RETURN(kr);
3157 				}
3158 
3159 				/* create one vm_object per superpage */
3160 				sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3161 				sp_object->phys_contiguous = TRUE;
3162 				sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3163 				VME_OBJECT_SET(entry, sp_object);
3164 				assert(entry->use_pmap);
3165 
3166 				/* enter the base pages into the object */
3167 				vm_object_lock(sp_object);
3168 				for (sp_offset = 0;
3169 				    sp_offset < SUPERPAGE_SIZE;
3170 				    sp_offset += PAGE_SIZE) {
3171 					m = pages;
3172 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3173 					pages = NEXT_PAGE(m);
3174 					*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3175 					vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3176 				}
3177 				vm_object_unlock(sp_object);
3178 			}
3179 		} while (tmp_end != tmp2_end &&
3180 		    (tmp_start = tmp_end) &&
3181 		    (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3182 		    tmp_end + chunk_size : tmp2_end));
3183 	}
3184 
3185 	new_mapping_established = TRUE;
3186 
3187 BailOut:
3188 	assert(map_locked == TRUE);
3189 
3190 	/*
3191 	 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3192 	 * If we have identified and possibly established the new mapping(s),
3193 	 * make sure we did not go beyond the address space limit.
3194 	 */
3195 	if (result == KERN_SUCCESS) {
3196 		if (map->size_limit != RLIM_INFINITY &&
3197 		    map->size > map->size_limit) {
3198 			/*
3199 			 * Establishing the requested mappings would exceed
3200 			 * the process's RLIMIT_AS limit: fail with
3201 			 * KERN_NO_SPACE.
3202 			 */
3203 			result = KERN_NO_SPACE;
3204 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3205 			    proc_selfpid(),
3206 			    (current_task()->bsd_info
3207 			    ? proc_name_address(current_task()->bsd_info)
3208 			    : "?"),
3209 			    __FUNCTION__,
3210 			    (uint64_t) map->size,
3211 			    (uint64_t) map->size_limit);
3212 			DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3213 			    vm_map_size_t, map->size,
3214 			    uint64_t, map->size_limit);
3215 			vm_map_enter_RLIMIT_AS_count++;
3216 		} else if (map->data_limit != RLIM_INFINITY &&
3217 		    map->size > map->data_limit) {
3218 			/*
3219 			 * Establishing the requested mappings would exceed
3220 			 * the process's RLIMIT_DATA limit: fail with
3221 			 * KERN_NO_SPACE.
3222 			 */
3223 			result = KERN_NO_SPACE;
3224 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3225 			    proc_selfpid(),
3226 			    (current_task()->bsd_info
3227 			    ? proc_name_address(current_task()->bsd_info)
3228 			    : "?"),
3229 			    __FUNCTION__,
3230 			    (uint64_t) map->size,
3231 			    (uint64_t) map->data_limit);
3232 			DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3233 			    vm_map_size_t, map->size,
3234 			    uint64_t, map->data_limit);
3235 			vm_map_enter_RLIMIT_DATA_count++;
3236 		}
3237 	}
3238 
3239 	if (result == KERN_SUCCESS) {
3240 		vm_prot_t pager_prot;
3241 		memory_object_t pager;
3242 
3243 #if DEBUG
3244 		if (pmap_empty &&
3245 		    !(vmk_flags.vmkf_no_pmap_check)) {
3246 			assert(pmap_is_empty(map->pmap,
3247 			    *address,
3248 			    *address + size));
3249 		}
3250 #endif /* DEBUG */
3251 
3252 		/*
3253 		 * For "named" VM objects, let the pager know that the
3254 		 * memory object is being mapped.  Some pagers need to keep
3255 		 * track of this, to know when they can reclaim the memory
3256 		 * object, for example.
3257 		 * VM calls memory_object_map() for each mapping (specifying
3258 		 * the protection of each mapping) and calls
3259 		 * memory_object_last_unmap() when all the mappings are gone.
3260 		 */
3261 		pager_prot = max_protection;
3262 		if (needs_copy) {
3263 			/*
3264 			 * Copy-On-Write mapping: won't modify
3265 			 * the memory object.
3266 			 */
3267 			pager_prot &= ~VM_PROT_WRITE;
3268 		}
3269 		if (!is_submap &&
3270 		    object != VM_OBJECT_NULL &&
3271 		    object->named &&
3272 		    object->pager != MEMORY_OBJECT_NULL) {
3273 			vm_object_lock(object);
3274 			pager = object->pager;
3275 			if (object->named &&
3276 			    pager != MEMORY_OBJECT_NULL) {
3277 				assert(object->pager_ready);
3278 				vm_object_mapping_wait(object, THREAD_UNINT);
3279 				vm_object_mapping_begin(object);
3280 				vm_object_unlock(object);
3281 
3282 				kr = memory_object_map(pager, pager_prot);
3283 				assert(kr == KERN_SUCCESS);
3284 
3285 				vm_object_lock(object);
3286 				vm_object_mapping_end(object);
3287 			}
3288 			vm_object_unlock(object);
3289 		}
3290 	}
3291 
3292 	assert(map_locked == TRUE);
3293 
3294 	if (!keep_map_locked) {
3295 		vm_map_unlock(map);
3296 		map_locked = FALSE;
3297 	}
3298 
3299 	/*
3300 	 * We can't hold the map lock if we enter this block.
3301 	 */
3302 
3303 	if (result == KERN_SUCCESS) {
3304 		/*	Wire down the new entry if the user
3305 		 *	requested all new map entries be wired.
3306 		 */
3307 		if ((map->wiring_required) || (superpage_size)) {
3308 			assert(!keep_map_locked);
3309 			pmap_empty = FALSE; /* pmap won't be empty */
3310 			kr = vm_map_wire_kernel(map, start, end,
3311 			    new_entry->protection, VM_KERN_MEMORY_MLOCK,
3312 			    TRUE);
3313 			result = kr;
3314 		}
3315 
3316 	}
3317 
3318 	if (result != KERN_SUCCESS) {
3319 		if (new_mapping_established) {
3320 			/*
3321 			 * The caller had an extra reference on the VM object
3322 			 * it gave us.
3323 			 * We've transferred that reference to the mapping we
3324 			 * just established but we're about to undo that mapping
3325 			 * and release that reference.
3326 			 * The caller expects its reference to be consumed on
3327 			 * success only, so we have to get the extra reference
3328 			 * back for the caller.
3329 			 */
3330 			vm_object_reference(caller_object);
3331 
3332 			/*
3333 			 * We have to get rid of the new mappings since we
3334 			 * won't make them available to the user.
			 * Try and do that atomically, to minimize the risk
			 * that someone else creates new mappings in that range.
3337 			 */
3338 
3339 			if (!map_locked) {
3340 				vm_map_lock(map);
3341 				map_locked = TRUE;
3342 			}
3343 			(void)vm_map_delete(map, *address, *address + size,
3344 			    VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
3345 			    &zap_new_list);
3346 		}
3347 
3348 		if (vm_map_zap_first_entry(&zap_old_list)) {
3349 			vm_map_entry_t entry1, entry2;
3350 
3351 			/*
3352 			 * The new mapping failed.  Attempt to restore
			 * the old mappings, saved in "zap_old_list".
3354 			 */
3355 			if (!map_locked) {
3356 				vm_map_lock(map);
3357 				map_locked = TRUE;
3358 			}
3359 
3360 			/* first check if the coast is still clear */
3361 			start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3362 			end   = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3363 
3364 			if (vm_map_lookup_entry(map, start, &entry1) ||
3365 			    vm_map_lookup_entry(map, end, &entry2) ||
3366 			    entry1 != entry2) {
3367 				/*
3368 				 * Part of that range has already been
3369 				 * re-mapped:  we can't restore the old
3370 				 * mappings...
3371 				 */
3372 				vm_map_enter_restore_failures++;
3373 			} else {
3374 				/*
3375 				 * Transfer the saved map entries from
				 * "zap_old_list" to the original "map",
3377 				 * inserting them all after "entry1".
3378 				 */
3379 				while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3380 					vm_map_size_t entry_size;
3381 
3382 					entry_size = (entry2->vme_end -
3383 					    entry2->vme_start);
3384 					vm_map_store_entry_link(map, entry1, entry2,
3385 					    VM_MAP_KERNEL_FLAGS_NONE);
3386 					map->size += entry_size;
3387 					entry1 = entry2;
3388 				}
3389 				if (map->wiring_required) {
3390 					/*
3391 					 * XXX TODO: we should rewire the
3392 					 * old pages here...
3393 					 */
3394 				}
3395 				vm_map_enter_restore_successes++;
3396 			}
3397 		}
3398 	}
3399 
3400 	/*
3401 	 * The caller is responsible for releasing the lock if it requested to
3402 	 * keep the map locked.
3403 	 */
3404 	if (map_locked && !keep_map_locked) {
3405 		vm_map_unlock(map);
3406 	}
3407 
3408 	vm_map_zap_dispose(&zap_old_list);
3409 	vm_map_zap_dispose(&zap_new_list);
3410 
3411 	return result;
3412 
3413 #undef  RETURN
3414 }
3415 
3416 #if __arm64__
3417 extern const struct memory_object_pager_ops fourk_pager_ops;
3418 kern_return_t
vm_map_enter_fourk(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t alias,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)3419 vm_map_enter_fourk(
3420 	vm_map_t                map,
3421 	vm_map_offset_t         *address,       /* IN/OUT */
3422 	vm_map_size_t           size,
3423 	vm_map_offset_t         mask,
3424 	int                     flags,
3425 	vm_map_kernel_flags_t   vmk_flags,
3426 	vm_tag_t                alias,
3427 	vm_object_t             object,
3428 	vm_object_offset_t      offset,
3429 	boolean_t               needs_copy,
3430 	vm_prot_t               cur_protection,
3431 	vm_prot_t               max_protection,
3432 	vm_inherit_t            inheritance)
3433 {
3434 	vm_map_entry_t          entry, new_entry;
3435 	vm_map_offset_t         start, fourk_start;
3436 	vm_map_offset_t         end, fourk_end;
3437 	vm_map_size_t           fourk_size;
3438 	kern_return_t           result = KERN_SUCCESS;
3439 	boolean_t               map_locked = FALSE;
3440 	boolean_t               pmap_empty = TRUE;
3441 	boolean_t               new_mapping_established = FALSE;
3442 	boolean_t               keep_map_locked = vmk_flags.vmkf_keep_map_locked;
3443 	boolean_t               anywhere = ((flags & VM_FLAGS_ANYWHERE) != 0);
3444 	boolean_t               purgable = ((flags & VM_FLAGS_PURGABLE) != 0);
3445 	boolean_t               overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0);
3446 	boolean_t               no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0);
3447 	boolean_t               is_submap = vmk_flags.vmkf_submap;
3448 	boolean_t               permanent = (((flags & VM_FLAGS_PERMANENT) != 0) || vmk_flags.vmkf_permanent);
3449 	boolean_t               no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
3450 	boolean_t               entry_for_jit = vmk_flags.vmkf_map_jit;
3451 //	boolean_t		iokit_acct = vmk_flags.vmkf_iokit_acct;
3452 	boolean_t               translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
3453 	unsigned int            superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT);
3454 	vm_map_offset_t         effective_min_offset, effective_max_offset;
3455 	kern_return_t           kr;
3456 	boolean_t               clear_map_aligned = FALSE;
3457 	memory_object_t         fourk_mem_obj;
3458 	vm_object_t             fourk_object;
3459 	vm_map_offset_t         fourk_pager_offset;
3460 	int                     fourk_pager_index_start, fourk_pager_index_num;
3461 	int                     cur_idx;
3462 	boolean_t               fourk_copy;
3463 	vm_object_t             copy_object;
3464 	vm_object_offset_t      copy_offset;
3465 	VM_MAP_ZAP_DECLARE(zap_list);
3466 
3467 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
3468 		panic("%s:%d", __FUNCTION__, __LINE__);
3469 	}
3470 	fourk_mem_obj = MEMORY_OBJECT_NULL;
3471 	fourk_object = VM_OBJECT_NULL;
3472 
3473 	if (superpage_size) {
3474 		return KERN_NOT_SUPPORTED;
3475 	}
3476 
3477 	if ((cur_protection & VM_PROT_WRITE) &&
3478 	    (cur_protection & VM_PROT_EXECUTE) &&
3479 #if XNU_TARGET_OS_OSX
3480 	    map->pmap != kernel_pmap &&
3481 	    (vm_map_cs_enforcement(map)
3482 #if __arm64__
3483 	    || !VM_MAP_IS_EXOTIC(map)
3484 #endif /* __arm64__ */
3485 	    ) &&
3486 #endif /* XNU_TARGET_OS_OSX */
3487 	    !entry_for_jit) {
3488 		DTRACE_VM3(cs_wx,
3489 		    uint64_t, 0,
3490 		    uint64_t, 0,
3491 		    vm_prot_t, cur_protection);
3492 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
3493 		    "turning off execute\n",
3494 		    proc_selfpid(),
3495 		    (current_task()->bsd_info
3496 		    ? proc_name_address(current_task()->bsd_info)
3497 		    : "?"),
3498 		    __FUNCTION__);
3499 		cur_protection &= ~VM_PROT_EXECUTE;
3500 	}
3501 
3502 	/*
3503 	 * If the task has requested executable lockdown,
3504 	 * deny any new executable mapping.
3505 	 */
3506 	if (map->map_disallow_new_exec == TRUE) {
3507 		if (cur_protection & VM_PROT_EXECUTE) {
3508 			return KERN_PROTECTION_FAILURE;
3509 		}
3510 	}
3511 
3512 	if (is_submap) {
3513 		return KERN_NOT_SUPPORTED;
3514 	}
3515 	if (vmk_flags.vmkf_already) {
3516 		return KERN_NOT_SUPPORTED;
3517 	}
3518 	if (purgable || entry_for_jit) {
3519 		return KERN_NOT_SUPPORTED;
3520 	}
3521 
3522 	effective_min_offset = map->min_offset;
3523 
3524 	if (vmk_flags.vmkf_beyond_max) {
3525 		return KERN_NOT_SUPPORTED;
3526 	} else {
3527 		effective_max_offset = map->max_offset;
3528 	}
3529 
3530 	if (size == 0 ||
3531 	    (offset & FOURK_PAGE_MASK) != 0) {
3532 		*address = 0;
3533 		return KERN_INVALID_ARGUMENT;
3534 	}
3535 
3536 #define RETURN(value)   { result = value; goto BailOut; }
3537 
3538 	assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK));
3539 	assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK));
3540 
3541 	if (!anywhere && overwrite) {
3542 		return KERN_NOT_SUPPORTED;
3543 	}
3544 
3545 	fourk_start = *address;
3546 	fourk_size = size;
3547 	fourk_end = fourk_start + fourk_size;
3548 
3549 	start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map));
3550 	end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map));
3551 	size = end - start;
3552 
3553 	if (anywhere) {
3554 		return KERN_NOT_SUPPORTED;
3555 	} else {
3556 		/*
3557 		 *	Verify that:
3558 		 *		the address doesn't itself violate
3559 		 *		the mask requirement.
3560 		 */
3561 
3562 		vm_map_lock(map);
3563 		map_locked = TRUE;
3564 		if ((start & mask) != 0) {
3565 			RETURN(KERN_NO_SPACE);
3566 		}
3567 
3568 		/*
3569 		 *	...	the address is within bounds
3570 		 */
3571 
3572 		end = start + size;
3573 
3574 		if ((start < effective_min_offset) ||
3575 		    (end > effective_max_offset) ||
3576 		    (start >= end)) {
3577 			RETURN(KERN_INVALID_ADDRESS);
3578 		}
3579 
3580 		/*
3581 		 *	...	the starting address isn't allocated
3582 		 */
3583 		if (vm_map_lookup_entry(map, start, &entry)) {
3584 			vm_object_t cur_object, shadow_object;
3585 
3586 			/*
			 * We might already have some 4K mappings
3588 			 * in a 16K page here.
3589 			 */
3590 
3591 			if (entry->vme_end - entry->vme_start
3592 			    != SIXTEENK_PAGE_SIZE) {
3593 				RETURN(KERN_NO_SPACE);
3594 			}
3595 			if (entry->is_sub_map) {
3596 				RETURN(KERN_NO_SPACE);
3597 			}
3598 			if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
3599 				RETURN(KERN_NO_SPACE);
3600 			}
3601 
3602 			/* go all the way down the shadow chain */
3603 			cur_object = VME_OBJECT(entry);
3604 			vm_object_lock(cur_object);
3605 			while (cur_object->shadow != VM_OBJECT_NULL) {
3606 				shadow_object = cur_object->shadow;
3607 				vm_object_lock(shadow_object);
3608 				vm_object_unlock(cur_object);
3609 				cur_object = shadow_object;
3610 				shadow_object = VM_OBJECT_NULL;
3611 			}
3612 			if (cur_object->internal ||
3613 			    cur_object->pager == NULL) {
3614 				vm_object_unlock(cur_object);
3615 				RETURN(KERN_NO_SPACE);
3616 			}
3617 			if (cur_object->pager->mo_pager_ops
3618 			    != &fourk_pager_ops) {
3619 				vm_object_unlock(cur_object);
3620 				RETURN(KERN_NO_SPACE);
3621 			}
3622 			fourk_object = cur_object;
3623 			fourk_mem_obj = fourk_object->pager;
3624 
3625 			/* keep the "4K" object alive */
3626 			vm_object_reference_locked(fourk_object);
3627 			memory_object_reference(fourk_mem_obj);
3628 			vm_object_unlock(fourk_object);
3629 
3630 			/* merge permissions */
3631 			entry->protection |= cur_protection;
3632 			entry->max_protection |= max_protection;
3633 
3634 			if ((entry->protection & VM_PROT_WRITE) &&
3635 			    (entry->protection & VM_PROT_ALLEXEC) &&
3636 			    fourk_binary_compatibility_unsafe &&
3637 			    fourk_binary_compatibility_allow_wx) {
3638 				/* write+execute: need to be "jit" */
3639 				entry->used_for_jit = TRUE;
3640 			}
3641 			goto map_in_fourk_pager;
3642 		}
3643 
3644 		/*
3645 		 *	...	the next region doesn't overlap the
3646 		 *		end point.
3647 		 */
3648 
3649 		if ((entry->vme_next != vm_map_to_entry(map)) &&
3650 		    (entry->vme_next->vme_start < end)) {
3651 			RETURN(KERN_NO_SPACE);
3652 		}
3653 	}
3654 
3655 	/*
3656 	 *	At this point,
3657 	 *		"start" and "end" should define the endpoints of the
3658 	 *			available new range, and
3659 	 *		"entry" should refer to the region before the new
3660 	 *			range, and
3661 	 *
3662 	 *		the map should be locked.
3663 	 */
3664 
3665 	/* create a new "4K" pager */
3666 	fourk_mem_obj = fourk_pager_create();
3667 	fourk_object = fourk_pager_to_vm_object(fourk_mem_obj);
3668 	assert(fourk_object);
3669 
	/* keep the "4K" object alive */
3671 	vm_object_reference(fourk_object);
3672 
3673 	/* create a "copy" object, to map the "4K" object copy-on-write */
3674 	fourk_copy = TRUE;
3675 	result = vm_object_copy_strategically(fourk_object,
3676 	    0,
3677 	    end - start,
3678 	    &copy_object,
3679 	    &copy_offset,
3680 	    &fourk_copy);
3681 	assert(result == KERN_SUCCESS);
3682 	assert(copy_object != VM_OBJECT_NULL);
3683 	assert(copy_offset == 0);
3684 
3685 	/* map the "4K" pager's copy object */
3686 	new_entry = vm_map_entry_insert(map,
3687 	    entry,
3688 	    vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)),
3689 	    vm_map_round_page(end, VM_MAP_PAGE_MASK(map)),
3690 	    copy_object,
3691 	    0,                      /* offset */
3692 	    vmk_flags,
3693 	    FALSE,                  /* needs_copy */
3694 	    cur_protection, max_protection,
3695 	    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3696 	    VM_INHERIT_NONE : inheritance),
3697 	    no_cache,
3698 	    permanent,
3699 	    no_copy_on_read,
3700 	    superpage_size,
3701 	    clear_map_aligned,
3702 	    is_submap,
3703 	    FALSE,                  /* jit */
3704 	    alias,
3705 	    translated_allow_execute);
3706 	entry = new_entry;
3707 
3708 #if VM_MAP_DEBUG_FOURK
3709 	if (vm_map_debug_fourk) {
3710 		printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n",
3711 		    map,
3712 		    (uint64_t) entry->vme_start,
3713 		    (uint64_t) entry->vme_end,
3714 		    fourk_mem_obj);
3715 	}
3716 #endif /* VM_MAP_DEBUG_FOURK */
3717 
3718 	new_mapping_established = TRUE;
3719 
3720 map_in_fourk_pager:
3721 	/* "map" the original "object" where it belongs in the "4K" pager */
3722 	fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK);
3723 	fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE);
3724 	if (fourk_size > SIXTEENK_PAGE_SIZE) {
3725 		fourk_pager_index_num = 4;
3726 	} else {
3727 		fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE);
3728 	}
3729 	if (fourk_pager_index_start + fourk_pager_index_num > 4) {
3730 		fourk_pager_index_num = 4 - fourk_pager_index_start;
3731 	}
3732 	for (cur_idx = 0;
3733 	    cur_idx < fourk_pager_index_num;
3734 	    cur_idx++) {
3735 		vm_object_t             old_object;
3736 		vm_object_offset_t      old_offset;
3737 
3738 		kr = fourk_pager_populate(fourk_mem_obj,
3739 		    TRUE,                       /* overwrite */
3740 		    fourk_pager_index_start + cur_idx,
3741 		    object,
3742 		    (object
3743 		    ? (offset +
3744 		    (cur_idx * FOURK_PAGE_SIZE))
3745 		    : 0),
3746 		    &old_object,
3747 		    &old_offset);
3748 #if VM_MAP_DEBUG_FOURK
3749 		if (vm_map_debug_fourk) {
3750 			if (old_object == (vm_object_t) -1 &&
3751 			    old_offset == (vm_object_offset_t) -1) {
3752 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3753 				    "pager [%p:0x%llx] "
3754 				    "populate[%d] "
3755 				    "[object:%p,offset:0x%llx]\n",
3756 				    map,
3757 				    (uint64_t) entry->vme_start,
3758 				    (uint64_t) entry->vme_end,
3759 				    fourk_mem_obj,
3760 				    VME_OFFSET(entry),
3761 				    fourk_pager_index_start + cur_idx,
3762 				    object,
3763 				    (object
3764 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3765 				    : 0));
3766 			} else {
3767 				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3768 				    "pager [%p:0x%llx] "
3769 				    "populate[%d] [object:%p,offset:0x%llx] "
3770 				    "old [%p:0x%llx]\n",
3771 				    map,
3772 				    (uint64_t) entry->vme_start,
3773 				    (uint64_t) entry->vme_end,
3774 				    fourk_mem_obj,
3775 				    VME_OFFSET(entry),
3776 				    fourk_pager_index_start + cur_idx,
3777 				    object,
3778 				    (object
3779 				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3780 				    : 0),
3781 				    old_object,
3782 				    old_offset);
3783 			}
3784 		}
3785 #endif /* VM_MAP_DEBUG_FOURK */
3786 
3787 		assert(kr == KERN_SUCCESS);
3788 		if (object != old_object &&
3789 		    object != VM_OBJECT_NULL &&
3790 		    object != (vm_object_t) -1) {
3791 			vm_object_reference(object);
3792 		}
3793 		if (object != old_object &&
3794 		    old_object != VM_OBJECT_NULL &&
3795 		    old_object != (vm_object_t) -1) {
3796 			vm_object_deallocate(old_object);
3797 		}
3798 	}
3799 
3800 BailOut:
3801 	assert(map_locked == TRUE);
3802 
3803 	if (result == KERN_SUCCESS) {
3804 		vm_prot_t pager_prot;
3805 		memory_object_t pager;
3806 
3807 #if DEBUG
3808 		if (pmap_empty &&
3809 		    !(vmk_flags.vmkf_no_pmap_check)) {
3810 			assert(pmap_is_empty(map->pmap,
3811 			    *address,
3812 			    *address + size));
3813 		}
3814 #endif /* DEBUG */
3815 
3816 		/*
3817 		 * For "named" VM objects, let the pager know that the
3818 		 * memory object is being mapped.  Some pagers need to keep
3819 		 * track of this, to know when they can reclaim the memory
3820 		 * object, for example.
3821 		 * VM calls memory_object_map() for each mapping (specifying
3822 		 * the protection of each mapping) and calls
3823 		 * memory_object_last_unmap() when all the mappings are gone.
3824 		 */
3825 		pager_prot = max_protection;
3826 		if (needs_copy) {
3827 			/*
3828 			 * Copy-On-Write mapping: won't modify
3829 			 * the memory object.
3830 			 */
3831 			pager_prot &= ~VM_PROT_WRITE;
3832 		}
3833 		if (!is_submap &&
3834 		    object != VM_OBJECT_NULL &&
3835 		    object->named &&
3836 		    object->pager != MEMORY_OBJECT_NULL) {
3837 			vm_object_lock(object);
3838 			pager = object->pager;
3839 			if (object->named &&
3840 			    pager != MEMORY_OBJECT_NULL) {
3841 				assert(object->pager_ready);
3842 				vm_object_mapping_wait(object, THREAD_UNINT);
3843 				vm_object_mapping_begin(object);
3844 				vm_object_unlock(object);
3845 
3846 				kr = memory_object_map(pager, pager_prot);
3847 				assert(kr == KERN_SUCCESS);
3848 
3849 				vm_object_lock(object);
3850 				vm_object_mapping_end(object);
3851 			}
3852 			vm_object_unlock(object);
3853 		}
3854 		if (!is_submap &&
3855 		    fourk_object != VM_OBJECT_NULL &&
3856 		    fourk_object->named &&
3857 		    fourk_object->pager != MEMORY_OBJECT_NULL) {
3858 			vm_object_lock(fourk_object);
3859 			pager = fourk_object->pager;
3860 			if (fourk_object->named &&
3861 			    pager != MEMORY_OBJECT_NULL) {
3862 				assert(fourk_object->pager_ready);
3863 				vm_object_mapping_wait(fourk_object,
3864 				    THREAD_UNINT);
3865 				vm_object_mapping_begin(fourk_object);
3866 				vm_object_unlock(fourk_object);
3867 
3868 				kr = memory_object_map(pager, VM_PROT_READ);
3869 				assert(kr == KERN_SUCCESS);
3870 
3871 				vm_object_lock(fourk_object);
3872 				vm_object_mapping_end(fourk_object);
3873 			}
3874 			vm_object_unlock(fourk_object);
3875 		}
3876 	}
3877 
3878 	if (fourk_object != VM_OBJECT_NULL) {
3879 		vm_object_deallocate(fourk_object);
3880 		fourk_object = VM_OBJECT_NULL;
3881 		memory_object_deallocate(fourk_mem_obj);
3882 		fourk_mem_obj = MEMORY_OBJECT_NULL;
3883 	}
3884 
3885 	assert(map_locked == TRUE);
3886 
3887 	if (!keep_map_locked) {
3888 		vm_map_unlock(map);
3889 		map_locked = FALSE;
3890 	}
3891 
3892 	/*
3893 	 * We can't hold the map lock if we enter this block.
3894 	 */
3895 
3896 	if (result == KERN_SUCCESS) {
3897 		/*	Wire down the new entry if the user
3898 		 *	requested all new map entries be wired.
3899 		 */
3900 		if ((map->wiring_required) || (superpage_size)) {
3901 			assert(!keep_map_locked);
3902 			pmap_empty = FALSE; /* pmap won't be empty */
3903 			kr = vm_map_wire_kernel(map, start, end,
3904 			    new_entry->protection, VM_KERN_MEMORY_MLOCK,
3905 			    TRUE);
3906 			result = kr;
3907 		}
3908 
3909 	}
3910 
3911 	if (result != KERN_SUCCESS) {
3912 		if (new_mapping_established) {
3913 			/*
3914 			 * We have to get rid of the new mappings since we
3915 			 * won't make them available to the user.
3916 			 * Try and do that atomically, to minimize the risk
3917 			 * that someone else create new mappings that range.
3918 			 */
3919 
3920 			if (!map_locked) {
3921 				vm_map_lock(map);
3922 				map_locked = TRUE;
3923 			}
3924 			(void)vm_map_delete(map, *address, *address + size,
3925 			    VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
3926 			    &zap_list);
3927 		}
3928 	}
3929 
3930 	/*
3931 	 * The caller is responsible for releasing the lock if it requested to
3932 	 * keep the map locked.
3933 	 */
3934 	if (map_locked && !keep_map_locked) {
3935 		vm_map_unlock(map);
3936 	}
3937 
3938 	vm_map_zap_dispose(&zap_list);
3939 
3940 	return result;
3941 
3942 #undef  RETURN
3943 }
3944 #endif /* __arm64__ */
3945 
3946 /*
3947  * Counters for the prefault optimization.
3948  */
3949 int64_t vm_prefault_nb_pages = 0;
3950 int64_t vm_prefault_nb_bailout = 0;
3951 
3952 static kern_return_t
vm_map_enter_mem_object_helper(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t tag,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,upl_page_list_ptr_t page_list,unsigned int page_list_count)3953 vm_map_enter_mem_object_helper(
3954 	vm_map_t                target_map,
3955 	vm_map_offset_t         *address,
3956 	vm_map_size_t           initial_size,
3957 	vm_map_offset_t         mask,
3958 	int                     flags,
3959 	vm_map_kernel_flags_t   vmk_flags,
3960 	vm_tag_t                tag,
3961 	ipc_port_t              port,
3962 	vm_object_offset_t      offset,
3963 	boolean_t               copy,
3964 	vm_prot_t               cur_protection,
3965 	vm_prot_t               max_protection,
3966 	vm_inherit_t            inheritance,
3967 	upl_page_list_ptr_t     page_list,
3968 	unsigned int            page_list_count)
3969 {
3970 	vm_map_address_t        map_addr;
3971 	vm_map_size_t           map_size;
3972 	vm_object_t             object;
3973 	vm_object_size_t        size;
3974 	kern_return_t           result;
3975 	boolean_t               mask_cur_protection, mask_max_protection;
3976 	boolean_t               kernel_prefault, try_prefault = (page_list_count != 0);
3977 	vm_map_offset_t         offset_in_mapping = 0;
3978 #if __arm64__
3979 	boolean_t               fourk = vmk_flags.vmkf_fourk;
3980 #endif /* __arm64__ */
3981 
3982 	if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
3983 		/* XXX TODO4K prefaulting depends on page size... */
3984 		try_prefault = FALSE;
3985 	}
3986 
3987 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
3988 
3989 	mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
3990 	mask_max_protection = max_protection & VM_PROT_IS_MASK;
3991 	cur_protection &= ~VM_PROT_IS_MASK;
3992 	max_protection &= ~VM_PROT_IS_MASK;
3993 
3994 	/*
3995 	 * Check arguments for validity
3996 	 */
3997 	if ((target_map == VM_MAP_NULL) ||
3998 	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
3999 	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4000 	    (inheritance > VM_INHERIT_LAST_VALID) ||
4001 	    (try_prefault && (copy || !page_list)) ||
4002 	    initial_size == 0) {
4003 		return KERN_INVALID_ARGUMENT;
4004 	}
4005 
4006 	/*
4007 	 * Redirect to kmem_ranges[data]
4008 	 */
4009 	if (target_map == kernel_map) {
4010 		vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
4011 	}
4012 
4013 #if __arm64__
4014 	if (fourk && VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4015 		/* no "fourk" if map is using a sub-page page size */
4016 		fourk = FALSE;
4017 	}
4018 	if (fourk) {
4019 		map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK);
4020 		map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK);
4021 	} else
4022 #endif /* __arm64__ */
4023 	{
4024 		map_addr = vm_map_trunc_page(*address,
4025 		    VM_MAP_PAGE_MASK(target_map));
4026 		map_size = vm_map_round_page(initial_size,
4027 		    VM_MAP_PAGE_MASK(target_map));
4028 	}
4029 	size = vm_object_round_page(initial_size);
4030 
4031 	/*
4032 	 * Find the vm object (if any) corresponding to this port.
4033 	 */
4034 	if (!IP_VALID(port)) {
4035 		object = VM_OBJECT_NULL;
4036 		offset = 0;
4037 		copy = FALSE;
4038 	} else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4039 		vm_named_entry_t        named_entry;
4040 		vm_object_offset_t      data_offset;
4041 
4042 		named_entry = mach_memory_entry_from_port(port);
4043 
4044 		if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4045 		    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4046 			data_offset = named_entry->data_offset;
4047 			offset += named_entry->data_offset;
4048 		} else {
4049 			data_offset = 0;
4050 		}
4051 
4052 		/* a few checks to make sure user is obeying rules */
4053 		if (size == 0) {
4054 			if (offset >= named_entry->size) {
4055 				return KERN_INVALID_RIGHT;
4056 			}
4057 			size = named_entry->size - offset;
4058 		}
4059 		if (mask_max_protection) {
4060 			max_protection &= named_entry->protection;
4061 		}
4062 		if (mask_cur_protection) {
4063 			cur_protection &= named_entry->protection;
4064 		}
4065 		if ((named_entry->protection & max_protection) !=
4066 		    max_protection) {
4067 			return KERN_INVALID_RIGHT;
4068 		}
4069 		if ((named_entry->protection & cur_protection) !=
4070 		    cur_protection) {
4071 			return KERN_INVALID_RIGHT;
4072 		}
4073 		if (offset + size < offset) {
4074 			/* overflow */
4075 			return KERN_INVALID_ARGUMENT;
4076 		}
4077 		if (named_entry->size < (offset + initial_size)) {
4078 			return KERN_INVALID_ARGUMENT;
4079 		}
4080 
4081 		if (named_entry->is_copy) {
4082 			/* for a vm_map_copy, we can only map it whole */
4083 			if ((size != named_entry->size) &&
4084 			    (vm_map_round_page(size,
4085 			    VM_MAP_PAGE_MASK(target_map)) ==
4086 			    named_entry->size)) {
4087 				/* XXX FBDP use the rounded size... */
4088 				size = vm_map_round_page(
4089 					size,
4090 					VM_MAP_PAGE_MASK(target_map));
4091 			}
4092 		}
4093 
4094 		/* the callers parameter offset is defined to be the */
4095 		/* offset from beginning of named entry offset in object */
4096 		offset = offset + named_entry->offset;
4097 
4098 		if (!VM_MAP_PAGE_ALIGNED(size,
4099 		    VM_MAP_PAGE_MASK(target_map))) {
4100 			/*
4101 			 * Let's not map more than requested;
4102 			 * vm_map_enter() will handle this "not map-aligned"
4103 			 * case.
4104 			 */
4105 			map_size = size;
4106 		}
4107 
4108 		named_entry_lock(named_entry);
4109 		if (named_entry->is_sub_map) {
4110 			vm_map_t                submap;
4111 
4112 			if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4113 			    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4114 				panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4115 			}
4116 
4117 			submap = named_entry->backing.map;
4118 			vm_map_reference(submap);
4119 			named_entry_unlock(named_entry);
4120 
4121 			vmk_flags.vmkf_submap = TRUE;
4122 
4123 			result = vm_map_enter(target_map,
4124 			    &map_addr,
4125 			    map_size,
4126 			    mask,
4127 			    flags,
4128 			    vmk_flags,
4129 			    tag,
4130 			    (vm_object_t)(uintptr_t) submap,
4131 			    offset,
4132 			    copy,
4133 			    cur_protection,
4134 			    max_protection,
4135 			    inheritance);
4136 			if (result != KERN_SUCCESS) {
4137 				vm_map_deallocate(submap);
4138 			} else {
4139 				/*
4140 				 * No need to lock "submap" just to check its
4141 				 * "mapped" flag: that flag is never reset
4142 				 * once it's been set and if we race, we'll
4143 				 * just end up setting it twice, which is OK.
4144 				 */
4145 				if (submap->mapped_in_other_pmaps == FALSE &&
4146 				    vm_map_pmap(submap) != PMAP_NULL &&
4147 				    vm_map_pmap(submap) !=
4148 				    vm_map_pmap(target_map)) {
4149 					/*
4150 					 * This submap is being mapped in a map
4151 					 * that uses a different pmap.
4152 					 * Set its "mapped_in_other_pmaps" flag
4153 					 * to indicate that we now need to
4154 					 * remove mappings from all pmaps rather
4155 					 * than just the submap's pmap.
4156 					 */
4157 					vm_map_lock(submap);
4158 					submap->mapped_in_other_pmaps = TRUE;
4159 					vm_map_unlock(submap);
4160 				}
4161 				*address = map_addr;
4162 			}
4163 			return result;
4164 		} else if (named_entry->is_copy) {
4165 			kern_return_t   kr;
4166 			vm_map_copy_t   copy_map;
4167 			vm_map_entry_t  copy_entry;
4168 			vm_map_offset_t copy_addr;
4169 			vm_map_copy_t   target_copy_map;
4170 			vm_map_offset_t overmap_start, overmap_end;
4171 			vm_map_offset_t trimmed_start;
4172 			vm_map_size_t   target_size;
4173 
4174 			if (flags & ~(VM_FLAGS_FIXED |
4175 			    VM_FLAGS_ANYWHERE |
4176 			    VM_FLAGS_OVERWRITE |
4177 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4178 			    VM_FLAGS_RETURN_DATA_ADDR |
4179 			    VM_FLAGS_ALIAS_MASK)) {
4180 				named_entry_unlock(named_entry);
4181 				return KERN_INVALID_ARGUMENT;
4182 			}
4183 
4184 			copy_map = named_entry->backing.copy;
4185 			assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4186 			if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4187 				/* unsupported type; should not happen */
4188 				printf("vm_map_enter_mem_object: "
4189 				    "memory_entry->backing.copy "
4190 				    "unsupported type 0x%x\n",
4191 				    copy_map->type);
4192 				named_entry_unlock(named_entry);
4193 				return KERN_INVALID_ARGUMENT;
4194 			}
4195 
4196 			if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4197 				DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4198 			}
4199 
4200 			if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4201 			    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4202 				offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map);
4203 				if (flags & VM_FLAGS_RETURN_4K_DATA_ADDR) {
4204 					offset_in_mapping &= ~((signed)(0xFFF));
4205 				}
4206 			}
4207 
4208 			target_copy_map = VM_MAP_COPY_NULL;
4209 			target_size = copy_map->size;
4210 			overmap_start = 0;
4211 			overmap_end = 0;
4212 			trimmed_start = 0;
4213 			if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4214 				DEBUG4K_ADJUST("adjusting...\n");
4215 				kr = vm_map_copy_adjust_to_target(
4216 					copy_map,
4217 					offset /* includes data_offset */,
4218 					initial_size,
4219 					target_map,
4220 					copy,
4221 					&target_copy_map,
4222 					&overmap_start,
4223 					&overmap_end,
4224 					&trimmed_start);
4225 				if (kr != KERN_SUCCESS) {
4226 					named_entry_unlock(named_entry);
4227 					return kr;
4228 				}
4229 				target_size = target_copy_map->size;
4230 				if (trimmed_start >= data_offset) {
4231 					data_offset = offset & VM_MAP_PAGE_MASK(target_map);
4232 				} else {
4233 					data_offset -= trimmed_start;
4234 				}
4235 			} else {
4236 				/*
4237 				 * Assert that the vm_map_copy is coming from the right
4238 				 * zone and hasn't been forged
4239 				 */
4240 				vm_map_copy_require(copy_map);
4241 				target_copy_map = copy_map;
4242 			}
4243 
4244 			/* reserve a contiguous range */
4245 			kr = vm_map_enter(target_map,
4246 			    &map_addr,
4247 			    vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)),
4248 			    mask,
4249 			    flags & (VM_FLAGS_ANYWHERE |
4250 			    VM_FLAGS_OVERWRITE |
4251 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4252 			    VM_FLAGS_RETURN_DATA_ADDR),
4253 			    vmk_flags,
4254 			    tag,
4255 			    VM_OBJECT_NULL,
4256 			    0,
4257 			    FALSE,               /* copy */
4258 			    cur_protection,
4259 			    max_protection,
4260 			    inheritance);
4261 			if (kr != KERN_SUCCESS) {
4262 				DEBUG4K_ERROR("kr 0x%x\n", kr);
4263 				if (target_copy_map != copy_map) {
4264 					vm_map_copy_discard(target_copy_map);
4265 					target_copy_map = VM_MAP_COPY_NULL;
4266 				}
4267 				named_entry_unlock(named_entry);
4268 				return kr;
4269 			}
4270 
4271 			copy_addr = map_addr;
4272 
4273 			for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4274 			    copy_entry != vm_map_copy_to_entry(target_copy_map);
4275 			    copy_entry = copy_entry->vme_next) {
4276 				int                     remap_flags;
4277 				vm_map_kernel_flags_t   vmk_remap_flags;
4278 				vm_map_t                copy_submap;
4279 				vm_object_t             copy_object;
4280 				vm_map_size_t           copy_size;
4281 				vm_object_offset_t      copy_offset;
4282 				int                     copy_vm_alias;
4283 				boolean_t               do_copy;
4284 
4285 				do_copy = FALSE;
4286 				remap_flags = 0;
4287 				vmk_remap_flags = VM_MAP_KERNEL_FLAGS_NONE;
4288 
4289 				copy_object = VME_OBJECT(copy_entry);
4290 				copy_offset = VME_OFFSET(copy_entry);
4291 				copy_size = (copy_entry->vme_end -
4292 				    copy_entry->vme_start);
4293 				VM_GET_FLAGS_ALIAS(flags, copy_vm_alias);
4294 				if (copy_vm_alias == 0) {
4295 					/*
4296 					 * Caller does not want a specific
4297 					 * alias for this new mapping:  use
4298 					 * the alias of the original mapping.
4299 					 */
4300 					copy_vm_alias = VME_ALIAS(copy_entry);
4301 				}
4302 
4303 				/* sanity check */
4304 				if ((copy_addr + copy_size) >
4305 				    (map_addr +
4306 				    overmap_start + overmap_end +
4307 				    named_entry->size /* XXX full size */)) {
4308 					/* over-mapping too much !? */
4309 					kr = KERN_INVALID_ARGUMENT;
4310 					DEBUG4K_ERROR("kr 0x%x\n", kr);
4311 					/* abort */
4312 					break;
4313 				}
4314 
4315 				/* take a reference on the object */
4316 				if (copy_entry->is_sub_map) {
4317 					vmk_remap_flags.vmkf_submap = TRUE;
4318 					copy_submap = VME_SUBMAP(copy_entry);
4319 					vm_map_lock(copy_submap);
4320 					vm_map_reference(copy_submap);
4321 					vm_map_unlock(copy_submap);
4322 					copy_object = (vm_object_t)(uintptr_t) copy_submap;
4323 				} else {
4324 					if (!copy &&
4325 					    copy_object != VM_OBJECT_NULL &&
4326 					    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4327 						/*
4328 						 * We need to resolve our side of this
4329 						 * "symmetric" copy-on-write now; we
4330 						 * need a new object to map and share,
4331 						 * instead of the current one which
4332 						 * might still be shared with the
4333 						 * original mapping.
4334 						 *
4335 						 * Note: A "vm_map_copy_t" does not
4336 						 * have a lock but we're protected by
4337 						 * the named entry's lock here.
4338 						 */
4339 						// assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4340 						VME_OBJECT_SHADOW(copy_entry, copy_size);
4341 						assert(copy_object != VME_OBJECT(copy_entry));
4342 						if (!copy_entry->needs_copy &&
4343 						    copy_entry->protection & VM_PROT_WRITE) {
4344 							vm_prot_t prot;
4345 
4346 							prot = copy_entry->protection & ~VM_PROT_WRITE;
4347 							vm_object_pmap_protect(copy_object,
4348 							    copy_offset,
4349 							    copy_size,
4350 							    PMAP_NULL,
4351 							    PAGE_SIZE,
4352 							    0,
4353 							    prot);
4354 						}
4355 
4356 						copy_entry->needs_copy = FALSE;
4357 						copy_entry->is_shared = TRUE;
4358 						copy_object = VME_OBJECT(copy_entry);
4359 						copy_offset = VME_OFFSET(copy_entry);
4360 						vm_object_lock(copy_object);
4361 						/* we're about to make a shared mapping of this object */
4362 						copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4363 						copy_object->true_share = TRUE;
4364 						vm_object_unlock(copy_object);
4365 					}
4366 
4367 					if (copy_object != VM_OBJECT_NULL &&
4368 					    copy_object->named &&
4369 					    copy_object->pager != MEMORY_OBJECT_NULL &&
4370 					    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4371 						memory_object_t pager;
4372 						vm_prot_t       pager_prot;
4373 
4374 						/*
4375 						 * For "named" VM objects, let the pager know that the
4376 						 * memory object is being mapped.  Some pagers need to keep
4377 						 * track of this, to know when they can reclaim the memory
4378 						 * object, for example.
4379 						 * VM calls memory_object_map() for each mapping (specifying
4380 						 * the protection of each mapping) and calls
4381 						 * memory_object_last_unmap() when all the mappings are gone.
4382 						 */
4383 						pager_prot = max_protection;
4384 						if (copy) {
4385 							/*
4386 							 * Copy-On-Write mapping: won't modify the
4387 							 * memory object.
4388 							 */
4389 							pager_prot &= ~VM_PROT_WRITE;
4390 						}
4391 						vm_object_lock(copy_object);
4392 						pager = copy_object->pager;
4393 						if (copy_object->named &&
4394 						    pager != MEMORY_OBJECT_NULL &&
4395 						    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4396 							assert(copy_object->pager_ready);
4397 							vm_object_mapping_wait(copy_object, THREAD_UNINT);
4398 							vm_object_mapping_begin(copy_object);
4399 							vm_object_unlock(copy_object);
4400 
4401 							kr = memory_object_map(pager, pager_prot);
4402 							assert(kr == KERN_SUCCESS);
4403 
4404 							vm_object_lock(copy_object);
4405 							vm_object_mapping_end(copy_object);
4406 						}
4407 						vm_object_unlock(copy_object);
4408 					}
4409 
4410 					/*
4411 					 *	Perform the copy if requested
4412 					 */
4413 
4414 					if (copy && copy_object != VM_OBJECT_NULL) {
4415 						vm_object_t             new_object;
4416 						vm_object_offset_t      new_offset;
4417 
4418 						result = vm_object_copy_strategically(copy_object, copy_offset,
4419 						    copy_size,
4420 						    &new_object, &new_offset,
4421 						    &do_copy);
4422 
4423 
4424 						if (result == KERN_MEMORY_RESTART_COPY) {
4425 							boolean_t success;
4426 							boolean_t src_needs_copy;
4427 
4428 							/*
4429 							 * XXX
4430 							 * We currently ignore src_needs_copy.
4431 							 * This really is the issue of how to make
4432 							 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4433 							 * non-kernel users to use. Solution forthcoming.
4434 							 * In the meantime, since we don't allow non-kernel
4435 							 * memory managers to specify symmetric copy,
4436 							 * we won't run into problems here.
4437 							 */
4438 							new_object = copy_object;
4439 							new_offset = copy_offset;
4440 							success = vm_object_copy_quickly(new_object,
4441 							    new_offset,
4442 							    copy_size,
4443 							    &src_needs_copy,
4444 							    &do_copy);
4445 							assert(success);
4446 							result = KERN_SUCCESS;
4447 						}
4448 						if (result != KERN_SUCCESS) {
4449 							kr = result;
4450 							break;
4451 						}
4452 
4453 						copy_object = new_object;
4454 						copy_offset = new_offset;
4455 						/*
4456 						 * No extra object reference for the mapping:
4457 						 * the mapping should be the only thing keeping
4458 						 * this new object alive.
4459 						 */
4460 					} else {
4461 						/*
4462 						 * We already have the right object
4463 						 * to map.
4464 						 */
4465 						copy_object = VME_OBJECT(copy_entry);
4466 						/* take an extra ref for the mapping below */
4467 						vm_object_reference(copy_object);
4468 					}
4469 				}
4470 
4471 				/* over-map the object into destination */
4472 				remap_flags |= flags;
4473 				remap_flags |= VM_FLAGS_FIXED;
4474 				remap_flags |= VM_FLAGS_OVERWRITE;
4475 				remap_flags &= ~VM_FLAGS_ANYWHERE;
4476 				if (!copy && !copy_entry->is_sub_map) {
4477 					/*
4478 					 * copy-on-write should have been
4479 					 * resolved at this point, or we would
4480 					 * end up sharing instead of copying.
4481 					 */
4482 					assert(!copy_entry->needs_copy);
4483 				}
4484 #if XNU_TARGET_OS_OSX
4485 				if (copy_entry->used_for_jit) {
4486 					vmk_remap_flags.vmkf_map_jit = TRUE;
4487 				}
4488 #endif /* XNU_TARGET_OS_OSX */
4489 
4490 				assertf((copy_vm_alias & VME_ALIAS_MASK) == copy_vm_alias,
4491 				    "VM Tag truncated from 0x%x to 0x%x\n", copy_vm_alias, (copy_vm_alias & VME_ALIAS_MASK));
4492 				kr = vm_map_enter(target_map,
4493 				    &copy_addr,
4494 				    copy_size,
4495 				    (vm_map_offset_t) 0,
4496 				    remap_flags,
4497 				    vmk_remap_flags,
4498 				    (vm_tag_t) copy_vm_alias, /* see comment at end of vm_fault_unwire re. cast*/
4499 				    copy_object,
4500 				    copy_offset,
4501 				    ((copy_object == NULL)
4502 				    ? FALSE
4503 				    : (copy || copy_entry->needs_copy)),
4504 				    cur_protection,
4505 				    max_protection,
4506 				    inheritance);
4507 				if (kr != KERN_SUCCESS) {
4508 					DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4509 					if (copy_entry->is_sub_map) {
4510 						vm_map_deallocate(copy_submap);
4511 					} else {
4512 						vm_object_deallocate(copy_object);
4513 					}
4514 					/* abort */
4515 					break;
4516 				}
4517 
4518 				/* next mapping */
4519 				copy_addr += copy_size;
4520 			}
4521 
4522 			if (kr == KERN_SUCCESS) {
4523 				if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4524 				    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4525 					*address = map_addr + offset_in_mapping;
4526 				} else {
4527 					*address = map_addr;
4528 				}
4529 				if (overmap_start) {
4530 					*address += overmap_start;
4531 					DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address);
4532 				}
4533 			}
4534 			named_entry_unlock(named_entry);
4535 			if (target_copy_map != copy_map) {
4536 				vm_map_copy_discard(target_copy_map);
4537 				target_copy_map = VM_MAP_COPY_NULL;
4538 			}
4539 
4540 			if (kr != KERN_SUCCESS) {
4541 				if (!(flags & VM_FLAGS_OVERWRITE)) {
4542 					/* deallocate the contiguous range */
4543 					(void) vm_deallocate(target_map,
4544 					    map_addr,
4545 					    map_size);
4546 				}
4547 			}
4548 
4549 			return kr;
4550 		}
4551 
4552 		if (named_entry->is_object) {
4553 			unsigned int    access;
4554 			vm_prot_t       protections;
4555 			unsigned int    wimg_mode;
4556 
4557 			/* we are mapping a VM object */
4558 
4559 			protections = named_entry->protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
4560 			access = GET_MAP_MEM(named_entry->protection);
4561 
4562 			if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4563 			    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4564 				offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4565 				if (flags & VM_FLAGS_RETURN_4K_DATA_ADDR) {
4566 					offset_in_mapping &= ~((signed)(0xFFF));
4567 				}
4568 				offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4569 				map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map));
4570 			}
4571 
4572 			object = vm_named_entry_to_vm_object(named_entry);
4573 			assert(object != VM_OBJECT_NULL);
4574 			vm_object_lock(object);
4575 			named_entry_unlock(named_entry);
4576 
4577 			vm_object_reference_locked(object);
4578 
4579 			wimg_mode = object->wimg_bits;
4580 			vm_prot_to_wimg(access, &wimg_mode);
4581 			if (object->wimg_bits != wimg_mode) {
4582 				vm_object_change_wimg_mode(object, wimg_mode);
4583 			}
4584 
4585 			vm_object_unlock(object);
4586 		} else {
4587 			panic("invalid VM named entry %p", named_entry);
4588 		}
4589 	} else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4590 		/*
4591 		 * JMM - This is temporary until we unify named entries
4592 		 * and raw memory objects.
4593 		 *
4594 		 * Detected fake ip_kotype for a memory object.  In
4595 		 * this case, the port isn't really a port at all, but
4596 		 * instead is just a raw memory object.
4597 		 */
4598 		if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4599 		    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4600 			panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4601 		}
4602 
4603 		object = memory_object_to_vm_object((memory_object_t)port);
4604 		if (object == VM_OBJECT_NULL) {
4605 			return KERN_INVALID_OBJECT;
4606 		}
4607 		vm_object_reference(object);
4608 
4609 		/* wait for object (if any) to be ready */
4610 		if (object != VM_OBJECT_NULL) {
4611 			if (object == kernel_object) {
4612 				printf("Warning: Attempt to map kernel object"
4613 				    " by a non-private kernel entity\n");
4614 				return KERN_INVALID_OBJECT;
4615 			}
4616 			if (!object->pager_ready) {
4617 				vm_object_lock(object);
4618 
4619 				while (!object->pager_ready) {
4620 					vm_object_wait(object,
4621 					    VM_OBJECT_EVENT_PAGER_READY,
4622 					    THREAD_UNINT);
4623 					vm_object_lock(object);
4624 				}
4625 				vm_object_unlock(object);
4626 			}
4627 		}
4628 	} else {
4629 		return KERN_INVALID_OBJECT;
4630 	}
4631 
4632 	if (object != VM_OBJECT_NULL &&
4633 	    object->named &&
4634 	    object->pager != MEMORY_OBJECT_NULL &&
4635 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4636 		memory_object_t pager;
4637 		vm_prot_t       pager_prot;
4638 		kern_return_t   kr;
4639 
4640 		/*
4641 		 * For "named" VM objects, let the pager know that the
4642 		 * memory object is being mapped.  Some pagers need to keep
4643 		 * track of this, to know when they can reclaim the memory
4644 		 * object, for example.
4645 		 * VM calls memory_object_map() for each mapping (specifying
4646 		 * the protection of each mapping) and calls
4647 		 * memory_object_last_unmap() when all the mappings are gone.
4648 		 */
4649 		pager_prot = max_protection;
4650 		if (copy) {
4651 			/*
4652 			 * Copy-On-Write mapping: won't modify the
4653 			 * memory object.
4654 			 */
4655 			pager_prot &= ~VM_PROT_WRITE;
4656 		}
4657 		vm_object_lock(object);
4658 		pager = object->pager;
4659 		if (object->named &&
4660 		    pager != MEMORY_OBJECT_NULL &&
4661 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4662 			assert(object->pager_ready);
4663 			vm_object_mapping_wait(object, THREAD_UNINT);
4664 			vm_object_mapping_begin(object);
4665 			vm_object_unlock(object);
4666 
4667 			kr = memory_object_map(pager, pager_prot);
4668 			assert(kr == KERN_SUCCESS);
4669 
4670 			vm_object_lock(object);
4671 			vm_object_mapping_end(object);
4672 		}
4673 		vm_object_unlock(object);
4674 	}
4675 
4676 	/*
4677 	 *	Perform the copy if requested
4678 	 */
4679 
4680 	if (copy) {
4681 		vm_object_t             new_object;
4682 		vm_object_offset_t      new_offset;
4683 
4684 		result = vm_object_copy_strategically(object, offset,
4685 		    map_size,
4686 		    &new_object, &new_offset,
4687 		    &copy);
4688 
4689 
4690 		if (result == KERN_MEMORY_RESTART_COPY) {
4691 			boolean_t success;
4692 			boolean_t src_needs_copy;
4693 
4694 			/*
4695 			 * XXX
4696 			 * We currently ignore src_needs_copy.
4697 			 * This really is the issue of how to make
4698 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4699 			 * non-kernel users to use. Solution forthcoming.
4700 			 * In the meantime, since we don't allow non-kernel
4701 			 * memory managers to specify symmetric copy,
4702 			 * we won't run into problems here.
4703 			 */
4704 			new_object = object;
4705 			new_offset = offset;
4706 			success = vm_object_copy_quickly(new_object,
4707 			    new_offset,
4708 			    map_size,
4709 			    &src_needs_copy,
4710 			    &copy);
4711 			assert(success);
4712 			result = KERN_SUCCESS;
4713 		}
4714 		/*
4715 		 *	Throw away the reference to the
4716 		 *	original object, as it won't be mapped.
4717 		 */
4718 
4719 		vm_object_deallocate(object);
4720 
4721 		if (result != KERN_SUCCESS) {
4722 			return result;
4723 		}
4724 
4725 		object = new_object;
4726 		offset = new_offset;
4727 	}
4728 
4729 	/*
4730 	 * If non-kernel users want to try to prefault pages, the mapping and prefault
4731 	 * needs to be atomic.
4732 	 */
4733 	kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4734 	vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4735 
4736 #if __arm64__
4737 	if (fourk) {
4738 		/* map this object in a "4K" pager */
4739 		result = vm_map_enter_fourk(target_map,
4740 		    &map_addr,
4741 		    map_size,
4742 		    (vm_map_offset_t) mask,
4743 		    flags,
4744 		    vmk_flags,
4745 		    tag,
4746 		    object,
4747 		    offset,
4748 		    copy,
4749 		    cur_protection,
4750 		    max_protection,
4751 		    inheritance);
4752 	} else
4753 #endif /* __arm64__ */
4754 	{
4755 		result = vm_map_enter(target_map,
4756 		    &map_addr, map_size,
4757 		    (vm_map_offset_t)mask,
4758 		    flags,
4759 		    vmk_flags,
4760 		    tag,
4761 		    object, offset,
4762 		    copy,
4763 		    cur_protection, max_protection,
4764 		    inheritance);
4765 	}
4766 	if (result != KERN_SUCCESS) {
4767 		vm_object_deallocate(object);
4768 	}
4769 
4770 	/*
4771 	 * Try to prefault, and do not forget to release the vm map lock.
4772 	 */
4773 	if (result == KERN_SUCCESS && try_prefault) {
4774 		mach_vm_address_t va = map_addr;
4775 		kern_return_t kr = KERN_SUCCESS;
4776 		unsigned int i = 0;
4777 		int pmap_options;
4778 
4779 		pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4780 		if (object->internal) {
4781 			pmap_options |= PMAP_OPTIONS_INTERNAL;
4782 		}
4783 
4784 		for (i = 0; i < page_list_count; ++i) {
4785 			if (!UPL_VALID_PAGE(page_list, i)) {
4786 				if (kernel_prefault) {
4787 					assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4788 					result = KERN_MEMORY_ERROR;
4789 					break;
4790 				}
4791 			} else {
4792 				/*
4793 				 * If this function call failed, we should stop
4794 				 * trying to optimize, other calls are likely
4795 				 * going to fail too.
4796 				 *
4797 				 * We are not gonna report an error for such
4798 				 * failure though. That's an optimization, not
4799 				 * something critical.
4800 				 */
4801 				kr = pmap_enter_options(target_map->pmap,
4802 				    va, UPL_PHYS_PAGE(page_list, i),
4803 				    cur_protection, VM_PROT_NONE,
4804 				    0, TRUE, pmap_options, NULL);
4805 				if (kr != KERN_SUCCESS) {
4806 					OSIncrementAtomic64(&vm_prefault_nb_bailout);
4807 					if (kernel_prefault) {
4808 						result = kr;
4809 					}
4810 					break;
4811 				}
4812 				OSIncrementAtomic64(&vm_prefault_nb_pages);
4813 			}
4814 
4815 			/* Next virtual address */
4816 			va += PAGE_SIZE;
4817 		}
4818 		if (vmk_flags.vmkf_keep_map_locked) {
4819 			vm_map_unlock(target_map);
4820 		}
4821 	}
4822 
4823 	if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4824 	    VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4825 		*address = map_addr + offset_in_mapping;
4826 	} else {
4827 		*address = map_addr;
4828 	}
4829 	return result;
4830 }
4831 
4832 kern_return_t
vm_map_enter_mem_object(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t tag,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)4833 vm_map_enter_mem_object(
4834 	vm_map_t                target_map,
4835 	vm_map_offset_t         *address,
4836 	vm_map_size_t           initial_size,
4837 	vm_map_offset_t         mask,
4838 	int                     flags,
4839 	vm_map_kernel_flags_t   vmk_flags,
4840 	vm_tag_t                tag,
4841 	ipc_port_t              port,
4842 	vm_object_offset_t      offset,
4843 	boolean_t               copy,
4844 	vm_prot_t               cur_protection,
4845 	vm_prot_t               max_protection,
4846 	vm_inherit_t            inheritance)
4847 {
4848 	kern_return_t ret;
4849 
4850 	ret = vm_map_enter_mem_object_helper(target_map,
4851 	    address,
4852 	    initial_size,
4853 	    mask,
4854 	    flags,
4855 	    vmk_flags,
4856 	    tag,
4857 	    port,
4858 	    offset,
4859 	    copy,
4860 	    cur_protection,
4861 	    max_protection,
4862 	    inheritance,
4863 	    NULL,
4864 	    0);
4865 
4866 #if KASAN
4867 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4868 		kasan_notify_address(*address, initial_size);
4869 	}
4870 #endif
4871 
4872 	return ret;
4873 }
4874 
4875 kern_return_t
vm_map_enter_mem_object_prefault(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t tag,ipc_port_t port,vm_object_offset_t offset,vm_prot_t cur_protection,vm_prot_t max_protection,upl_page_list_ptr_t page_list,unsigned int page_list_count)4876 vm_map_enter_mem_object_prefault(
4877 	vm_map_t                target_map,
4878 	vm_map_offset_t         *address,
4879 	vm_map_size_t           initial_size,
4880 	vm_map_offset_t         mask,
4881 	int                     flags,
4882 	vm_map_kernel_flags_t   vmk_flags,
4883 	vm_tag_t                tag,
4884 	ipc_port_t              port,
4885 	vm_object_offset_t      offset,
4886 	vm_prot_t               cur_protection,
4887 	vm_prot_t               max_protection,
4888 	upl_page_list_ptr_t     page_list,
4889 	unsigned int            page_list_count)
4890 {
4891 	kern_return_t ret;
4892 
4893 	ret = vm_map_enter_mem_object_helper(target_map,
4894 	    address,
4895 	    initial_size,
4896 	    mask,
4897 	    flags,
4898 	    vmk_flags,
4899 	    tag,
4900 	    port,
4901 	    offset,
4902 	    FALSE,
4903 	    cur_protection,
4904 	    max_protection,
4905 	    VM_INHERIT_DEFAULT,
4906 	    page_list,
4907 	    page_list_count);
4908 
4909 #if KASAN
4910 	if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4911 		kasan_notify_address(*address, initial_size);
4912 	}
4913 #endif
4914 
4915 	return ret;
4916 }
4917 
4918 
/*
 * vm_map_enter_mem_object_control:
 *
 * Map the VM object backing the given memory object control into
 * "target_map".  Mirrors vm_map_enter_mem_object() but takes a
 * memory_object_control_t instead of a port.
 *
 * Returns KERN_INVALID_ARGUMENT for bad map/protections/inheritance/size,
 * KERN_INVALID_OBJECT if the control does not resolve to a mappable
 * object (or resolves to kernel_object), otherwise the result of
 * vm_map_enter() / vm_map_enter_fourk().
 */
kern_return_t
vm_map_enter_mem_object_control(
	vm_map_t                target_map,
	vm_map_offset_t         *address,
	vm_map_size_t           initial_size,
	vm_map_offset_t         mask,
	int                     flags,
	vm_map_kernel_flags_t   vmk_flags,
	vm_tag_t                tag,
	memory_object_control_t control,
	vm_object_offset_t      offset,
	boolean_t               copy,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance)
{
	vm_map_address_t        map_addr;
	vm_map_size_t           map_size;
	vm_object_t             object;
	vm_object_size_t        size;
	kern_return_t           result;
	memory_object_t         pager;
	vm_prot_t               pager_prot;
	kern_return_t           kr;
#if __arm64__
	boolean_t               fourk = vmk_flags.vmkf_fourk;
#endif /* __arm64__ */

	/*
	 * Check arguments for validity
	 */
	if ((target_map == VM_MAP_NULL) ||
	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
	    (inheritance > VM_INHERIT_LAST_VALID) ||
	    initial_size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

#if __arm64__
	/* "4K" mappings only make sense in maps with a larger page size */
	if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) {
		fourk = FALSE;
	}

	if (fourk) {
		map_addr = vm_map_trunc_page(*address,
		    FOURK_PAGE_MASK);
		map_size = vm_map_round_page(initial_size,
		    FOURK_PAGE_MASK);
	} else
#endif /* __arm64__ */
	{
		/* align the requested range to the map's page size */
		map_addr = vm_map_trunc_page(*address,
		    VM_MAP_PAGE_MASK(target_map));
		map_size = vm_map_round_page(initial_size,
		    VM_MAP_PAGE_MASK(target_map));
	}
	size = vm_object_round_page(initial_size);

	object = memory_object_control_to_vm_object(control);

	if (object == VM_OBJECT_NULL) {
		return KERN_INVALID_OBJECT;
	}

	if (object == kernel_object) {
		printf("Warning: Attempt to map kernel object"
		    " by a non-private kernel entity\n");
		return KERN_INVALID_OBJECT;
	}

	/* take an extra reference on the object for the new mapping */
	vm_object_lock(object);
	object->ref_count++;

	/*
	 * For "named" VM objects, let the pager know that the
	 * memory object is being mapped.  Some pagers need to keep
	 * track of this, to know when they can reclaim the memory
	 * object, for example.
	 * VM calls memory_object_map() for each mapping (specifying
	 * the protection of each mapping) and calls
	 * memory_object_last_unmap() when all the mappings are gone.
	 */
	pager_prot = max_protection;
	if (copy) {
		/* a copy-on-write mapping will never modify the pager's data */
		pager_prot &= ~VM_PROT_WRITE;
	}
	pager = object->pager;
	if (object->named &&
	    pager != MEMORY_OBJECT_NULL &&
	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
		assert(object->pager_ready);
		vm_object_mapping_wait(object, THREAD_UNINT);
		vm_object_mapping_begin(object);
		/* drop the object lock across the upcall to the pager */
		vm_object_unlock(object);

		kr = memory_object_map(pager, pager_prot);
		assert(kr == KERN_SUCCESS);

		vm_object_lock(object);
		vm_object_mapping_end(object);
	}
	vm_object_unlock(object);

	/*
	 *	Perform the copy if requested
	 */

	if (copy) {
		vm_object_t             new_object;
		vm_object_offset_t      new_offset;

		result = vm_object_copy_strategically(object, offset, size,
		    &new_object, &new_offset,
		    &copy);


		if (result == KERN_MEMORY_RESTART_COPY) {
			boolean_t success;
			boolean_t src_needs_copy;

			/*
			 * XXX
			 * We currently ignore src_needs_copy.
			 * This really is the issue of how to make
			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
			 * non-kernel users to use. Solution forthcoming.
			 * In the meantime, since we don't allow non-kernel
			 * memory managers to specify symmetric copy,
			 * we won't run into problems here.
			 */
			new_object = object;
			new_offset = offset;
			success = vm_object_copy_quickly(new_object,
			    new_offset, size,
			    &src_needs_copy,
			    &copy);
			assert(success);
			result = KERN_SUCCESS;
		}
		/*
		 *	Throw away the reference to the
		 *	original object, as it won't be mapped.
		 */

		vm_object_deallocate(object);

		if (result != KERN_SUCCESS) {
			return result;
		}

		object = new_object;
		offset = new_offset;
	}

#if __arm64__
	if (fourk) {
		/* map this object through a "4K" pager */
		result = vm_map_enter_fourk(target_map,
		    &map_addr,
		    map_size,
		    (vm_map_offset_t)mask,
		    flags,
		    vmk_flags,
		    tag,
		    object, offset,
		    copy,
		    cur_protection, max_protection,
		    inheritance);
	} else
#endif /* __arm64__ */
	{
		result = vm_map_enter(target_map,
		    &map_addr, map_size,
		    (vm_map_offset_t)mask,
		    flags,
		    vmk_flags,
		    tag,
		    object, offset,
		    copy,
		    cur_protection, max_protection,
		    inheritance);
	}
	if (result != KERN_SUCCESS) {
		/* mapping failed: release the reference taken above */
		vm_object_deallocate(object);
	}
	*address = map_addr;

	return result;
}
5108 
5109 
5110 #if     VM_CPM
5111 
5112 #ifdef MACH_ASSERT
5113 extern pmap_paddr_t     avail_start, avail_end;
5114 #endif
5115 
5116 /*
5117  *	Allocate memory in the specified map, with the caveat that
5118  *	the memory is physically contiguous.  This call may fail
5119  *	if the system can't find sufficient contiguous memory.
5120  *	This call may cause or lead to heart-stopping amounts of
5121  *	paging activity.
5122  *
5123  *	Memory obtained from this call should be freed in the
5124  *	normal way, viz., via vm_deallocate.
5125  */
/*
 * vm_map_enter_cpm:
 *
 * Allocate "size" bytes of physically contiguous memory and map it
 * into "map" at "*addr" (or anywhere in the map if VM_FLAGS_ANYWHERE
 * is set).  The pages are wired and entered into the pmap up front,
 * so the range will not fault.  See the block comment above for the
 * caller-visible contract.
 */
kern_return_t
vm_map_enter_cpm(
	vm_map_t                map,
	vm_map_offset_t *addr,
	vm_map_size_t           size,
	int                     flags)
{
	vm_object_t             cpm_obj;
	pmap_t                  pmap;
	vm_page_t               m, pages;
	kern_return_t           kr;
	vm_map_offset_t         va, start, end, offset;
#if     MACH_ASSERT
	vm_map_offset_t         prev_addr = 0;
#endif  /* MACH_ASSERT */

	boolean_t               anywhere = ((VM_FLAGS_ANYWHERE & flags) != 0);
	vm_tag_t tag;

	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
		/* XXX TODO4K do we need to support this? */
		*addr = 0;
		return KERN_NOT_SUPPORTED;
	}

	/* extract the VM tag encoded in the flags */
	VM_GET_FLAGS_ALIAS(flags, tag);

	if (size == 0) {
		*addr = 0;
		return KERN_SUCCESS;
	}
	if (anywhere) {
		*addr = vm_map_min(map);
	} else {
		*addr = vm_map_trunc_page(*addr,
		    VM_MAP_PAGE_MASK(map));
	}
	size = vm_map_round_page(size,
	    VM_MAP_PAGE_MASK(map));

	/*
	 * LP64todo - cpm_allocate should probably allow
	 * allocations of >4GB, but not with the current
	 * algorithm, so just cast down the size for now.
	 */
	if (size > VM_MAX_ADDRESS) {
		return KERN_RESOURCE_SHORTAGE;
	}
	if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size),
	    &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) {
		return kr;
	}

	/* fresh internal object to hold the contiguous pages */
	cpm_obj = vm_object_allocate((vm_object_size_t)size);
	assert(cpm_obj != VM_OBJECT_NULL);
	assert(cpm_obj->internal);
	assert(cpm_obj->vo_size == (vm_object_size_t)size);
	assert(cpm_obj->can_persist == FALSE);
	assert(cpm_obj->pager_created == FALSE);
	assert(cpm_obj->pageout == FALSE);
	assert(cpm_obj->shadow == VM_OBJECT_NULL);

	/*
	 *	Insert pages into object.
	 */

	vm_object_lock(cpm_obj);
	for (offset = 0; offset < size; offset += PAGE_SIZE) {
		/* unlink the next page from the cpm_allocate() chain */
		m = pages;
		pages = NEXT_PAGE(m);
		*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;

		assert(!m->vmp_gobbled);
		assert(!m->vmp_wanted);
		assert(!m->vmp_pageout);
		assert(!m->vmp_tabled);
		assert(VM_PAGE_WIRED(m));
		assert(m->vmp_busy);
		assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT));

		m->vmp_busy = FALSE;
		vm_page_insert(m, cpm_obj, offset);
	}
	assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
	vm_object_unlock(cpm_obj);

	/*
	 *	Hang onto a reference on the object in case a
	 *	multi-threaded application for some reason decides
	 *	to deallocate the portion of the address space into
	 *	which we will insert this object.
	 *
	 *	Unfortunately, we must insert the object now before
	 *	we can talk to the pmap module about which addresses
	 *	must be wired down.  Hence, the race with a multi-
	 *	threaded app.
	 */
	vm_object_reference(cpm_obj);

	/*
	 *	Insert object into map.
	 */

	kr = vm_map_enter(
		map,
		addr,
		size,
		(vm_map_offset_t)0,
		flags,
		VM_MAP_KERNEL_FLAGS_NONE,
		cpm_obj,
		(vm_object_offset_t)0,
		FALSE,
		VM_PROT_ALL,
		VM_PROT_ALL,
		VM_INHERIT_DEFAULT);

	if (kr != KERN_SUCCESS) {
		/*
		 *	A CPM object doesn't have can_persist set,
		 *	so all we have to do is deallocate it to
		 *	free up these pages.
		 */
		assert(cpm_obj->pager_created == FALSE);
		assert(cpm_obj->can_persist == FALSE);
		assert(cpm_obj->pageout == FALSE);
		assert(cpm_obj->shadow == VM_OBJECT_NULL);
		vm_object_deallocate(cpm_obj); /* kill acquired ref */
		vm_object_deallocate(cpm_obj); /* kill creation ref */
	}

	/*
	 *	Inform the physical mapping system that the
	 *	range of addresses may not fault, so that
	 *	page tables and such can be locked down as well.
	 */
	start = *addr;
	end = start + size;
	pmap = vm_map_pmap(map);
	pmap_pageable(pmap, start, end, FALSE);

	/*
	 *	Enter each page into the pmap, to avoid faults.
	 *	Note that this loop could be coded more efficiently,
	 *	if the need arose, rather than looking up each page
	 *	again.
	 */
	for (offset = 0, va = start; offset < size;
	    va += PAGE_SIZE, offset += PAGE_SIZE) {
		int type_of_fault;

		vm_object_lock(cpm_obj);
		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
		assert(m != VM_PAGE_NULL);

		vm_page_zero_fill(m);

		type_of_fault = DBG_ZERO_FILL_FAULT;

		vm_fault_enter(m, pmap, va,
		    PAGE_SIZE, 0,
		    VM_PROT_ALL, VM_PROT_WRITE,
		    VM_PAGE_WIRED(m),
		    FALSE,                             /* change_wiring */
		    VM_KERN_MEMORY_NONE,                             /* tag - not wiring */
		    FALSE,                             /* no_cache */
		    FALSE,                             /* cs_bypass */
		    0,                                 /* user_tag */
		    0,                             /* pmap_options */
		    NULL,                              /* need_retry */
		    &type_of_fault);

		vm_object_unlock(cpm_obj);
	}

#if     MACH_ASSERT
	/*
	 *	Verify ordering in address space.
	 */
	for (offset = 0; offset < size; offset += PAGE_SIZE) {
		vm_object_lock(cpm_obj);
		m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
		vm_object_unlock(cpm_obj);
		if (m == VM_PAGE_NULL) {
			panic("vm_allocate_cpm:  obj %p off 0x%llx no page",
			    cpm_obj, (uint64_t)offset);
		}
		assert(m->vmp_tabled);
		assert(!m->vmp_busy);
		assert(!m->vmp_wanted);
		assert(!m->vmp_fictitious);
		assert(!m->vmp_private);
		assert(!m->vmp_absent);
		assert(!m->vmp_error);
		assert(!m->vmp_cleaning);
		assert(!m->vmp_laundry);
		assert(!m->vmp_precious);
		assert(!m->vmp_clustered);
		if (offset != 0) {
			/* physical pages must be consecutive */
			if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
				printf("start 0x%llx end 0x%llx va 0x%llx\n",
				    (uint64_t)start, (uint64_t)end, (uint64_t)va);
				printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset);
				printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr);
				panic("vm_allocate_cpm:  pages not contig!");
			}
		}
		prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
	}
#endif  /* MACH_ASSERT */

	vm_object_deallocate(cpm_obj); /* kill extra ref */

	return kr;
}
5341 
5342 
5343 #else   /* VM_CPM */
5344 
5345 /*
5346  *	Interface is defined in all cases, but unless the kernel
5347  *	is built explicitly for this option, the interface does
5348  *	nothing.
5349  */
5350 
/*
 * Stub for kernels built without VM_CPM: contiguous physical
 * allocation is not available, so always fail.
 */
kern_return_t
vm_map_enter_cpm(
	__unused vm_map_t       map,
	__unused vm_map_offset_t        *addr,
	__unused vm_map_size_t  size,
	__unused int            flags)
{
	return KERN_FAILURE;
}
5360 #endif /* VM_CPM */
5361 
5362 /* Not used without nested pmaps */
5363 #ifndef NO_NESTED_PMAP
5364 /*
5365  * Clip and unnest a portion of a nested submap mapping.
5366  */
5367 
5368 
/*
 * vm_map_clip_unnest:
 *
 * Clip "entry" (a nested submap mapping, i.e. is_sub_map && use_pmap)
 * to [start_unnest, end_unnest) and undo the pmap-level nesting for
 * that range.  The platform may widen the range via
 * pmap_adjust_unnest_parameters().  Caller holds the map locked.
 */
static void
vm_map_clip_unnest(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t start_unnest,
	vm_map_offset_t end_unnest)
{
	/* remember the caller's range for logging, before adjustment */
	vm_map_offset_t old_start_unnest = start_unnest;
	vm_map_offset_t old_end_unnest = end_unnest;

	assert(entry->is_sub_map);
	assert(VME_SUBMAP(entry) != NULL);
	assert(entry->use_pmap);

	/*
	 * Query the platform for the optimal unnest range.
	 * DRK: There's some duplication of effort here, since
	 * callers may have adjusted the range to some extent. This
	 * routine was introduced to support 1GiB subtree nesting
	 * for x86 platforms, which can also nest on 2MiB boundaries
	 * depending on size/alignment.
	 */
	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
		assert(VME_SUBMAP(entry)->is_nested_map);
		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
		log_unnest_badness(map,
		    old_start_unnest,
		    old_end_unnest,
		    VME_SUBMAP(entry)->is_nested_map,
		    (entry->vme_start +
		    VME_SUBMAP(entry)->lowest_unnestable_start -
		    VME_OFFSET(entry)));
	}

	/* the (possibly adjusted) unnest range must lie within the entry */
	if (entry->vme_start > start_unnest ||
	    entry->vme_end < end_unnest) {
		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
		    "bad nested entry: start=0x%llx end=0x%llx\n",
		    (long long)start_unnest, (long long)end_unnest,
		    (long long)entry->vme_start, (long long)entry->vme_end);
	}

	/* clip the entry so it exactly covers the unnest range */
	if (start_unnest > entry->vme_start) {
		_vm_map_clip_start(&map->hdr,
		    entry,
		    start_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
	if (entry->vme_end > end_unnest) {
		_vm_map_clip_end(&map->hdr,
		    entry,
		    end_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}

	/* detach the shared page tables for this range from our pmap */
	pmap_unnest(map->pmap,
	    entry->vme_start,
	    entry->vme_end - entry->vme_start);
	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
		/* clean up parent map/maps */
		vm_map_submap_pmap_clean(
			map, entry->vme_start,
			entry->vme_end,
			VME_SUBMAP(entry),
			VME_OFFSET(entry));
	}
	/* the entry no longer shares the submap's pmap */
	entry->use_pmap = FALSE;
	if ((map->pmap != kernel_pmap) &&
	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
	}
}
5449 #endif  /* NO_NESTED_PMAP */
5450 
/*
 * Panic helper: clipping an entry marked vme_atomic is forbidden;
 * report the map, entry bounds and the offending clip address.
 */
__abortlike
static void
__vm_map_clip_atomic_entry_panic(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t where)
{
	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end,
	    (uint64_t)where);
}
5464 
5465 /*
5466  *	vm_map_clip_start:	[ internal use only ]
5467  *
5468  *	Asserts that the given entry begins at or after
5469  *	the specified address; if necessary,
5470  *	it splits the entry into two.
5471  */
void
vm_map_clip_start(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t startaddr)
{
#ifndef NO_NESTED_PMAP
	if (entry->is_sub_map &&
	    entry->use_pmap &&
	    startaddr >= entry->vme_start) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure "startaddr" is no longer in a nested range
		 * before we clip.  Unnest only the minimum range the platform
		 * can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (startaddr > entry->vme_start) {
		if (VME_OBJECT(entry) &&
		    !entry->is_sub_map &&
		    VME_OBJECT(entry)->phys_contiguous) {
			/*
			 * Physically contiguous objects are mapped as a
			 * whole; throw away the whole pmap range before
			 * splitting the entry.
			 */
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		if (entry->vme_atomic) {
			/* atomic entries must never be split */
			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
		}

		DTRACE_VM5(
			vm_map_clip_start,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, startaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_start(&map->hdr, entry, startaddr);
		/* splitting the entry invalidates the cached first-free hint */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5524 
5525 
/*
 * Clip the start of an entry in a vm_map_copy's entry list; no-op when
 * "startaddr" is not strictly inside the entry.  No pmap/unnesting work
 * is needed for copy lists.
 */
#define vm_map_copy_clip_start(copy, entry, startaddr) \
	MACRO_BEGIN \
	if ((startaddr) > (entry)->vme_start) \
	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
	MACRO_END
5531 
5532 /*
5533  *	This routine is called only when it is known that
5534  *	the entry must be split.
5535  */
/*
 * _vm_map_clip_start:
 *
 * Split "entry" at "start": a new entry covering [vme_start, start)
 * is inserted BEFORE it, and "entry" is shrunk to begin at "start".
 * This routine is called only when it is known that the entry must
 * be split (start is strictly inside the entry).
 */
static void
_vm_map_clip_start(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         start)
{
	vm_map_entry_t  new_entry;

	/*
	 *	Split off the front portion --
	 *	note that we must insert the new
	 *	entry BEFORE this one, so that
	 *	this entry has the specified starting
	 *	address.
	 */

	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(start,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* clone the entry; the clone becomes the front piece */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	new_entry->vme_end = start;
	assert(new_entry->vme_start < new_entry->vme_end);
	/* advance the original entry's object offset by the clipped amount */
	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
	assert(start < entry->vme_end);
	entry->vme_start = start;

	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);

	/* both entries now reference the same submap/object */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5574 
5575 
5576 /*
5577  *	vm_map_clip_end:	[ internal use only ]
5578  *
5579  *	Asserts that the given entry ends at or before
5580  *	the specified address; if necessary,
5581  *	it splits the entry into two.
5582  */
void
vm_map_clip_end(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t endaddr)
{
	if (endaddr > entry->vme_end) {
		/*
		 * Within the scope of this clipping, limit "endaddr" to
		 * the end of this map entry...
		 */
		endaddr = entry->vme_end;
	}
#ifndef NO_NESTED_PMAP
	if (entry->is_sub_map && entry->use_pmap) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure the range between the start of this entry and
		 * the new "endaddr" is no longer nested before we clip.
		 * Unnest only the minimum range the platform can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = entry->vme_start;
		end_unnest =
		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
		    ~(pmap_shared_region_size_min(map->pmap) - 1);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (endaddr < entry->vme_end) {
		if (VME_OBJECT(entry) &&
		    !entry->is_sub_map &&
		    VME_OBJECT(entry)->phys_contiguous) {
			/*
			 * Physically contiguous objects are mapped as a
			 * whole; throw away the whole pmap range before
			 * splitting the entry.
			 */
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		if (entry->vme_atomic) {
			/* atomic entries must never be split */
			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
		}
		DTRACE_VM5(
			vm_map_clip_end,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, endaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_end(&map->hdr, entry, endaddr);
		/* splitting the entry invalidates the cached first-free hint */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5641 
5642 
/*
 *	vm_map_copy_clip_end:	[ internal use only ]
 *
 *	Like vm_map_clip_end() but for an entry in a vm_map_copy:
 *	splits "entry" at "endaddr" if the entry extends past it.
 *	Unlike vm_map_clip_end(), no pmap/unnest handling is done here.
 */
#define vm_map_copy_clip_end(copy, entry, endaddr) \
	MACRO_BEGIN \
	if ((endaddr) < (entry)->vme_end) \
	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
	MACRO_END
5648 
5649 /*
5650  *	This routine is called only when it is known that
5651  *	the entry must be split.
5652  */
static void
_vm_map_clip_end(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         end)
{
	vm_map_entry_t  new_entry;

	/*
	 *	Create a new entry and insert it
	 *	AFTER the specified entry
	 */

	/* A map-aligned entry may only be clipped on a map page boundary. */
	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(end,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* The new entry starts as an exact clone of the original. */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	assert(entry->vme_start < end);
	/*
	 * Shrink the original to [vme_start, end) and make the new
	 * entry cover [end, old vme_end).
	 */
	new_entry->vme_start = entry->vme_end = end;
	/*
	 * Advance the new entry's object offset by the size of the
	 * first (retained) portion.  entry->vme_start is unchanged,
	 * so (end - entry->vme_start) is that size.
	 */
	VME_OFFSET_SET(new_entry,
	    VME_OFFSET(new_entry) + (end - entry->vme_start));
	assert(new_entry->vme_start < new_entry->vme_end);

	_vm_map_store_entry_link(map_header, entry, new_entry);

	/*
	 * Both halves now reference the same submap/object, so take
	 * an additional reference for the new entry.
	 */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5688 
5689 
5690 /*
5691  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
5692  *
5693  *	Asserts that the starting and ending region
5694  *	addresses fall within the valid range of the map.
5695  */
/* NOTE: clamps "start" and "end" in place rather than reporting an error. */
#define VM_MAP_RANGE_CHECK(map, start, end)     \
	MACRO_BEGIN                             \
	if (start < vm_map_min(map))            \
	        start = vm_map_min(map);        \
	if (end > vm_map_max(map))              \
	        end = vm_map_max(map);          \
	if (start > end)                        \
	        start = end;                    \
	MACRO_END
5705 
5706 /*
5707  *	vm_map_range_check:	[ internal use only ]
5708  *
5709  *	Check that the region defined by the specified start and
5710  *	end addresses are wholly contained within a single map
 *	entry or set of adjacent map entries of the specified map,
5712  *	i.e. the specified region contains no unmapped space.
5713  *	If any or all of the region is unmapped, FALSE is returned.
5714  *	Otherwise, TRUE is returned and if the output argument 'entry'
5715  *	is not NULL it points to the map entry containing the start
5716  *	of the region.
5717  *
5718  *	The map is locked for reading on entry and is left locked.
5719  */
5720 static boolean_t
vm_map_range_check(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t * entry)5721 vm_map_range_check(
5722 	vm_map_t                map,
5723 	vm_map_offset_t         start,
5724 	vm_map_offset_t         end,
5725 	vm_map_entry_t          *entry)
5726 {
5727 	vm_map_entry_t          cur;
5728 	vm_map_offset_t         prev;
5729 
5730 	/*
5731 	 *      Basic sanity checks first
5732 	 */
5733 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5734 		return FALSE;
5735 	}
5736 
5737 	/*
5738 	 *      Check first if the region starts within a valid
5739 	 *	mapping for the map.
5740 	 */
5741 	if (!vm_map_lookup_entry(map, start, &cur)) {
5742 		return FALSE;
5743 	}
5744 
5745 	/*
5746 	 *	Optimize for the case that the region is contained
5747 	 *	in a single map entry.
5748 	 */
5749 	if (entry != (vm_map_entry_t *) NULL) {
5750 		*entry = cur;
5751 	}
5752 	if (end <= cur->vme_end) {
5753 		return TRUE;
5754 	}
5755 
5756 	/*
5757 	 *      If the region is not wholly contained within a
5758 	 *      single entry, walk the entries looking for holes.
5759 	 */
5760 	prev = cur->vme_end;
5761 	cur = cur->vme_next;
5762 	while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5763 		if (end <= cur->vme_end) {
5764 			return TRUE;
5765 		}
5766 		prev = cur->vme_end;
5767 		cur = cur->vme_next;
5768 	}
5769 	return FALSE;
5770 }
5771 
5772 /*
5773  *	vm_map_protect:
5774  *
5775  *	Sets the protection of the specified address
5776  *	region in the target map.  If "set_max" is
5777  *	specified, the maximum protection is to be set;
5778  *	otherwise, only the current protection is affected.
5779  */
kern_return_t
vm_map_protect(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_prot_t       new_prot,
	boolean_t       set_max)
{
	vm_map_entry_t                  current;
	vm_map_offset_t                 prev;
	vm_map_entry_t                  entry;
	vm_prot_t                       new_max;
	int                             pmap_options = 0;
	kern_return_t                   kr;

	/*
	 * VM_PROT_COPY asks that the range first be replaced with a
	 * private copy (copy-on-write remap of the range onto itself);
	 * the remaining protection bits are then applied below.
	 */
	if (new_prot & VM_PROT_COPY) {
		vm_map_offset_t         new_start;
		vm_prot_t               cur_prot, max_prot;
		vm_map_kernel_flags_t   kflags;

		/* LP64todo - see below */
		if (start >= map->max_offset) {
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * W^X enforcement: refuse a writable+executable copy in
		 * a code-signing-enforced map when policy says to fail.
		 */
		if ((new_prot & VM_PROT_ALLEXEC) &&
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if XNU_TARGET_OS_OSX && __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* XNU_TARGET_OS_OSX && __arm64__ */
		    ) &&
		    VM_MAP_POLICY_WX_FAIL(map)) {
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) start,
			    uint64_t, (uint64_t) end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (current_task()->bsd_info
			    ? proc_name_address(current_task()->bsd_info)
			    : "?"),
			    __FUNCTION__);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * Let vm_map_remap_extract() know that it will need to:
		 * + make a copy of the mapping
		 * + add VM_PROT_WRITE to the max protections
		 * + remove any protections that are no longer allowed from the
		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
		 *   example).
		 * Note that "max_prot" is an IN/OUT parameter only for this
		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
		 * only.
		 */
		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
		cur_prot = VM_PROT_NONE;
		kflags = VM_MAP_KERNEL_FLAGS_NONE;
		kflags.vmkf_remap_prot_copy = TRUE;
		kflags.vmkf_overwrite_immutable = TRUE;
		new_start = start;
		kr = vm_map_remap(map,
		    &new_start,
		    end - start,
		    0, /* mask */
		    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
		    kflags,
		    0,
		    map,
		    start,
		    TRUE, /* copy-on-write remapping! */
		    &cur_prot, /* IN/OUT */
		    &max_prot, /* IN/OUT */
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		/* The copy is done; apply the remaining bits below. */
		new_prot &= ~VM_PROT_COPY;
	}

	vm_map_lock(map);

	/* LP64todo - remove this check when vm_map_commpage64()
	 * no longer has to stuff in a map_entry for the commpage
	 * above the map's max_offset.
	 */
	if (start >= map->max_offset) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	while (1) {
		/*
		 *      Lookup the entry.  If it doesn't start in a valid
		 *	entry, return an error.
		 */
		if (!vm_map_lookup_entry(map, start, &entry)) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
			start = SUPERPAGE_ROUND_DOWN(start);
			continue;
		}
		break;
	}
	/* Superpage mappings can only change protection as a whole. */
	if (entry->superpage_size) {
		end = SUPERPAGE_ROUND_UP(end);
	}

	/*
	 *	Make a first pass to check for protection and address
	 *	violations.
	 */

	current = entry;
	prev = current->vme_start;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		/*
		 * If there is a hole, return an error.
		 */
		if (current->vme_start != prev) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		new_max = current->max_protection;

#if defined(__x86_64__)
		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
		}
#endif
		/* The request must fit within the entry's max protection. */
		if ((new_prot & new_max) != new_prot) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * JIT entries whose pmap enforces a protection policy
		 * for their current protection cannot be changed here.
		 */
		if (current->used_for_jit &&
		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * W^X enforcement: strip the execute bits from a
		 * write+exec request (or fail outright, per policy).
		 */
		if ((new_prot & VM_PROT_WRITE) &&
		    (new_prot & VM_PROT_ALLEXEC) &&
#if XNU_TARGET_OS_OSX
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
		    ) &&
#endif /* XNU_TARGET_OS_OSX */
		    !(current->used_for_jit)) {
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) current->vme_start,
			    uint64_t, (uint64_t) current->vme_end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (current_task()->bsd_info
			    ? proc_name_address(current_task()->bsd_info)
			    : "?"),
			    __FUNCTION__);
			new_prot &= ~VM_PROT_ALLEXEC;
			if (VM_MAP_POLICY_WX_FAIL(map)) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		/*
		 * If the task has requested executable lockdown,
		 * deny both:
		 * - adding executable protections OR
		 * - adding write protections to an existing executable mapping.
		 */
		if (map->map_disallow_new_exec == TRUE) {
			if ((new_prot & VM_PROT_ALLEXEC) ||
			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		prev = current->vme_end;
		current = current->vme_next;
	}

#if __arm64__
	if (end > prev &&
	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
		vm_map_entry_t prev_entry;

		prev_entry = current->vme_prev;
		if (prev_entry != vm_map_to_entry(map) &&
		    !prev_entry->map_aligned &&
		    (vm_map_round_page(prev_entry->vme_end,
		    VM_MAP_PAGE_MASK(map))
		    == end)) {
			/*
			 * The last entry in our range is not "map-aligned"
			 * but it would have reached all the way to "end"
			 * if it had been map-aligned, so this is not really
			 * a hole in the range and we can proceed.
			 */
			prev = end;
		}
	}
#endif /* __arm64__ */

	/* The walk must have covered the whole range, else it has a hole. */
	if (end > prev) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 *	Go back and fix up protections.
	 *	Clip to start here if the range starts within
	 *	the entry.
	 */

	current = entry;
	if (current != vm_map_to_entry(map)) {
		/* clip and unnest if necessary */
		vm_map_clip_start(map, current, start);
	}

	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		vm_prot_t       old_prot;

		vm_map_clip_end(map, current, end);

		if (current->is_sub_map) {
			/* clipping did unnest if needed */
			assert(!current->use_pmap);
		}

		old_prot = current->protection;

		if (set_max) {
			current->max_protection = new_prot;
			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
			current->protection = (new_prot & old_prot);
		} else {
			current->protection = new_prot;
		}

		/*
		 *	Update physical map if necessary.
		 *	If the request is to turn off write protection,
		 *	we won't do it for real (in pmap). This is because
		 *	it would cause copy-on-write to fail.  We've already
		 *	set the new protection in the map, so if a
		 *	write-protect fault occurred, it will be fixed up
		 *	properly, COW or not.
		 */
		if (current->protection != old_prot) {
			/* Look one level in if we support nested pmaps */
			/* from mapped submaps which are direct entries */
			/* in our map */

			vm_prot_t prot;

			prot = current->protection;
			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
				/* see the COW rationale in the comment above */
				prot &= ~VM_PROT_WRITE;
			} else {
				assert(!VME_OBJECT(current)->code_signed);
				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
			}

			/* Honor a per-alias "allow execute" override, if any. */
			if (override_nx(map, VME_ALIAS(current)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}

#if DEVELOPMENT || DEBUG
			if (!(old_prot & VM_PROT_EXECUTE) &&
			    (prot & VM_PROT_EXECUTE) &&
			    panic_on_unsigned_execute &&
			    (proc_selfcsflags() & CS_KILL)) {
				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
			}
#endif /* DEVELOPMENT || DEBUG */

			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
				/* prot-policy entries must not be wired (unwirable via fault). */
				if (current->wired_count) {
					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
				}

				/* If the pmap layer cares about this
				 * protection type, force a fault for
				 * each page so that vm_fault will
				 * repopulate the page with the full
				 * set of protections.
				 */
				/*
				 * TODO: We don't seem to need this,
				 * but this is due to an internal
				 * implementation detail of
				 * pmap_protect.  Do we want to rely
				 * on this?
				 */
				prot = VM_PROT_NONE;
			}

			if (current->is_sub_map && current->use_pmap) {
				/* Nested submap: apply to the submap's own pmap. */
				pmap_protect(VME_SUBMAP(current)->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot);
			} else {
				if (prot & VM_PROT_WRITE) {
					if (VME_OBJECT(current) == compressor_object) {
						/*
						 * For write requests on the
						 * compressor, we will ask the
						 * pmap layer to prevent us from
						 * taking a write fault when we
						 * attempt to access the mapping
						 * next.
						 */
						pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
					}
				}

				pmap_protect_options(map->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot,
				    pmap_options,
				    NULL);
			}
		}
		current = current->vme_next;
	}

	/*
	 * Final pass: try to coalesce entries left adjacent and
	 * identical by the clipping/protection changes above.
	 */
	current = entry;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start <= end)) {
		vm_map_simplify_entry(map, current);
		current = current->vme_next;
	}

	vm_map_unlock(map);
	return KERN_SUCCESS;
}
6135 
6136 /*
6137  *	vm_map_inherit:
6138  *
6139  *	Sets the inheritance of the specified address
6140  *	range in the target map.  Inheritance
6141  *	affects how the map will be shared with
6142  *	child maps at the time of vm_map_fork.
6143  */
6144 kern_return_t
vm_map_inherit(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_inherit_t new_inheritance)6145 vm_map_inherit(
6146 	vm_map_t        map,
6147 	vm_map_offset_t start,
6148 	vm_map_offset_t end,
6149 	vm_inherit_t    new_inheritance)
6150 {
6151 	vm_map_entry_t  entry;
6152 	vm_map_entry_t  temp_entry;
6153 
6154 	vm_map_lock(map);
6155 
6156 	VM_MAP_RANGE_CHECK(map, start, end);
6157 
6158 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
6159 		entry = temp_entry;
6160 	} else {
6161 		temp_entry = temp_entry->vme_next;
6162 		entry = temp_entry;
6163 	}
6164 
6165 	/* first check entire range for submaps which can't support the */
6166 	/* given inheritance. */
6167 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6168 		if (entry->is_sub_map) {
6169 			if (new_inheritance == VM_INHERIT_COPY) {
6170 				vm_map_unlock(map);
6171 				return KERN_INVALID_ARGUMENT;
6172 			}
6173 		}
6174 
6175 		entry = entry->vme_next;
6176 	}
6177 
6178 	entry = temp_entry;
6179 	if (entry != vm_map_to_entry(map)) {
6180 		/* clip and unnest if necessary */
6181 		vm_map_clip_start(map, entry, start);
6182 	}
6183 
6184 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6185 		vm_map_clip_end(map, entry, end);
6186 		if (entry->is_sub_map) {
6187 			/* clip did unnest if needed */
6188 			assert(!entry->use_pmap);
6189 		}
6190 
6191 		entry->inheritance = new_inheritance;
6192 
6193 		entry = entry->vme_next;
6194 	}
6195 
6196 	vm_map_unlock(map);
6197 	return KERN_SUCCESS;
6198 }
6199 
6200 /*
6201  * Update the accounting for the amount of wired memory in this map.  If the user has
6202  * exceeded the defined limits, then we fail.  Wiring on behalf of the kernel never fails.
6203  */
6204 
6205 static kern_return_t
add_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6206 add_wire_counts(
6207 	vm_map_t        map,
6208 	vm_map_entry_t  entry,
6209 	boolean_t       user_wire)
6210 {
6211 	vm_map_size_t   size;
6212 
6213 	if (user_wire) {
6214 		unsigned int total_wire_count =  vm_page_wire_count + vm_lopage_free_count;
6215 
6216 		/*
6217 		 * We're wiring memory at the request of the user.  Check if this is the first time the user is wiring
6218 		 * this map entry.
6219 		 */
6220 
6221 		if (entry->user_wired_count == 0) {
6222 			size = entry->vme_end - entry->vme_start;
6223 
6224 			/*
6225 			 * Since this is the first time the user is wiring this map entry, check to see if we're
6226 			 * exceeding the user wire limits.  There is a per map limit which is the smaller of either
6227 			 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value.  There is also
6228 			 * a system-wide limit on the amount of memory all users can wire.  If the user is over either
6229 			 * limit, then we fail.
6230 			 */
6231 
6232 			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6233 			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6234 				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6235 #if DEVELOPMENT || DEBUG
6236 					if (panic_on_mlock_failure) {
6237 						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6238 					}
6239 #endif /* DEVELOPMENT || DEBUG */
6240 					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6241 				} else {
6242 					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6243 #if DEVELOPMENT || DEBUG
6244 					if (panic_on_mlock_failure) {
6245 						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6246 					}
6247 #endif /* DEVELOPMENT || DEBUG */
6248 				}
6249 				return KERN_RESOURCE_SHORTAGE;
6250 			}
6251 
6252 			/*
6253 			 * The first time the user wires an entry, we also increment the wired_count and add this to
6254 			 * the total that has been wired in the map.
6255 			 */
6256 
6257 			if (entry->wired_count >= MAX_WIRE_COUNT) {
6258 				return KERN_FAILURE;
6259 			}
6260 
6261 			entry->wired_count++;
6262 			map->user_wire_size += size;
6263 		}
6264 
6265 		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6266 			return KERN_FAILURE;
6267 		}
6268 
6269 		entry->user_wired_count++;
6270 	} else {
6271 		/*
6272 		 * The kernel's wiring the memory.  Just bump the count and continue.
6273 		 */
6274 
6275 		if (entry->wired_count >= MAX_WIRE_COUNT) {
6276 			panic("vm_map_wire: too many wirings");
6277 		}
6278 
6279 		entry->wired_count++;
6280 	}
6281 
6282 	return KERN_SUCCESS;
6283 }
6284 
6285 /*
6286  * Update the memory wiring accounting now that the given map entry is being unwired.
6287  */
6288 
6289 static void
subtract_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6290 subtract_wire_counts(
6291 	vm_map_t        map,
6292 	vm_map_entry_t  entry,
6293 	boolean_t       user_wire)
6294 {
6295 	if (user_wire) {
6296 		/*
6297 		 * We're unwiring memory at the request of the user.  See if we're removing the last user wire reference.
6298 		 */
6299 
6300 		if (entry->user_wired_count == 1) {
6301 			/*
6302 			 * We're removing the last user wire reference.  Decrement the wired_count and the total
6303 			 * user wired memory for this map.
6304 			 */
6305 
6306 			assert(entry->wired_count >= 1);
6307 			entry->wired_count--;
6308 			map->user_wire_size -= entry->vme_end - entry->vme_start;
6309 		}
6310 
6311 		assert(entry->user_wired_count >= 1);
6312 		entry->user_wired_count--;
6313 	} else {
6314 		/*
6315 		 * The kernel is unwiring the memory.   Just update the count.
6316 		 */
6317 
6318 		assert(entry->wired_count >= 1);
6319 		entry->wired_count--;
6320 	}
6321 }
6322 
/*
 * NOTE(review): appears to be a diagnostic counter/flag related to wiring
 * of executable memory (name suggests code-signing context); its readers
 * and writers are not visible in this chunk -- confirm before relying on
 * this description.
 */
int cs_executable_wire = 0;
6324 
6325 /*
6326  *	vm_map_wire:
6327  *
6328  *	Sets the pageability of the specified address range in the
6329  *	target map as wired.  Regions specified as not pageable require
6330  *	locked-down physical memory and physical page maps.  The
6331  *	access_type variable indicates types of accesses that must not
6332  *	generate page faults.  This is checked against protection of
6333  *	memory being locked-down.
6334  *
6335  *	The map must not be locked, but a reference must remain to the
6336  *	map throughout the call.
6337  */
6338 static kern_return_t
vm_map_wire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr,ppnum_t * physpage_p)6339 vm_map_wire_nested(
6340 	vm_map_t                map,
6341 	vm_map_offset_t         start,
6342 	vm_map_offset_t         end,
6343 	vm_prot_t               caller_prot,
6344 	vm_tag_t                tag,
6345 	boolean_t               user_wire,
6346 	pmap_t                  map_pmap,
6347 	vm_map_offset_t         pmap_addr,
6348 	ppnum_t                 *physpage_p)
6349 {
6350 	vm_map_entry_t          entry;
6351 	vm_prot_t               access_type;
6352 	struct vm_map_entry     *first_entry, tmp_entry;
6353 	vm_map_t                real_map;
6354 	vm_map_offset_t         s, e;
6355 	kern_return_t           rc;
6356 	boolean_t               need_wakeup;
6357 	boolean_t               main_map = FALSE;
6358 	wait_interrupt_t        interruptible_state;
6359 	thread_t                cur_thread;
6360 	unsigned int            last_timestamp;
6361 	vm_map_size_t           size;
6362 	boolean_t               wire_and_extract;
6363 	vm_prot_t               extra_prots;
6364 
6365 	extra_prots = VM_PROT_COPY;
6366 	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6367 #if XNU_TARGET_OS_OSX
6368 	if (map->pmap == kernel_pmap ||
6369 	    !vm_map_cs_enforcement(map)) {
6370 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6371 	}
6372 #endif /* XNU_TARGET_OS_OSX */
6373 
6374 	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6375 
6376 	wire_and_extract = FALSE;
6377 	if (physpage_p != NULL) {
6378 		/*
6379 		 * The caller wants the physical page number of the
6380 		 * wired page.  We return only one physical page number
6381 		 * so this works for only one page at a time.
6382 		 */
6383 		if ((end - start) != PAGE_SIZE) {
6384 			return KERN_INVALID_ARGUMENT;
6385 		}
6386 		wire_and_extract = TRUE;
6387 		*physpage_p = 0;
6388 	}
6389 
6390 	vm_map_lock(map);
6391 	if (map_pmap == NULL) {
6392 		main_map = TRUE;
6393 	}
6394 	last_timestamp = map->timestamp;
6395 
6396 	VM_MAP_RANGE_CHECK(map, start, end);
6397 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6398 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6399 
6400 	if (start == end) {
6401 		/* We wired what the caller asked for, zero pages */
6402 		vm_map_unlock(map);
6403 		return KERN_SUCCESS;
6404 	}
6405 
6406 	need_wakeup = FALSE;
6407 	cur_thread = current_thread();
6408 
6409 	s = start;
6410 	rc = KERN_SUCCESS;
6411 
6412 	if (vm_map_lookup_entry(map, s, &first_entry)) {
6413 		entry = first_entry;
6414 		/*
6415 		 * vm_map_clip_start will be done later.
6416 		 * We don't want to unnest any nested submaps here !
6417 		 */
6418 	} else {
6419 		/* Start address is not in map */
6420 		rc = KERN_INVALID_ADDRESS;
6421 		goto done;
6422 	}
6423 
6424 	while ((entry != vm_map_to_entry(map)) && (s < end)) {
6425 		/*
6426 		 * At this point, we have wired from "start" to "s".
6427 		 * We still need to wire from "s" to "end".
6428 		 *
6429 		 * "entry" hasn't been clipped, so it could start before "s"
6430 		 * and/or end after "end".
6431 		 */
6432 
6433 		/* "e" is how far we want to wire in this entry */
6434 		e = entry->vme_end;
6435 		if (e > end) {
6436 			e = end;
6437 		}
6438 
6439 		/*
6440 		 * If another thread is wiring/unwiring this entry then
6441 		 * block after informing other thread to wake us up.
6442 		 */
6443 		if (entry->in_transition) {
6444 			wait_result_t wait_result;
6445 
6446 			/*
6447 			 * We have not clipped the entry.  Make sure that
6448 			 * the start address is in range so that the lookup
6449 			 * below will succeed.
6450 			 * "s" is the current starting point: we've already
6451 			 * wired from "start" to "s" and we still have
6452 			 * to wire from "s" to "end".
6453 			 */
6454 
6455 			entry->needs_wakeup = TRUE;
6456 
6457 			/*
6458 			 * wake up anybody waiting on entries that we have
6459 			 * already wired.
6460 			 */
6461 			if (need_wakeup) {
6462 				vm_map_entry_wakeup(map);
6463 				need_wakeup = FALSE;
6464 			}
6465 			/*
6466 			 * User wiring is interruptible
6467 			 */
6468 			wait_result = vm_map_entry_wait(map,
6469 			    (user_wire) ? THREAD_ABORTSAFE :
6470 			    THREAD_UNINT);
6471 			if (user_wire && wait_result == THREAD_INTERRUPTED) {
6472 				/*
6473 				 * undo the wirings we have done so far
6474 				 * We do not clear the needs_wakeup flag,
6475 				 * because we cannot tell if we were the
6476 				 * only one waiting.
6477 				 */
6478 				rc = KERN_FAILURE;
6479 				goto done;
6480 			}
6481 
6482 			/*
6483 			 * Cannot avoid a lookup here. reset timestamp.
6484 			 */
6485 			last_timestamp = map->timestamp;
6486 
6487 			/*
6488 			 * The entry could have been clipped, look it up again.
6489 			 * Worse that can happen is, it may not exist anymore.
6490 			 */
6491 			if (!vm_map_lookup_entry(map, s, &first_entry)) {
6492 				/*
6493 				 * User: undo everything upto the previous
6494 				 * entry.  let vm_map_unwire worry about
6495 				 * checking the validity of the range.
6496 				 */
6497 				rc = KERN_FAILURE;
6498 				goto done;
6499 			}
6500 			entry = first_entry;
6501 			continue;
6502 		}
6503 
6504 		if (entry->is_sub_map) {
6505 			vm_map_offset_t sub_start;
6506 			vm_map_offset_t sub_end;
6507 			vm_map_offset_t local_start;
6508 			vm_map_offset_t local_end;
6509 			pmap_t          pmap;
6510 
6511 			if (wire_and_extract) {
6512 				/*
6513 				 * Wiring would result in copy-on-write
6514 				 * which would not be compatible with
6515 				 * the sharing we have with the original
6516 				 * provider of this memory.
6517 				 */
6518 				rc = KERN_INVALID_ARGUMENT;
6519 				goto done;
6520 			}
6521 
6522 			vm_map_clip_start(map, entry, s);
6523 			vm_map_clip_end(map, entry, end);
6524 
6525 			sub_start = VME_OFFSET(entry);
6526 			sub_end = entry->vme_end;
6527 			sub_end += VME_OFFSET(entry) - entry->vme_start;
6528 
6529 			local_end = entry->vme_end;
6530 			if (map_pmap == NULL) {
6531 				vm_object_t             object;
6532 				vm_object_offset_t      offset;
6533 				vm_prot_t               prot;
6534 				boolean_t               wired;
6535 				vm_map_entry_t          local_entry;
6536 				vm_map_version_t         version;
6537 				vm_map_t                lookup_map;
6538 
6539 				if (entry->use_pmap) {
6540 					pmap = VME_SUBMAP(entry)->pmap;
6541 					/* ppc implementation requires that */
6542 					/* submaps pmap address ranges line */
6543 					/* up with parent map */
6544 #ifdef notdef
6545 					pmap_addr = sub_start;
6546 #endif
6547 					pmap_addr = s;
6548 				} else {
6549 					pmap = map->pmap;
6550 					pmap_addr = s;
6551 				}
6552 
6553 				if (entry->wired_count) {
6554 					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6555 						goto done;
6556 					}
6557 
6558 					/*
6559 					 * The map was not unlocked:
6560 					 * no need to goto re-lookup.
6561 					 * Just go directly to next entry.
6562 					 */
6563 					entry = entry->vme_next;
6564 					s = entry->vme_start;
6565 					continue;
6566 				}
6567 
6568 				/* call vm_map_lookup_locked to */
6569 				/* cause any needs copy to be   */
6570 				/* evaluated */
6571 				local_start = entry->vme_start;
6572 				lookup_map = map;
6573 				vm_map_lock_write_to_read(map);
6574 				rc = vm_map_lookup_locked(
6575 					&lookup_map, local_start,
6576 					(access_type | extra_prots),
6577 					OBJECT_LOCK_EXCLUSIVE,
6578 					&version, &object,
6579 					&offset, &prot, &wired,
6580 					NULL,
6581 					&real_map, NULL);
6582 				if (rc != KERN_SUCCESS) {
6583 					vm_map_unlock_read(lookup_map);
6584 					assert(map_pmap == NULL);
6585 					vm_map_unwire(map, start,
6586 					    s, user_wire);
6587 					return rc;
6588 				}
6589 				vm_object_unlock(object);
6590 				if (real_map != lookup_map) {
6591 					vm_map_unlock(real_map);
6592 				}
6593 				vm_map_unlock_read(lookup_map);
6594 				vm_map_lock(map);
6595 
6596 				/* we unlocked, so must re-lookup */
6597 				if (!vm_map_lookup_entry(map,
6598 				    local_start,
6599 				    &local_entry)) {
6600 					rc = KERN_FAILURE;
6601 					goto done;
6602 				}
6603 
6604 				/*
6605 				 * entry could have been "simplified",
6606 				 * so re-clip
6607 				 */
6608 				entry = local_entry;
6609 				assert(s == local_start);
6610 				vm_map_clip_start(map, entry, s);
6611 				vm_map_clip_end(map, entry, end);
6612 				/* re-compute "e" */
6613 				e = entry->vme_end;
6614 				if (e > end) {
6615 					e = end;
6616 				}
6617 
6618 				/* did we have a change of type? */
6619 				if (!entry->is_sub_map) {
6620 					last_timestamp = map->timestamp;
6621 					continue;
6622 				}
6623 			} else {
6624 				local_start = entry->vme_start;
6625 				pmap = map_pmap;
6626 			}
6627 
6628 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6629 				goto done;
6630 			}
6631 
6632 			entry->in_transition = TRUE;
6633 
6634 			vm_map_unlock(map);
6635 			rc = vm_map_wire_nested(VME_SUBMAP(entry),
6636 			    sub_start, sub_end,
6637 			    caller_prot, tag,
6638 			    user_wire, pmap, pmap_addr,
6639 			    NULL);
6640 			vm_map_lock(map);
6641 
6642 			/*
6643 			 * Find the entry again.  It could have been clipped
6644 			 * after we unlocked the map.
6645 			 */
6646 			if (!vm_map_lookup_entry(map, local_start,
6647 			    &first_entry)) {
6648 				panic("vm_map_wire: re-lookup failed");
6649 			}
6650 			entry = first_entry;
6651 
6652 			assert(local_start == s);
6653 			/* re-compute "e" */
6654 			e = entry->vme_end;
6655 			if (e > end) {
6656 				e = end;
6657 			}
6658 
6659 			last_timestamp = map->timestamp;
6660 			while ((entry != vm_map_to_entry(map)) &&
6661 			    (entry->vme_start < e)) {
6662 				assert(entry->in_transition);
6663 				entry->in_transition = FALSE;
6664 				if (entry->needs_wakeup) {
6665 					entry->needs_wakeup = FALSE;
6666 					need_wakeup = TRUE;
6667 				}
6668 				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6669 					subtract_wire_counts(map, entry, user_wire);
6670 				}
6671 				entry = entry->vme_next;
6672 			}
6673 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6674 				goto done;
6675 			}
6676 
6677 			/* no need to relookup again */
6678 			s = entry->vme_start;
6679 			continue;
6680 		}
6681 
6682 		/*
6683 		 * If this entry is already wired then increment
6684 		 * the appropriate wire reference count.
6685 		 */
6686 		if (entry->wired_count) {
6687 			if ((entry->protection & access_type) != access_type) {
6688 				/* found a protection problem */
6689 
6690 				/*
6691 				 * XXX FBDP
6692 				 * We should always return an error
6693 				 * in this case but since we didn't
6694 				 * enforce it before, let's do
6695 				 * it only for the new "wire_and_extract"
6696 				 * code path for now...
6697 				 */
6698 				if (wire_and_extract) {
6699 					rc = KERN_PROTECTION_FAILURE;
6700 					goto done;
6701 				}
6702 			}
6703 
6704 			/*
6705 			 * entry is already wired down, get our reference
6706 			 * after clipping to our range.
6707 			 */
6708 			vm_map_clip_start(map, entry, s);
6709 			vm_map_clip_end(map, entry, end);
6710 
6711 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6712 				goto done;
6713 			}
6714 
6715 			if (wire_and_extract) {
6716 				vm_object_t             object;
6717 				vm_object_offset_t      offset;
6718 				vm_page_t               m;
6719 
6720 				/*
6721 				 * We don't have to "wire" the page again
6722 				 * bit we still have to "extract" its
6723 				 * physical page number, after some sanity
6724 				 * checks.
6725 				 */
6726 				assert((entry->vme_end - entry->vme_start)
6727 				    == PAGE_SIZE);
6728 				assert(!entry->needs_copy);
6729 				assert(!entry->is_sub_map);
6730 				assert(VME_OBJECT(entry));
6731 				if (((entry->vme_end - entry->vme_start)
6732 				    != PAGE_SIZE) ||
6733 				    entry->needs_copy ||
6734 				    entry->is_sub_map ||
6735 				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
6736 					rc = KERN_INVALID_ARGUMENT;
6737 					goto done;
6738 				}
6739 
6740 				object = VME_OBJECT(entry);
6741 				offset = VME_OFFSET(entry);
6742 				/* need exclusive lock to update m->dirty */
6743 				if (entry->protection & VM_PROT_WRITE) {
6744 					vm_object_lock(object);
6745 				} else {
6746 					vm_object_lock_shared(object);
6747 				}
6748 				m = vm_page_lookup(object, offset);
6749 				assert(m != VM_PAGE_NULL);
6750 				assert(VM_PAGE_WIRED(m));
6751 				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6752 					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6753 					if (entry->protection & VM_PROT_WRITE) {
6754 						vm_object_lock_assert_exclusive(
6755 							object);
6756 						m->vmp_dirty = TRUE;
6757 					}
6758 				} else {
6759 					/* not already wired !? */
6760 					*physpage_p = 0;
6761 				}
6762 				vm_object_unlock(object);
6763 			}
6764 
6765 			/* map was not unlocked: no need to relookup */
6766 			entry = entry->vme_next;
6767 			s = entry->vme_start;
6768 			continue;
6769 		}
6770 
6771 		/*
6772 		 * Unwired entry or wire request transmitted via submap
6773 		 */
6774 
6775 		/*
6776 		 * Wiring would copy the pages to the shadow object.
6777 		 * The shadow object would not be code-signed so
6778 		 * attempting to execute code from these copied pages
6779 		 * would trigger a code-signing violation.
6780 		 */
6781 
6782 		if ((entry->protection & VM_PROT_EXECUTE)
6783 #if XNU_TARGET_OS_OSX
6784 		    &&
6785 		    map->pmap != kernel_pmap &&
6786 		    (vm_map_cs_enforcement(map)
6787 #if __arm64__
6788 		    || !VM_MAP_IS_EXOTIC(map)
6789 #endif /* __arm64__ */
6790 		    )
6791 #endif /* XNU_TARGET_OS_OSX */
6792 		    ) {
6793 #if MACH_ASSERT
6794 			printf("pid %d[%s] wiring executable range from "
6795 			    "0x%llx to 0x%llx: rejected to preserve "
6796 			    "code-signing\n",
6797 			    proc_selfpid(),
6798 			    (current_task()->bsd_info
6799 			    ? proc_name_address(current_task()->bsd_info)
6800 			    : "?"),
6801 			    (uint64_t) entry->vme_start,
6802 			    (uint64_t) entry->vme_end);
6803 #endif /* MACH_ASSERT */
6804 			DTRACE_VM2(cs_executable_wire,
6805 			    uint64_t, (uint64_t)entry->vme_start,
6806 			    uint64_t, (uint64_t)entry->vme_end);
6807 			cs_executable_wire++;
6808 			rc = KERN_PROTECTION_FAILURE;
6809 			goto done;
6810 		}
6811 
6812 		/*
6813 		 * Perform actions of vm_map_lookup that need the write
6814 		 * lock on the map: create a shadow object for a
6815 		 * copy-on-write region, or an object for a zero-fill
6816 		 * region.
6817 		 */
6818 		size = entry->vme_end - entry->vme_start;
6819 		/*
6820 		 * If wiring a copy-on-write page, we need to copy it now
6821 		 * even if we're only (currently) requesting read access.
6822 		 * This is aggressive, but once it's wired we can't move it.
6823 		 */
6824 		if (entry->needs_copy) {
6825 			if (wire_and_extract) {
6826 				/*
6827 				 * We're supposed to share with the original
6828 				 * provider so should not be "needs_copy"
6829 				 */
6830 				rc = KERN_INVALID_ARGUMENT;
6831 				goto done;
6832 			}
6833 
6834 			VME_OBJECT_SHADOW(entry, size);
6835 			entry->needs_copy = FALSE;
6836 		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6837 			if (wire_and_extract) {
6838 				/*
6839 				 * We're supposed to share with the original
6840 				 * provider so should already have an object.
6841 				 */
6842 				rc = KERN_INVALID_ARGUMENT;
6843 				goto done;
6844 			}
6845 			VME_OBJECT_SET(entry, vm_object_allocate(size));
6846 			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6847 			assert(entry->use_pmap);
6848 		}
6849 
6850 		vm_map_clip_start(map, entry, s);
6851 		vm_map_clip_end(map, entry, end);
6852 
6853 		/* re-compute "e" */
6854 		e = entry->vme_end;
6855 		if (e > end) {
6856 			e = end;
6857 		}
6858 
6859 		/*
6860 		 * Check for holes and protection mismatch.
6861 		 * Holes: Next entry should be contiguous unless this
6862 		 *	  is the end of the region.
6863 		 * Protection: Access requested must be allowed, unless
6864 		 *	wiring is by protection class
6865 		 */
6866 		if ((entry->vme_end < end) &&
6867 		    ((entry->vme_next == vm_map_to_entry(map)) ||
6868 		    (entry->vme_next->vme_start > entry->vme_end))) {
6869 			/* found a hole */
6870 			rc = KERN_INVALID_ADDRESS;
6871 			goto done;
6872 		}
6873 		if ((entry->protection & access_type) != access_type) {
6874 			/* found a protection problem */
6875 			rc = KERN_PROTECTION_FAILURE;
6876 			goto done;
6877 		}
6878 
6879 		assert(entry->wired_count == 0 && entry->user_wired_count == 0);
6880 
6881 		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6882 			goto done;
6883 		}
6884 
6885 		entry->in_transition = TRUE;
6886 
6887 		/*
6888 		 * This entry might get split once we unlock the map.
6889 		 * In vm_fault_wire(), we need the current range as
6890 		 * defined by this entry.  In order for this to work
6891 		 * along with a simultaneous clip operation, we make a
6892 		 * temporary copy of this entry and use that for the
6893 		 * wiring.  Note that the underlying objects do not
6894 		 * change during a clip.
6895 		 */
6896 		tmp_entry = *entry;
6897 
6898 		/*
6899 		 * The in_transition state guarentees that the entry
6900 		 * (or entries for this range, if split occured) will be
6901 		 * there when the map lock is acquired for the second time.
6902 		 */
6903 		vm_map_unlock(map);
6904 
6905 		if (!user_wire && cur_thread != THREAD_NULL) {
6906 			interruptible_state = thread_interrupt_level(THREAD_UNINT);
6907 		} else {
6908 			interruptible_state = THREAD_UNINT;
6909 		}
6910 
6911 		if (map_pmap) {
6912 			rc = vm_fault_wire(map,
6913 			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
6914 			    physpage_p);
6915 		} else {
6916 			rc = vm_fault_wire(map,
6917 			    &tmp_entry, caller_prot, tag, map->pmap,
6918 			    tmp_entry.vme_start,
6919 			    physpage_p);
6920 		}
6921 
6922 		if (!user_wire && cur_thread != THREAD_NULL) {
6923 			thread_interrupt_level(interruptible_state);
6924 		}
6925 
6926 		vm_map_lock(map);
6927 
6928 		if (last_timestamp + 1 != map->timestamp) {
6929 			/*
6930 			 * Find the entry again.  It could have been clipped
6931 			 * after we unlocked the map.
6932 			 */
6933 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
6934 			    &first_entry)) {
6935 				panic("vm_map_wire: re-lookup failed");
6936 			}
6937 
6938 			entry = first_entry;
6939 		}
6940 
6941 		last_timestamp = map->timestamp;
6942 
6943 		while ((entry != vm_map_to_entry(map)) &&
6944 		    (entry->vme_start < tmp_entry.vme_end)) {
6945 			assert(entry->in_transition);
6946 			entry->in_transition = FALSE;
6947 			if (entry->needs_wakeup) {
6948 				entry->needs_wakeup = FALSE;
6949 				need_wakeup = TRUE;
6950 			}
6951 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6952 				subtract_wire_counts(map, entry, user_wire);
6953 			}
6954 			entry = entry->vme_next;
6955 		}
6956 
6957 		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
6958 			goto done;
6959 		}
6960 
6961 		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
6962 		    (tmp_entry.vme_end != end) &&    /* AND, we are not at the end of the requested range */
6963 		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
6964 			/* found a "new" hole */
6965 			s = tmp_entry.vme_end;
6966 			rc = KERN_INVALID_ADDRESS;
6967 			goto done;
6968 		}
6969 
6970 		s = entry->vme_start;
6971 	} /* end while loop through map entries */
6972 
6973 done:
6974 	if (rc == KERN_SUCCESS) {
6975 		/* repair any damage we may have made to the VM map */
6976 		vm_map_simplify_range(map, start, end);
6977 	}
6978 
6979 	vm_map_unlock(map);
6980 
6981 	/*
6982 	 * wake up anybody waiting on entries we wired.
6983 	 */
6984 	if (need_wakeup) {
6985 		vm_map_entry_wakeup(map);
6986 	}
6987 
6988 	if (rc != KERN_SUCCESS) {
6989 		/* undo what has been wired so far */
6990 		vm_map_unwire_nested(map, start, s, user_wire,
6991 		    map_pmap, pmap_addr);
6992 		if (physpage_p) {
6993 			*physpage_p = 0;
6994 		}
6995 	}
6996 
6997 	return rc;
6998 }
6999 
7000 kern_return_t
vm_map_wire_external(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,boolean_t user_wire)7001 vm_map_wire_external(
7002 	vm_map_t                map,
7003 	vm_map_offset_t         start,
7004 	vm_map_offset_t         end,
7005 	vm_prot_t               caller_prot,
7006 	boolean_t               user_wire)
7007 {
7008 	kern_return_t   kret;
7009 
7010 	kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(),
7011 	    user_wire, (pmap_t)NULL, 0, NULL);
7012 	return kret;
7013 }
7014 
7015 kern_return_t
vm_map_wire_kernel(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire)7016 vm_map_wire_kernel(
7017 	vm_map_t                map,
7018 	vm_map_offset_t         start,
7019 	vm_map_offset_t         end,
7020 	vm_prot_t               caller_prot,
7021 	vm_tag_t                tag,
7022 	boolean_t               user_wire)
7023 {
7024 	kern_return_t   kret;
7025 
7026 	kret = vm_map_wire_nested(map, start, end, caller_prot, tag,
7027 	    user_wire, (pmap_t)NULL, 0, NULL);
7028 	return kret;
7029 }
7030 
7031 kern_return_t
vm_map_wire_and_extract_external(vm_map_t map,vm_map_offset_t start,vm_prot_t caller_prot,boolean_t user_wire,ppnum_t * physpage_p)7032 vm_map_wire_and_extract_external(
7033 	vm_map_t        map,
7034 	vm_map_offset_t start,
7035 	vm_prot_t       caller_prot,
7036 	boolean_t       user_wire,
7037 	ppnum_t         *physpage_p)
7038 {
7039 	kern_return_t   kret;
7040 
7041 	kret = vm_map_wire_nested(map,
7042 	    start,
7043 	    start + VM_MAP_PAGE_SIZE(map),
7044 	    caller_prot,
7045 	    vm_tag_bt(),
7046 	    user_wire,
7047 	    (pmap_t)NULL,
7048 	    0,
7049 	    physpage_p);
7050 	if (kret != KERN_SUCCESS &&
7051 	    physpage_p != NULL) {
7052 		*physpage_p = 0;
7053 	}
7054 	return kret;
7055 }
7056 
7057 kern_return_t
vm_map_wire_and_extract_kernel(vm_map_t map,vm_map_offset_t start,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire,ppnum_t * physpage_p)7058 vm_map_wire_and_extract_kernel(
7059 	vm_map_t        map,
7060 	vm_map_offset_t start,
7061 	vm_prot_t       caller_prot,
7062 	vm_tag_t        tag,
7063 	boolean_t       user_wire,
7064 	ppnum_t         *physpage_p)
7065 {
7066 	kern_return_t   kret;
7067 
7068 	kret = vm_map_wire_nested(map,
7069 	    start,
7070 	    start + VM_MAP_PAGE_SIZE(map),
7071 	    caller_prot,
7072 	    tag,
7073 	    user_wire,
7074 	    (pmap_t)NULL,
7075 	    0,
7076 	    physpage_p);
7077 	if (kret != KERN_SUCCESS &&
7078 	    physpage_p != NULL) {
7079 		*physpage_p = 0;
7080 	}
7081 	return kret;
7082 }
7083 
7084 /*
7085  *	vm_map_unwire:
7086  *
7087  *	Sets the pageability of the specified address range in the target
7088  *	as pageable.  Regions specified must have been wired previously.
7089  *
7090  *	The map must not be locked, but a reference must remain to the map
7091  *	throughout the call.
7092  *
7093  *	Kernel will panic on failures.  User unwire ignores holes and
7094  *	unwired and intransition entries to avoid losing memory by leaving
7095  *	it unwired.
7096  */
/*
 * vm_map_unwire_nested:
 *
 * Core of vm_map_unwire().  When "map_pmap" is non-NULL, the physical
 * unwiring (vm_fault_unwire) is performed in that pmap at "pmap_addr"
 * -- this is the path used when recursing through submap entries.
 * When NULL, each entry is unwired in the map's own pmap (or in a
 * use_pmap submap's pmap for nested submap entries).
 *
 * "user_wire" selects the forgiving user semantics: holes, unwired
 * and in_transition entries are skipped instead of panicking.
 */
static kern_return_t
vm_map_unwire_nested(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	boolean_t               user_wire,
	pmap_t                  map_pmap,
	vm_map_offset_t         pmap_addr)
{
	vm_map_entry_t          entry;
	struct vm_map_entry     *first_entry, tmp_entry;
	boolean_t               need_wakeup;
	boolean_t               main_map = FALSE;
	unsigned int            last_timestamp;

	vm_map_lock(map);
	if (map_pmap == NULL) {
		/* NOTE(review): "main_map" is never read below -- looks vestigial */
		main_map = TRUE;
	}
	/*
	 * Snapshot the map's timestamp; whenever we drop the map lock we
	 * compare against it to detect concurrent clipping/deletion and
	 * re-lookup the entry if anything changed.
	 */
	last_timestamp = map->timestamp;

	VM_MAP_RANGE_CHECK(map, start, end);
	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));

	if (start == end) {
		/* We unwired what the caller asked for: zero pages */
		vm_map_unlock(map);
		return KERN_SUCCESS;
	}

	if (vm_map_lookup_entry(map, start, &first_entry)) {
		entry = first_entry;
		/*
		 * vm_map_clip_start will be done later.
		 * We don't want to unnest any nested sub maps here !
		 */
	} else {
		if (!user_wire) {
			panic("vm_map_unwire: start not found");
		}
		/*	Start address is not in map. */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	if (entry->superpage_size) {
		/* superpages are always wired */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	need_wakeup = FALSE;
	/* walk every entry overlapping [start, end) */
	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
		if (entry->in_transition) {
			/*
			 * 1)
			 * Another thread is wiring down this entry. Note
			 * that if it is not for the other thread we would
			 * be unwiring an unwired entry.  This is not
			 * permitted.  If we wait, we will be unwiring memory
			 * we did not wire.
			 *
			 * 2)
			 * Another thread is unwiring this entry.  We did not
			 * have a reference to it, because if we did, this
			 * entry will not be getting unwired now.
			 */
			if (!user_wire) {
				/*
				 * XXX FBDP
				 * This could happen:  there could be some
				 * overlapping vslock/vsunlock operations
				 * going on.
				 * We should probably just wait and retry,
				 * but then we have to be careful that this
				 * entry could get "simplified" after
				 * "in_transition" gets unset and before
				 * we re-lookup the entry, so we would
				 * have to re-clip the entry to avoid
				 * re-unwiring what we have already unwired...
				 * See vm_map_wire_nested().
				 *
				 * Or we could just ignore "in_transition"
				 * here and proceed to decement the wired
				 * count(s) on this entry.  That should be fine
				 * as long as "wired_count" doesn't drop all
				 * the way to 0 (and we should panic if THAT
				 * happens).
				 */
				panic("vm_map_unwire: in_transition entry");
			}

			entry = entry->vme_next;
			continue;
		}

		if (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;
			pmap_t          pmap;

			vm_map_clip_start(map, entry, start);
			vm_map_clip_end(map, entry, end);

			/* translate this entry's range into submap offsets */
			sub_start = VME_OFFSET(entry);
			sub_end = entry->vme_end - entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			if (map_pmap == NULL) {
				/*
				 * Top-level call: pick the pmap for the
				 * recursion -- the submap's own pmap if the
				 * entry uses one, else this map's pmap.
				 */
				if (entry->use_pmap) {
					pmap = VME_SUBMAP(entry)->pmap;
					pmap_addr = sub_start;
				} else {
					pmap = map->pmap;
					pmap_addr = start;
				}
				if (entry->wired_count == 0 ||
				    (user_wire && entry->user_wired_count == 0)) {
					if (!user_wire) {
						panic("vm_map_unwire: entry is unwired");
					}
					entry = entry->vme_next;
					continue;
				}

				/*
				 * Check for holes
				 * Holes: Next entry should be contiguous unless
				 * this is the end of the region.
				 */
				if (((entry->vme_end < end) &&
				    ((entry->vme_next == vm_map_to_entry(map)) ||
				    (entry->vme_next->vme_start
				    > entry->vme_end)))) {
					if (!user_wire) {
						panic("vm_map_unwire: non-contiguous region");
					}
/*
 *                                       entry = entry->vme_next;
 *                                       continue;
 */
				}

				subtract_wire_counts(map, entry, user_wire);

				/* entry still wired by someone else: nothing more to do */
				if (entry->wired_count != 0) {
					entry = entry->vme_next;
					continue;
				}

				entry->in_transition = TRUE;
				tmp_entry = *entry;/* see comment in vm_map_wire() */

				/*
				 * We can unlock the map now. The in_transition state
				 * guarantees existance of the entry.
				 */
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, pmap, pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;

				/*
				 * clear transition bit for all constituent entries
				 * that were in the original entry (saved in
				 * tmp_entry).  Also check for waiters.
				 */
				while ((entry != vm_map_to_entry(map)) &&
				    (entry->vme_start < tmp_entry.vme_end)) {
					assert(entry->in_transition);
					entry->in_transition = FALSE;
					if (entry->needs_wakeup) {
						entry->needs_wakeup = FALSE;
						need_wakeup = TRUE;
					}
					entry = entry->vme_next;
				}
				continue;
			} else {
				/*
				 * Nested call: recurse with the caller-supplied
				 * pmap; wire counts were handled by the caller.
				 */
				tmp_entry = *entry;
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, map_pmap,
				    pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;
			}
		}


		if ((entry->wired_count == 0) ||
		    (user_wire && entry->user_wired_count == 0)) {
			if (!user_wire) {
				panic("vm_map_unwire: entry is unwired");
			}

			entry = entry->vme_next;
			continue;
		}

		assert(entry->wired_count > 0 &&
		    (!user_wire || entry->user_wired_count > 0));

		vm_map_clip_start(map, entry, start);
		vm_map_clip_end(map, entry, end);

		/*
		 * Check for holes
		 * Holes: Next entry should be contiguous unless
		 *	  this is the end of the region.
		 */
		if (((entry->vme_end < end) &&
		    ((entry->vme_next == vm_map_to_entry(map)) ||
		    (entry->vme_next->vme_start > entry->vme_end)))) {
			if (!user_wire) {
				panic("vm_map_unwire: non-contiguous region");
			}
			entry = entry->vme_next;
			continue;
		}

		subtract_wire_counts(map, entry, user_wire);

		/* still wired by another reference: leave its pages wired */
		if (entry->wired_count != 0) {
			entry = entry->vme_next;
			continue;
		}

		if (entry->zero_wired_pages) {
			entry->zero_wired_pages = FALSE;
		}

		entry->in_transition = TRUE;
		tmp_entry = *entry;     /* see comment in vm_map_wire() */

		/*
		 * We can unlock the map now. The in_transition state
		 * guarantees existance of the entry.
		 */
		vm_map_unlock(map);
		if (map_pmap) {
			vm_fault_unwire(map,
			    &tmp_entry, FALSE, map_pmap, pmap_addr);
		} else {
			vm_fault_unwire(map,
			    &tmp_entry, FALSE, map->pmap,
			    tmp_entry.vme_start);
		}
		vm_map_lock(map);

		if (last_timestamp + 1 != map->timestamp) {
			/*
			 * Find the entry again.  It could have been clipped
			 * or deleted after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
			    &first_entry)) {
				if (!user_wire) {
					panic("vm_map_unwire: re-lookup failed");
				}
				entry = first_entry->vme_next;
			} else {
				entry = first_entry;
			}
		}
		last_timestamp = map->timestamp;

		/*
		 * clear transition bit for all constituent entries that
		 * were in the original entry (saved in tmp_entry).  Also
		 * check for waiters.
		 */
		while ((entry != vm_map_to_entry(map)) &&
		    (entry->vme_start < tmp_entry.vme_end)) {
			assert(entry->in_transition);
			entry->in_transition = FALSE;
			if (entry->needs_wakeup) {
				entry->needs_wakeup = FALSE;
				need_wakeup = TRUE;
			}
			entry = entry->vme_next;
		}
	}

	/*
	 * We might have fragmented the address space when we wired this
	 * range of addresses.  Attempt to re-coalesce these VM map entries
	 * with their neighbors now that they're no longer wired.
	 * Under some circumstances, address space fragmentation can
	 * prevent VM object shadow chain collapsing, which can cause
	 * swap space leaks.
	 */
	vm_map_simplify_range(map, start, end);

	vm_map_unlock(map);
	/*
	 * wake up anybody waiting on entries that we have unwired.
	 */
	if (need_wakeup) {
		vm_map_entry_wakeup(map);
	}
	return KERN_SUCCESS;
}
7437 
/*
 * Public entry point: unwire [start, end) in "map".  Passing a NULL
 * map_pmap (and 0 pmap_addr) selects the top-level, non-nested path
 * in vm_map_unwire_nested(), which unwires in the map's own pmap.
 */
kern_return_t
vm_map_unwire(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	boolean_t               user_wire)
{
	return vm_map_unwire_nested(map, start, end,
	           user_wire, (pmap_t)NULL, 0);
}
7448 
7449 
7450 /*
7451  *	vm_map_entry_zap:	[ internal use only ]
7452  *
7453  *	Remove the entry from the target map
7454  *	and put it on a zap list.
7455  */
7456 static void
vm_map_entry_zap(vm_map_t map,vm_map_entry_t entry,vm_map_zap_t zap)7457 vm_map_entry_zap(
7458 	vm_map_t                map,
7459 	vm_map_entry_t          entry,
7460 	vm_map_zap_t            zap)
7461 {
7462 	vm_map_offset_t s, e;
7463 
7464 	s = entry->vme_start;
7465 	e = entry->vme_end;
7466 	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7467 	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7468 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7469 		assert(page_aligned(s));
7470 		assert(page_aligned(e));
7471 	}
7472 	if (entry->map_aligned == TRUE) {
7473 		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7474 		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7475 	}
7476 	assert(entry->wired_count == 0);
7477 	assert(entry->user_wired_count == 0);
7478 	assert(!entry->permanent);
7479 
7480 	vm_map_store_entry_unlink(map, entry);
7481 	map->size -= e - s;
7482 
7483 	vm_map_zap_append(zap, entry);
7484 }
7485 
/*
 * vm_map_submap_pmap_clean:
 *
 * Remove the physical mappings for the range [start, end) of "map"
 * that is backed by "sub_map" starting at submap offset "offset".
 * Walks the submap's entries covering [offset, offset + (end-start)),
 * recursing through nested submaps, and for each leaf entry either:
 *  - removes the mappings object-wide via
 *    vm_object_pmap_protect_options(..., PMAP_OPTIONS_REMOVE) when
 *    the map is mapped in other pmaps (and still referenced), or
 *  - removes them directly from map->pmap with pmap_remove().
 * Takes the submap's lock for reading; caller's locking of "map" is
 * assumed -- TODO confirm against callers.
 */
static void
vm_map_submap_pmap_clean(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_t        sub_map,
	vm_map_offset_t offset)
{
	vm_map_offset_t submap_start;
	vm_map_offset_t submap_end;
	vm_map_size_t   remove_size;
	vm_map_entry_t  entry;

	/* range expressed in submap coordinates */
	submap_end = offset + (end - start);
	submap_start = offset;

	vm_map_lock_read(sub_map);
	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
		/* first entry: trim to the portion overlapping the range */
		remove_size = (entry->vme_end - entry->vme_start);
		if (offset > entry->vme_start) {
			remove_size -= offset - entry->vme_start;
		}


		if (submap_end < entry->vme_end) {
			remove_size -=
			    entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			/* nested submap: recurse one level deeper */
			vm_map_submap_pmap_clean(
				sub_map,
				start,
				start + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				/*
				 * Map is shared with other pmaps: remove the
				 * mappings through the backing object so all
				 * pmaps are covered.
				 */
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					(VME_OFFSET(entry) +
					offset -
					entry->vme_start),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				pmap_remove(map->pmap,
				    (addr64_t)start,
				    (addr64_t)(start + remove_size));
			}
		}
	}

	/*
	 * NOTE(review): if the lookup above failed, "entry" is the entry
	 * preceding "offset" (per vm_map_lookup_entry semantics), so
	 * advancing to vme_next still lands on the first candidate.
	 */
	entry = entry->vme_next;

	/* remaining entries fully start inside the range */
	while ((entry != vm_map_to_entry(sub_map))
	    && (entry->vme_start < submap_end)) {
		remove_size = (entry->vme_end - entry->vme_start);
		if (submap_end < entry->vme_end) {
			remove_size -= entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			vm_map_submap_pmap_clean(
				sub_map,
				(start + entry->vme_start) - offset,
				((start + entry->vme_start) - offset) + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					VME_OFFSET(entry),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				pmap_remove(map->pmap,
				    (addr64_t)((start + entry->vme_start)
				    - offset),
				    (addr64_t)(((start + entry->vme_start)
				    - offset) + remove_size));
			}
		}
		entry = entry->vme_next;
	}
	vm_map_unlock_read(sub_map);
	return;
}
7585 
7586 /*
7587  *     virt_memory_guard_ast:
7588  *
7589  *     Handle the AST callout for a virtual memory guard.
7590  *	   raise an EXC_GUARD exception and terminate the task
7591  *     if configured to do so.
7592  */
void
virt_memory_guard_ast(
	thread_t thread,
	mach_exception_data_type_t code,
	mach_exception_data_type_t subcode)
{
	task_t task = get_threadtask(thread);
	assert(task != kernel_task);
	assert(task == current_task());
	kern_return_t sync_exception_result;
	uint32_t behavior;

	behavior = task->task_exc_guard;

	/* Is delivery enabled */
	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
		return;
	}

	/*
	 * If only once, make sure we're that once:
	 * CAS loop atomically clears the DELIVER bit so only one thread
	 * ever delivers the exception.  If another thread races us and
	 * wins the swap (clearing DELIVER first), we bail out.
	 */
	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;

		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
			break;
		}
		/* CAS failed: reload and re-check whether delivery is still ours */
		behavior = task->task_exc_guard;
		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
			return;
		}
	}

	/* Raise exception synchronously and see if handler claimed it */
	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode);

	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
		/*
		 * If Synchronous EXC_GUARD delivery was successful then
		 * kill the process and return, else kill the process
		 * and deliver the exception via EXC_CORPSE_NOTIFY.
		 */
		if (sync_exception_result == KERN_SUCCESS) {
			task_bsdtask_kill(current_task());
		} else {
			exit_with_guard_exception(current_proc(), code, subcode);
		}
	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
		/*
		 * If the synchronous EXC_GUARD delivery was not successful,
		 * raise a simulated crash.
		 */
		if (sync_exception_result != KERN_SUCCESS) {
			task_violated_guard(code, subcode, NULL);
		}
	}
}
7649 
7650 /*
7651  *     vm_map_guard_exception:
7652  *
7653  *     Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7654  *
7655  *     Right now, we do this when we find nothing mapped, or a
7656  *     gap in the mapping when a user address space deallocate
7657  *     was requested. We report the address of the first gap found.
7658  */
7659 static void
vm_map_guard_exception(vm_map_offset_t gap_start,unsigned reason)7660 vm_map_guard_exception(
7661 	vm_map_offset_t gap_start,
7662 	unsigned reason)
7663 {
7664 	mach_exception_code_t code = 0;
7665 	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7666 	unsigned int target = 0; /* should we pass in pid associated with map? */
7667 	mach_exception_data_type_t subcode = (uint64_t)gap_start;
7668 	boolean_t fatal = FALSE;
7669 
7670 	task_t task = current_task_early();
7671 
7672 	/* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7673 	if (task == NULL || task == kernel_task) {
7674 		return;
7675 	}
7676 
7677 	EXC_GUARD_ENCODE_TYPE(code, guard_type);
7678 	EXC_GUARD_ENCODE_FLAVOR(code, reason);
7679 	EXC_GUARD_ENCODE_TARGET(code, target);
7680 
7681 	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7682 		fatal = TRUE;
7683 	}
7684 	thread_guard_violation(current_thread(), code, subcode, fatal);
7685 }
7686 
/*
 * Fatal diagnostic for vm_map_delete() on a kernel-pmap map:
 * the deletion request [start, end) found no map entry at "where".
 * Gaps are never tolerated in the kernel map and its submaps.
 */
__abortlike
static void
__vm_map_delete_gap_panic(
	vm_map_t                map,
	vm_map_offset_t         where,
	vm_map_offset_t         start,
	vm_map_offset_t         end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
}
7698 
/*
 * Fatal diagnostic for vm_map_delete() on a kernel-pmap map:
 * an attempt was made to remove a "permanent" VM map entry,
 * which never dies in the kernel map (even with
 * VM_MAP_REMOVE_IMMUTABLE).
 */
__abortlike
static void
__vm_map_delete_permanent_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_map_entry_t          entry)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): "
	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
	    map, (uint64_t)start, (uint64_t)end, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end);
}
7713 
/*
 * Fatal diagnostic for vm_map_delete() on a kernel-pmap map:
 * an atomic entry may only be removed when the request targets
 * exactly its [vme_start, vme_end) range; a looser request that
 * merely encompasses it is a caller bug.
 */
__abortlike
static void
__vm_map_delete_loose_atomic_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_map_entry_t          entry)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): "
	    "request loosely encompasses atomic entry %p at (0x%llx,0x%llx)",
	    map, (uint64_t)start, (uint64_t)end, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end);
}
7728 
/*
 * State flags threaded through the main loop of vm_map_delete().
 */
__options_decl(vm_map_delete_state_t, uint32_t, {
	VMDS_NONE               = 0x0000,

	VMDS_FOUND_GAP          = 0x0001,       /* a hole was found in the range; gap_start records it */
	VMDS_GAPS_OK            = 0x0002,       /* map is being torn down: gaps are expected */

	VMDS_KERNEL_PMAP        = 0x0004,       /* map is backed by the kernel pmap: strict policy checks */
	VMDS_NEEDS_LOOKUP       = 0x0008,       /* the map lock was dropped: re-lookup the entry at "s" */
	VMDS_NEEDS_WAKEUP       = 0x0010,       /* threads are waiting on entries we touched: wake them */
});
7739 
7740 /*
7741  *	vm_map_delete:	[ internal use only ]
7742  *
7743  *	Deallocates the given address range from the target map.
7744  *	Removes all user wirings. Unwires one kernel wiring if
7745  *	VM_MAP_REMOVE_KUNWIRE is set.  Waits for kernel wirings to go
7746  *	away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set.  Sleeps
7747  *	interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
7748  *
7749  *
7750  *	When VM_MAP_REMOVE_RETURN_ERRORS is not passed,
7751  *	then any error in removing mappings will lead to a panic
7752  *	so that clients do not have to repeat the panic code
7753  *	at each call site.  If VM_MAP_REMOVE_INTERRUPTIBLE
7754  *	is also passed, then KERN_ABORTED will not lead to a panic.
7755  *
7756  *	Note: at this time, there is no such condition,
7757  *	      that isn't already causing a panic.
7758  *
7759  *	      If the code is changed to add such errors later,
7760  *	      then the flag must be honored.
7761  *
7762  *	This routine is called with map locked and leaves map locked.
7763  */
static kern_return_t
vm_map_delete(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vmr_flags_t             flags,
	vm_map_zap_t            zap_list)
{
	vm_map_entry_t          entry, next;
	int                     interruptible;
	/* address of the first gap found, reported via EXC_GUARD/DTrace */
	vm_map_offset_t         gap_start = 0;
	/* end of the range we marked "in_transition" on a previous pass */
	vm_map_offset_t         clear_in_transition_end = 0;
	__unused vm_map_offset_t save_start = start;
	__unused vm_map_offset_t save_end = end;
	vm_map_delete_state_t   state = VMDS_NONE;

	if (vm_map_pmap(map) == kernel_pmap) {
		state |= VMDS_KERNEL_PMAP;
	}

	/*
	 * A map that is terminated or has no references left is being
	 * torn down: gaps in the range are expected and not an error.
	 */
	if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
		state |= VMDS_GAPS_OK;
	}

	interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
	    THREAD_ABORTSAFE : THREAD_UNINT;

	/*
	 *	Find the start of the region.
	 *
	 *	If in a superpage, extend the range
	 *	to include the start of the mapping.
	 */
	if (vm_map_lookup_entry_or_next(map, start, &entry)) {
		if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
			/*
			 * Walk backwards superpage by superpage until "start"
			 * no longer lands in the middle of a superpage entry.
			 */
			start = SUPERPAGE_ROUND_DOWN(start);
			while (vm_map_lookup_entry_or_next(map, start, &entry)) {
				if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
					start = SUPERPAGE_ROUND_DOWN(start);
					continue;
				}
				break;
			}
		}
		SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
	} else {
		/* nothing mapped at "start": record the gap unless expected */
		if (!(state & VMDS_GAPS_OK)) {
			state |= VMDS_FOUND_GAP;
			gap_start = start;
		}
	}

	/*
	 * NOTE(review): "entry" here is whatever the last
	 * vm_map_lookup_entry_or_next() call produced (presumably the next
	 * entry, or the map header, on a failed lookup) — confirm that the
	 * lookup always initializes it before this dereference.
	 */
	if (entry->superpage_size) {
		end = SUPERPAGE_ROUND_UP(end);
	}

	/*
	 *	Step through all entries in this region
	 */
	for (vm_map_offset_t s = start; s < end;) {
		/*
		 * At this point, we have deleted all the memory entries
		 * in [start, s) and are proceeding with the [s, end) range.
		 *
		 * This loop might drop the map lock, and it is possible that
		 * some memory was already reallocated within [start, s)
		 * and we don't want to mess with those entries.
		 *
		 * Some of those entries could even have been re-assembled
		 * with an entry after "s" (in vm_map_simplify_entry()), so
		 * we may have to vm_map_clip_start() again.
		 *
		 * When clear_in_transition_end is set, we had marked
		 * [start, clear_in_transition_end) as "in_transition"
		 * during a previous iteration and we need to clear it.
		 */

		/*
		 * Step 1: If needed (because we dropped locks),
		 *         lookup the entry again.
		 *
		 *         If we're coming back from unwiring (Step 5),
		 *         we also need to mark the entries as no longer
		 *         in transition after that.
		 */

		if (state & VMDS_NEEDS_LOOKUP) {
			state &= ~VMDS_NEEDS_LOOKUP;

			if (vm_map_lookup_entry_or_next(map, s, &entry)) {
				SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
			}
		}

		if (clear_in_transition_end) {
			/*
			 * Clear the in_transition marks we set before the
			 * unwire pass dropped the map lock; remember to wake
			 * anyone who started waiting on those entries.
			 */
			for (vm_map_entry_t it = entry;
			    it != vm_map_to_entry(map) &&
			    it->vme_start < clear_in_transition_end;
			    it = it->vme_next) {
				assert(it->in_transition);
				it->in_transition = FALSE;
				if (it->needs_wakeup) {
					it->needs_wakeup = FALSE;
					state |= VMDS_NEEDS_WAKEUP;
				}
			}

			clear_in_transition_end = 0;
		}


		/*
		 * Step 2: Perform various policy checks
		 *         before we do _anything_ to this entry.
		 */

		if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
			if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
				/*
				 * Either we found a gap already,
				 * or we are tearing down a map,
				 * keep going.
				 */
			} else if (state & VMDS_KERNEL_PMAP) {
				__vm_map_delete_gap_panic(map, s, start, end);
			} else if (vm_map_round_page(s, VM_MAP_PAGE_MASK(map)) < end) {
				/*
				 * The vm_map_round_page() is needed since an entry
				 * can be less than VM_MAP_PAGE_MASK() sized.
				 *
				 * For example, devices which have h/w 4K pages,
				 * but entry sizes are all now 16K.
				 */
				state |= VMDS_FOUND_GAP;
				gap_start = s;
			}

			if (entry == vm_map_to_entry(map) ||
			    end <= entry->vme_start) {
				break;
			}

			s = entry->vme_start;
		}

		if (state & VMDS_KERNEL_PMAP) {
			/*
			 * In the kernel map and its submaps,
			 * permanent entries never die, even
			 * if VM_MAP_REMOVE_IMMUTABLE is passed.
			 */
			if (entry->permanent) {
				__vm_map_delete_permanent_panic(map, start, end, entry);
			}

			/*
			 * In the kernel map and its submaps,
			 * the removal of an atomic entry is strict.
			 *
			 * An atomic entry is processed only if it was
			 * specifically targeted.
			 *
			 * We might have deleted non-atomic entries before
			 * we reach this point however...
			 */
			if (entry->vme_atomic &&
			    (entry->vme_start != start || entry->vme_end != end)) {
				__vm_map_delete_loose_atomic_panic(map,
				    start, end, entry);
			}
		}


		/*
		 * Step 3: Perform any clipping needed.
		 *
		 *         After this, "entry" starts at "s", ends before "end"
		 */

		if (entry->vme_start < s) {
			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
			    entry->map_aligned &&
			    !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
				/*
				 * The entry will no longer be map-aligned
				 * after clipping and the caller said it's OK.
				 */
				entry->map_aligned = FALSE;
			}
			vm_map_clip_start(map, entry, s);
			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
		}

		if (end < entry->vme_end) {
			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
			    entry->map_aligned &&
			    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
				/*
				 * The entry will no longer be map-aligned
				 * after clipping and the caller said it's OK.
				 */
				entry->map_aligned = FALSE;
			}
			vm_map_clip_end(map, entry, end);
		}

		assert(s == entry->vme_start);
		assert(entry->vme_end <= end);


		/*
		 * Step 4: If the entry is in flux, wait for this to resolve.
		 */

		if (entry->in_transition) {
			wait_result_t wait_result;

			/*
			 * Another thread is wiring/unwiring this entry.
			 * Let the other thread know we are waiting.
			 */

			entry->needs_wakeup = TRUE;

			/*
			 * wake up anybody waiting on entries that we have
			 * already unwired/deleted.
			 */
			if (state & VMDS_NEEDS_WAKEUP) {
				vm_map_entry_wakeup(map);
				state &= ~VMDS_NEEDS_WAKEUP;
			}

			wait_result = vm_map_entry_wait(map, interruptible);

			if (interruptible &&
			    wait_result == THREAD_INTERRUPTED) {
				/*
				 * We do not clear the needs_wakeup flag,
				 * since we cannot tell if we were the only one.
				 */
				return KERN_ABORTED;
			}

			/*
			 * The entry could have been clipped or it
			 * may not exist anymore.  Look it up again.
			 */
			state |= VMDS_NEEDS_LOOKUP;
			continue;
		}


		/*
		 * Step 5: Handle wiring
		 */

		if (entry->wired_count) {
			struct vm_map_entry tmp_entry;
			boolean_t           user_wire;
			unsigned int        last_timestamp;

			user_wire = entry->user_wired_count > 0;

			/*
			 *      Remove a kernel wiring if requested
			 */
			if (flags & VM_MAP_REMOVE_KUNWIRE) {
				entry->wired_count--;
			}

			/*
			 *	Remove all user wirings for proper accounting
			 */
			while (entry->user_wired_count) {
				subtract_wire_counts(map, entry, user_wire);
			}

			/*
			 * All our DMA I/O operations in IOKit are currently
			 * done by wiring through the map entries of the task
			 * requesting the I/O.
			 *
			 * Because of this, we must always wait for kernel wirings
			 * to go away on the entries before deleting them.
			 *
			 * Any caller who wants to actually remove a kernel wiring
			 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
			 * properly remove one wiring instead of blasting through
			 * them all.
			 */
			if (entry->wired_count != 0) {
				assert(map != kernel_map);
				/*
				 * Cannot continue.  Typical case is when
				 * a user thread has physical io pending on
				 * on this page.  Either wait for the
				 * kernel wiring to go away or return an
				 * error.
				 */
				wait_result_t wait_result;

				entry->needs_wakeup = TRUE;
				wait_result = vm_map_entry_wait(map,
				    interruptible);

				if (interruptible &&
				    wait_result == THREAD_INTERRUPTED) {
					/*
					 * We do not clear the
					 * needs_wakeup flag, since we
					 * cannot tell if we were the
					 * only one.
					 */
					return KERN_ABORTED;
				}


				/*
				 * The entry could have been clipped or
				 * it may not exist anymore.  Look it
				 * up again.
				 */
				state |= VMDS_NEEDS_LOOKUP;
				continue;
			}

			/*
			 * We can unlock the map now.
			 *
			 * The entry might be split once we unlock the map,
			 * but we need the range as defined by this entry
			 * to be stable. So we must make a local copy.
			 *
			 * The underlying objects do not change during clips,
			 * and the in_transition state guarantees existence
			 * of the entry.
			 */
			last_timestamp = map->timestamp;
			entry->in_transition = TRUE;
			tmp_entry = *entry;
			vm_map_unlock(map);

			if (tmp_entry.is_sub_map) {
				vm_map_t sub_map;
				vm_map_offset_t sub_start, sub_end;
				pmap_t pmap;
				vm_map_offset_t pmap_addr;


				sub_map = VME_SUBMAP(&tmp_entry);
				sub_start = VME_OFFSET(&tmp_entry);
				sub_end = sub_start + (tmp_entry.vme_end -
				    tmp_entry.vme_start);
				if (tmp_entry.use_pmap) {
					/* nested pmap: unwire in the submap's own pmap */
					pmap = sub_map->pmap;
					pmap_addr = tmp_entry.vme_start;
				} else {
					pmap = map->pmap;
					pmap_addr = tmp_entry.vme_start;
				}
				(void) vm_map_unwire_nested(sub_map,
				    sub_start, sub_end,
				    user_wire,
				    pmap, pmap_addr);
			} else {
				if (VME_OBJECT(&tmp_entry) == kernel_object) {
					pmap_protect_options(
						map->pmap,
						tmp_entry.vme_start,
						tmp_entry.vme_end,
						VM_PROT_NONE,
						PMAP_OPTIONS_REMOVE,
						NULL);
				}
				vm_fault_unwire(map, &tmp_entry,
				    VME_OBJECT(&tmp_entry) == kernel_object,
				    map->pmap, tmp_entry.vme_start);
			}

			vm_map_lock(map);

			/*
			 * Unwiring happened, we can now go back to deleting
			 * them (after we clear the in_transition bit for the range).
			 */
			if (last_timestamp + 1 != map->timestamp) {
				state |= VMDS_NEEDS_LOOKUP;
			}
			clear_in_transition_end = tmp_entry.vme_end;
			continue;
		}

		assert(entry->wired_count == 0);
		assert(entry->user_wired_count == 0);


		/*
		 * Step 6: Entry is unwired and ready for us to delete !
		 */

		if (!entry->permanent) {
			/*
			 * Typical case: the entry really shouldn't be permanent
			 */
		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
#if 0
			printf("FBDP %d[%s] removing permanent entry "
			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
			    proc_selfpid(),
			    (current_task()->bsd_info
			    ? proc_name_address(current_task()->bsd_info)
			    : "?"), entry,
			    (uint64_t)entry->vme_start,
			    (uint64_t)entry->vme_end,
			    entry->protection,
			    entry->max_protection);
#endif
			entry->permanent = FALSE;
		} else {
			/*
			 * dtrace -n 'vm_map_delete_permanent {
			 *     print("start=0x%llx end=0x%llx prot=0x%x/0x%x\n", arg0, arg1, arg2, arg3);
			 *     stack();
			 *     ustack();
			 * }'
			 */
			DTRACE_VM5(vm_map_delete_permanent,
			    vm_map_offset_t, entry->vme_start,
			    vm_map_offset_t, entry->vme_end,
			    vm_prot_t, entry->protection,
			    vm_prot_t, entry->max_protection,
			    int, VME_ALIAS(entry));
		}

		if (entry->is_sub_map) {
			assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
			    "map %p (%d) entry %p submap %p (%d)\n",
			    map, VM_MAP_PAGE_SHIFT(map), entry,
			    VME_SUBMAP(entry),
			    VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
			if (entry->use_pmap) {
#ifndef NO_NESTED_PMAP
				int pmap_flags;

				if (map->terminated) {
					/*
					 * This is the final cleanup of the
					 * address space being terminated.
					 * No new mappings are expected and
					 * we don't really need to unnest the
					 * shared region (and lose the "global"
					 * pmap mappings, if applicable).
					 *
					 * Tell the pmap layer that we're
					 * "clean" wrt nesting.
					 */
					pmap_flags = PMAP_UNNEST_CLEAN;
				} else {
					/*
					 * We're unmapping part of the nested
					 * shared region, so we can't keep the
					 * nested pmap.
					 */
					pmap_flags = 0;
				}
				pmap_unnest_options(
					map->pmap,
					(addr64_t)entry->vme_start,
					entry->vme_end - entry->vme_start,
					pmap_flags);
#endif  /* NO_NESTED_PMAP */
				if (map->mapped_in_other_pmaps &&
				    os_ref_get_count_raw(&map->map_refcnt) != 0) {
					/* clean up parent map/maps */
					vm_map_submap_pmap_clean(
						map, entry->vme_start,
						entry->vme_end,
						VME_SUBMAP(entry),
						VME_OFFSET(entry));
				}
			} else {
				vm_map_submap_pmap_clean(
					map, entry->vme_start, entry->vme_end,
					VME_SUBMAP(entry),
					VME_OFFSET(entry));
			}
		} else if (VME_OBJECT(entry) == kernel_object ||
		    VME_OBJECT(entry) == compressor_object) {
			/*
			 * nothing to do
			 */
		} else if (map->mapped_in_other_pmaps &&
		    os_ref_get_count_raw(&map->map_refcnt) != 0) {
			vm_object_pmap_protect_options(
				VME_OBJECT(entry), VME_OFFSET(entry),
				entry->vme_end - entry->vme_start,
				PMAP_NULL,
				PAGE_SIZE,
				entry->vme_start,
				VM_PROT_NONE,
				PMAP_OPTIONS_REMOVE);
		} else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
		    (state & VMDS_KERNEL_PMAP)) {
			/* Remove translations associated
			 * with this range unless the entry
			 * does not have an object, or
			 * it's the kernel map or a descendant
			 * since the platform could potentially
			 * create "backdoor" mappings invisible
			 * to the VM. It is expected that
			 * objectless, non-kernel ranges
			 * do not have such VM invisible
			 * translations.
			 */
			pmap_remove_options(map->pmap,
			    (addr64_t)entry->vme_start,
			    (addr64_t)entry->vme_end,
			    PMAP_OPTIONS_REMOVE);
		}

#if DEBUG
		/*
		 * All pmap mappings for this map entry must have been
		 * cleared by now.
		 */
		assert(pmap_is_empty(map->pmap,
		    entry->vme_start,
		    entry->vme_end));
#endif /* DEBUG */

		if (entry->iokit_acct) {
			/* alternate accounting */
			DTRACE_VM4(vm_map_iokit_unmapped_region,
			    vm_map_t, map,
			    vm_map_offset_t, entry->vme_start,
			    vm_map_offset_t, entry->vme_end,
			    int, VME_ALIAS(entry));
			vm_map_iokit_unmapped_region(map,
			    (entry->vme_end -
			    entry->vme_start));
			entry->iokit_acct = FALSE;
			entry->use_pmap = FALSE;
		}

		s = entry->vme_end;
		next = entry->vme_next;

		if (entry->permanent) {
			/*
			 * A permanent entry can not be removed, so leave it
			 * in place but remove all access permissions.
			 */
			entry->protection = VM_PROT_NONE;
			entry->max_protection = VM_PROT_NONE;
		} else {
			vm_map_entry_zap(map, entry, zap_list);
		}

		entry = next;

		if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
			unsigned int last_timestamp = map->timestamp++;

			if (lck_rw_lock_yield_exclusive(&map->lock,
			    LCK_RW_YIELD_ANY_WAITER)) {
				/*
				 * NOTE(review): this comparison looks inverted
				 * relative to the "last_timestamp + 1 !=
				 * map->timestamp" check used after unwiring
				 * above; as written it is true whenever the
				 * yield actually happened, forcing a (safe,
				 * possibly redundant) re-lookup — confirm
				 * intent.
				 */
				if (last_timestamp != map->timestamp + 1) {
					state |= VMDS_NEEDS_LOOKUP;
				}
			} else {
				/* we didn't yield, undo our change */
				map->timestamp--;
			}
		}
	}

	if (map->wait_for_space) {
		thread_wakeup((event_t) map);
	}

	if (state & VMDS_NEEDS_WAKEUP) {
		vm_map_entry_wakeup(map);
	}

	if (state & VMDS_FOUND_GAP) {
		DTRACE_VM3(kern_vm_deallocate_gap,
		    vm_map_offset_t, gap_start,
		    vm_map_offset_t, save_start,
		    vm_map_offset_t, save_end);
		if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
			return KERN_INVALID_VALUE;
		} else {
			vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
		}
	}

	return KERN_SUCCESS;
}
8362 
8363 
8364 /*
8365  *	vm_map_terminate:
8366  *
8367  *	Clean out a task's map.
8368  */
kern_return_t
vm_map_terminate(
	vm_map_t        map)
{
	vm_map_lock(map);
	/*
	 * Mark the map terminated while holding the lock: vm_map_delete()
	 * treats a terminated map as "gaps OK" and allows removal of
	 * permanent entries during this final cleanup.
	 */
	map->terminated = TRUE;
	vm_map_disable_hole_optimization(map);
	/* remove everything; this also drops the map lock */
	vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
	    VM_MAP_REMOVE_NO_FLAGS);
	return KERN_SUCCESS;
}
8380 
8381 /*
8382  *	vm_map_remove:
8383  *
8384  *	Remove the given address range from the target map.
8385  *	This is the exported form of vm_map_delete.
8386  */
kern_return_t
vm_map_remove_flags(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vmr_flags_t     flags)
{
	/*
	 * Take the map lock; vm_map_remove_and_unlock() performs the
	 * deletion, drops the lock, and disposes of the zapped entries.
	 */
	vm_map_lock(map);
	return vm_map_remove_and_unlock(map, start, end, flags);
}
8397 
8398 /*
8399  *	vm_map_remove_locked:
8400  *
8401  *	Remove the given address range from the target locked map.
8402  *	This is the exported form of vm_map_delete.
8403  */
kern_return_t
vm_map_remove_and_unlock(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vmr_flags_t     flags)
{
	/* entries removed by vm_map_delete() are collected on this zap list */
	VM_MAP_ZAP_DECLARE(zap);
	kern_return_t   result;

	VM_MAP_RANGE_CHECK(map, start, end);
	result = vm_map_delete(map, start, end, flags, &zap);
	vm_map_unlock(map);

	/* free the zapped entries only after the map lock is dropped */
	vm_map_zap_dispose(&zap);

	return result;
}
8422 
8423 
8424 /*
8425  *	Routine:	vm_map_copy_allocate
8426  *
8427  *	Description:
8428  *		Allocates and initializes a map copy object.
8429  */
8430 static vm_map_copy_t
vm_map_copy_allocate(void)8431 vm_map_copy_allocate(void)
8432 {
8433 	vm_map_copy_t new_copy;
8434 
8435 	new_copy = zalloc_flags(vm_map_copy_zone, Z_WAITOK | Z_ZERO);
8436 	new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
8437 	vm_map_copy_first_entry(new_copy) = vm_map_copy_to_entry(new_copy);
8438 	vm_map_copy_last_entry(new_copy) = vm_map_copy_to_entry(new_copy);
8439 	return new_copy;
8440 }
8441 
8442 /*
8443  *	Routine:	vm_map_copy_discard
8444  *
8445  *	Description:
8446  *		Dispose of a map copy object (returned by
8447  *		vm_map_copyin).
8448  */
8449 void
vm_map_copy_discard(vm_map_copy_t copy)8450 vm_map_copy_discard(
8451 	vm_map_copy_t   copy)
8452 {
8453 	if (copy == VM_MAP_COPY_NULL) {
8454 		return;
8455 	}
8456 
8457 	/*
8458 	 * Assert that the vm_map_copy is coming from the right
8459 	 * zone and hasn't been forged
8460 	 */
8461 	vm_map_copy_require(copy);
8462 
8463 	switch (copy->type) {
8464 	case VM_MAP_COPY_ENTRY_LIST:
8465 		while (vm_map_copy_first_entry(copy) !=
8466 		    vm_map_copy_to_entry(copy)) {
8467 			vm_map_entry_t  entry = vm_map_copy_first_entry(copy);
8468 
8469 			vm_map_copy_entry_unlink(copy, entry);
8470 			if (entry->is_sub_map) {
8471 				vm_map_deallocate(VME_SUBMAP(entry));
8472 			} else {
8473 				vm_object_deallocate(VME_OBJECT(entry));
8474 			}
8475 			vm_map_copy_entry_dispose(entry);
8476 		}
8477 		break;
8478 	case VM_MAP_COPY_OBJECT:
8479 		vm_object_deallocate(copy->cpy_object);
8480 		break;
8481 	case VM_MAP_COPY_KERNEL_BUFFER:
8482 
8483 		/*
8484 		 * The vm_map_copy_t and possibly the data buffer were
8485 		 * allocated by a single call to kalloc_data(), i.e. the
8486 		 * vm_map_copy_t was not allocated out of the zone.
8487 		 */
8488 		if (copy->size > msg_ool_size_small || copy->offset) {
8489 			panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
8490 			    (long long)copy->size, (long long)copy->offset);
8491 		}
8492 		kfree_data(copy->cpy_kdata, copy->size);
8493 	}
8494 	zfree(vm_map_copy_zone, copy);
8495 }
8496 
8497 /*
8498  *	Routine:	vm_map_copy_copy
8499  *
8500  *	Description:
8501  *			Move the information in a map copy object to
8502  *			a new map copy object, leaving the old one
8503  *			empty.
8504  *
8505  *			This is used by kernel routines that need
8506  *			to look at out-of-line data (in copyin form)
8507  *			before deciding whether to return SUCCESS.
8508  *			If the routine returns FAILURE, the original
8509  *			copy object will be deallocated; therefore,
8510  *			these routines must make a copy of the copy
8511  *			object and leave the original empty so that
8512  *			deallocation will not fail.
8513  */
vm_map_copy_t
vm_map_copy_copy(
	vm_map_copy_t   copy)
{
	vm_map_copy_t   new_copy;

	if (copy == VM_MAP_COPY_NULL) {
		return VM_MAP_COPY_NULL;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 * Allocate a new copy object, and copy the information
	 * from the old one into it.
	 */

	new_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
#if __has_feature(ptrauth_calls)
	/*
	 * Explicit re-assignment after the raw memcpy — presumably so the
	 * cpy_kdata pointer is re-signed for the new object's address
	 * (ptrauth address diversity); confirm against the field's
	 * __ptrauth qualifier.
	 */
	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		new_copy->cpy_kdata = copy->cpy_kdata;
	}
#endif

	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * The links in the entry chain must be
		 * changed to point to the new copy object.
		 */
		vm_map_copy_first_entry(copy)->vme_prev
		        = vm_map_copy_to_entry(new_copy);
		vm_map_copy_last_entry(copy)->vme_next
		        = vm_map_copy_to_entry(new_copy);
	}

	/*
	 * Change the old copy object into one that contains
	 * nothing to be deallocated.
	 */
	copy->type = VM_MAP_COPY_OBJECT;
	copy->cpy_object = VM_OBJECT_NULL;

	/*
	 * Return the new object.
	 */
	return new_copy;
}
8566 
8567 static boolean_t
vm_map_entry_is_overwritable(vm_map_t dst_map __unused,vm_map_entry_t entry)8568 vm_map_entry_is_overwritable(
8569 	vm_map_t        dst_map __unused,
8570 	vm_map_entry_t  entry)
8571 {
8572 	if (!(entry->protection & VM_PROT_WRITE)) {
8573 		/* can't overwrite if not writable */
8574 		return FALSE;
8575 	}
8576 #if !__x86_64__
8577 	if (entry->used_for_jit &&
8578 	    vm_map_cs_enforcement(dst_map) &&
8579 	    !dst_map->cs_debugged) {
8580 		/*
8581 		 * Can't overwrite a JIT region while cs_enforced
8582 		 * and not cs_debugged.
8583 		 */
8584 		return FALSE;
8585 	}
8586 #endif /* !__x86_64__ */
8587 	return TRUE;
8588 }
8589 
/*
 * Recursively verify that [dst_addr, dst_addr + dst_size) in "dst_map"
 * is a contiguous, writable, overwritable region, descending into any
 * submaps encountered.  Enters and leaves with "dst_map" unlocked.
 */
static kern_return_t
vm_map_overwrite_submap_recurse(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_size_t   dst_size)
{
	vm_map_offset_t dst_end;
	vm_map_entry_t  tmp_entry;
	vm_map_entry_t  entry;
	kern_return_t   result;
	boolean_t       encountered_sub_map = FALSE;



	/*
	 *	Verify that the destination is all writeable
	 *	initially.  We have to trunc the destination
	 *	address and round the copy size or we'll end up
	 *	splitting entries in strange ways.
	 */

	dst_end = vm_map_round_page(dst_addr + dst_size,
	    VM_MAP_PAGE_MASK(dst_map));
	vm_map_lock(dst_map);

start_pass_1:
	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}

	vm_map_clip_start(dst_map,
	    tmp_entry,
	    vm_map_trunc_page(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)));
	if (tmp_entry->is_sub_map) {
		/* clipping did unnest if needed */
		assert(!tmp_entry->use_pmap);
	}

	for (entry = tmp_entry;;) {
		vm_map_entry_t  next;

		next = entry->vme_next;
		while (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;

			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				goto start_pass_1;
			}

			encountered_sub_map = TRUE;
			sub_start = VME_OFFSET(entry);

			/* clamp the checked range to this entry's extent */
			if (entry->vme_end < dst_end) {
				sub_end = entry->vme_end;
			} else {
				sub_end = dst_end;
			}
			/* translate the map-relative end into a submap offset */
			sub_end -= entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			vm_map_unlock(dst_map);

			result = vm_map_overwrite_submap_recurse(
				VME_SUBMAP(entry),
				sub_start,
				sub_end - sub_start);

			if (result != KERN_SUCCESS) {
				return result;
			}
			/*
			 * NOTE(review): entry->vme_end is read here after
			 * dst_map was unlocked above — presumably relying on
			 * local_end semantics; confirm this cannot race with
			 * a concurrent clip of "entry".
			 */
			if (dst_end <= entry->vme_end) {
				return KERN_SUCCESS;
			}
			vm_map_lock(dst_map);
			if (!vm_map_lookup_entry(dst_map, local_end,
			    &tmp_entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			entry = tmp_entry;
			next = entry->vme_next;
		}

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 *	If the entry is in transition, we must wait
		 *	for it to exit that state.  Anything could happen
		 *	when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto start_pass_1;
		}

/*
 *		our range is contained completely within this map entry
 */
		if (dst_end <= entry->vme_end) {
			vm_map_unlock(dst_map);
			return KERN_SUCCESS;
		}
/*
 *		check that range specified is contiguous region
 */
		if ((next == vm_map_to_entry(dst_map)) ||
		    (next->vme_start != entry->vme_end)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/*
		 *	Check for permanent objects in the destination.
		 */
		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
		    ((!VME_OBJECT(entry)->internal) ||
		    (VME_OBJECT(entry)->true_share))) {
			if (encountered_sub_map) {
				vm_map_unlock(dst_map);
				return KERN_FAILURE;
			}
		}


		entry = next;
	}/* for */
	/* NOTREACHED: the loop above only exits via return statements. */
	vm_map_unlock(dst_map);
	return KERN_SUCCESS;
}
8742 
8743 /*
8744  *	Routine:	vm_map_copy_overwrite
8745  *
8746  *	Description:
8747  *		Copy the memory described by the map copy
8748  *		object (copy; returned by vm_map_copyin) onto
8749  *		the specified destination region (dst_map, dst_addr).
8750  *		The destination must be writeable.
8751  *
8752  *		Unlike vm_map_copyout, this routine actually
8753  *		writes over previously-mapped memory.  If the
8754  *		previous mapping was to a permanent (user-supplied)
8755  *		memory object, it is preserved.
8756  *
8757  *		The attributes (protection and inheritance) of the
8758  *		destination region are preserved.
8759  *
8760  *		If successful, consumes the copy object.
8761  *		Otherwise, the caller is responsible for it.
8762  *
8763  *	Implementation notes:
8764  *		To overwrite aligned temporary virtual memory, it is
8765  *		sufficient to remove the previous mapping and insert
8766  *		the new copy.  This replacement is done either on
8767  *		the whole region (if no permanent virtual memory
8768  *		objects are embedded in the destination region) or
8769  *		in individual map entries.
8770  *
 *		To overwrite permanent virtual memory, it is necessary
8772  *		to copy each page, as the external memory management
8773  *		interface currently does not provide any optimizations.
8774  *
8775  *		Unaligned memory also has to be copied.  It is possible
8776  *		to use 'vm_trickery' to copy the aligned data.  This is
8777  *		not done but not hard to implement.
8778  *
8779  *		Once a page of permanent memory has been overwritten,
8780  *		it is impossible to interrupt this function; otherwise,
8781  *		the call would be neither atomic nor location-independent.
8782  *		The kernel-state portion of a user thread must be
8783  *		interruptible.
8784  *
8785  *		It may be expensive to forward all requests that might
8786  *		overwrite permanent memory (vm_write, vm_copy) to
8787  *		uninterruptible kernel threads.  This routine may be
8788  *		called by interruptible threads; however, success is
8789  *		not guaranteed -- if the request cannot be performed
8790  *		atomically and interruptibly, an error indication is
8791  *		returned.
8792  *
8793  *		Callers of this function must call vm_map_copy_require on
8794  *		previously created vm_map_copy_t or pass a newly created
8795  *		one to ensure that it hasn't been forged.
8796  */
8797 
/*
 *	vm_map_copy_overwrite_nested:
 *
 *	Workhorse for vm_map_copy_overwrite(): overwrite the destination
 *	range [dst_addr, dst_addr + copy->size) of "dst_map" with the
 *	data described by "copy", recursing into any submaps encountered.
 *
 *	Pass 1 (map locked) validates that the destination range is
 *	contiguous, writeable and overwritable, and records whether it
 *	contains submaps or "permanent" objects (non-internal or
 *	true_share).  Pass 2 performs the overwrite chunk by chunk,
 *	splicing "copy" apart whenever a submap boundary requires a
 *	recursive call, and re-assembling it on error so the caller
 *	still owns a valid copy object.
 *
 *	dst_map is unlocked on entry and on return.
 *
 *	Returns:
 *		KERN_SUCCESS -- and consumes "copy" if discard_on_success.
 *		KERN_FAILURE -- if "interruptible" and the destination
 *			contains permanent objects (copy could not be
 *			performed atomically).
 *		KERN_INVALID_ADDRESS / KERN_PROTECTION_FAILURE -- on a
 *			bad or non-writeable destination; caller keeps
 *			ownership of "copy".
 */
static kern_return_t
vm_map_copy_overwrite_nested(
	vm_map_t                dst_map,
	vm_map_address_t        dst_addr,
	vm_map_copy_t           copy,
	boolean_t               interruptible,
	pmap_t                  pmap,
	boolean_t               discard_on_success)
{
	vm_map_offset_t         dst_end;
	vm_map_entry_t          tmp_entry;
	vm_map_entry_t          entry;
	kern_return_t           kr;
	boolean_t               aligned = TRUE;
	boolean_t               contains_permanent_objects = FALSE;
	boolean_t               encountered_sub_map = FALSE;
	vm_map_offset_t         base_addr;
	vm_map_size_t           copy_size;
	vm_map_size_t           total_size;
	uint16_t                copy_page_shift;

	/*
	 *	Check for special kernel buffer allocated
	 *	by new_ipc_kmsg_copyin.
	 */

	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		return vm_map_copyout_kernel_buffer(
			dst_map, &dst_addr,
			copy, copy->size, TRUE, discard_on_success);
	}

	/*
	 *      Only works for entry lists at the moment.  Will
	 *	support page lists later.
	 */

	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);

	if (copy->size == 0) {
		/* nothing to overwrite; honor the discard contract anyway */
		if (discard_on_success) {
			vm_map_copy_discard(copy);
		}
		return KERN_SUCCESS;
	}

	copy_page_shift = copy->cpy_hdr.page_shift;

	/*
	 *	Verify that the destination is all writeable
	 *	initially.  We have to trunc the destination
	 *	address and round the copy size or we'll end up
	 *	splitting entries in strange ways.
	 */

	if (!VM_MAP_PAGE_ALIGNED(copy->size,
	    VM_MAP_PAGE_MASK(dst_map)) ||
	    !VM_MAP_PAGE_ALIGNED(copy->offset,
	    VM_MAP_PAGE_MASK(dst_map)) ||
	    !VM_MAP_PAGE_ALIGNED(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)) ||
	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
		aligned = FALSE;
		dst_end = vm_map_round_page(dst_addr + copy->size,
		    VM_MAP_PAGE_MASK(dst_map));
	} else {
		dst_end = dst_addr + copy->size;
	}

	vm_map_lock(dst_map);

	/* LP64todo - remove this check when vm_map_commpage64()
	 * no longer has to stuff in a map_entry for the commpage
	 * above the map's max_offset.
	 */
	if (dst_addr >= dst_map->max_offset) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}

start_pass_1:
	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}
	vm_map_clip_start(dst_map,
	    tmp_entry,
	    vm_map_trunc_page(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)));
	/*
	 * Pass 1: validate the destination range entry by entry.
	 */
	for (entry = tmp_entry;;) {
		vm_map_entry_t  next = entry->vme_next;

		while (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;

			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				goto start_pass_1;
			}

			local_end = entry->vme_end;
			if (!(entry->needs_copy)) {
				/* if needs_copy we are a COW submap */
				/* in such a case we just replace so */
				/* there is no need for the follow-  */
				/* ing check.                        */
				encountered_sub_map = TRUE;
				sub_start = VME_OFFSET(entry);

				if (entry->vme_end < dst_end) {
					sub_end = entry->vme_end;
				} else {
					sub_end = dst_end;
				}
				sub_end -= entry->vme_start;
				sub_end += VME_OFFSET(entry);
				/* recursion drops the map lock; anything may change */
				vm_map_unlock(dst_map);

				kr = vm_map_overwrite_submap_recurse(
					VME_SUBMAP(entry),
					sub_start,
					sub_end - sub_start);
				if (kr != KERN_SUCCESS) {
					return kr;
				}
				vm_map_lock(dst_map);
			}

			if (dst_end <= entry->vme_end) {
				goto start_overwrite;
			}
			if (!vm_map_lookup_entry(dst_map, local_end,
			    &entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			next = entry->vme_next;
		}

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 *	If the entry is in transition, we must wait
		 *	for it to exit that state.  Anything could happen
		 *	when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto start_pass_1;
		}

/*
 *		our range is contained completely within this map entry
 */
		if (dst_end <= entry->vme_end) {
			break;
		}
/*
 *		check that range specified is contiguous region
 */
		if ((next == vm_map_to_entry(dst_map)) ||
		    (next->vme_start != entry->vme_end)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}


		/*
		 *	Check for permanent objects in the destination.
		 */
		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
		    ((!VME_OBJECT(entry)->internal) ||
		    (VME_OBJECT(entry)->true_share))) {
			contains_permanent_objects = TRUE;
		}

		entry = next;
	}/* for */

start_overwrite:
	/*
	 *	If there are permanent objects in the destination, then
	 *	the copy cannot be interrupted.
	 */

	if (interruptible && contains_permanent_objects) {
		vm_map_unlock(dst_map);
		return KERN_FAILURE;   /* XXX */
	}

	/*
	 *
	 *	Make a second pass, overwriting the data
	 *	At the beginning of each loop iteration,
	 *	the next entry to be overwritten is "tmp_entry"
	 *	(initially, the value returned from the lookup above),
	 *	and the starting address expected in that entry
	 *	is "start".
	 */

	total_size = copy->size;
	if (encountered_sub_map) {
		copy_size = 0;
		/* re-calculate tmp_entry since we've had the map */
		/* unlocked */
		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}
	} else {
		copy_size = copy->size;
	}

	/*
	 * Pass 2: outer loop handles one chunk of the destination per
	 * iteration; "base_addr"/"total_size" track progress through
	 * the full range, "copy_size" is the size of the current chunk.
	 */
	base_addr = dst_addr;
	while (TRUE) {
		/* deconstruct the copy object and do in parts */
		/* only in sub_map, interruptable case */
		vm_map_entry_t  copy_entry;
		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
		int             nentries;
		int             remaining_entries = 0;
		vm_map_offset_t new_offset = 0;

		for (entry = tmp_entry; copy_size == 0;) {
			vm_map_entry_t  next;

			next = entry->vme_next;

			/* tmp_entry and base address are moved along */
			/* each time we encounter a sub-map.  Otherwise */
			/* entry can outpace tmp_entry, and the copy_size */
			/* may reflect the distance between them */
			/* if the current entry is found to be in transition */
			/* we will start over at the beginning or the last */
			/* encounter of a submap as dictated by base_addr */
			/* we will zero copy_size accordingly. */
			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				if (!vm_map_lookup_entry(dst_map, base_addr,
				    &tmp_entry)) {
					vm_map_unlock(dst_map);
					return KERN_INVALID_ADDRESS;
				}
				copy_size = 0;
				entry = tmp_entry;
				continue;
			}
			if (entry->is_sub_map) {
				vm_map_offset_t sub_start;
				vm_map_offset_t sub_end;
				vm_map_offset_t local_end;

				if (entry->needs_copy) {
					/* if this is a COW submap */
					/* just back the range with a */
					/* anonymous entry */
					if (entry->vme_end < dst_end) {
						sub_end = entry->vme_end;
					} else {
						sub_end = dst_end;
					}
					if (entry->vme_start < base_addr) {
						sub_start = base_addr;
					} else {
						sub_start = entry->vme_start;
					}
					vm_map_clip_end(
						dst_map, entry, sub_end);
					vm_map_clip_start(
						dst_map, entry, sub_start);
					assert(!entry->use_pmap);
					assert(!entry->iokit_acct);
					/* turn the submap entry into an anonymous one */
					entry->use_pmap = TRUE;
					entry->is_sub_map = FALSE;
					vm_map_deallocate(
						VME_SUBMAP(entry));
					VME_OBJECT_SET(entry, VM_OBJECT_NULL);
					VME_OFFSET_SET(entry, 0);
					entry->is_shared = FALSE;
					entry->needs_copy = FALSE;
					entry->protection = VM_PROT_DEFAULT;
					entry->max_protection = VM_PROT_ALL;
					entry->wired_count = 0;
					entry->user_wired_count = 0;
					if (entry->inheritance
					    == VM_INHERIT_SHARE) {
						entry->inheritance = VM_INHERIT_COPY;
					}
					continue;
				}
				/* first take care of any non-sub_map */
				/* entries to send */
				if (base_addr < entry->vme_start) {
					/* stuff to send */
					copy_size =
					    entry->vme_start - base_addr;
					break;
				}
				sub_start = VME_OFFSET(entry);

				if (entry->vme_end < dst_end) {
					sub_end = entry->vme_end;
				} else {
					sub_end = dst_end;
				}
				sub_end -= entry->vme_start;
				sub_end += VME_OFFSET(entry);
				local_end = entry->vme_end;
				vm_map_unlock(dst_map);
				copy_size = sub_end - sub_start;

				/* adjust the copy object */
				/*
				 * Temporarily truncate "copy" to just the
				 * entries covering this chunk; the remainder
				 * is remembered in next_copy/previous_prev/
				 * remaining_entries and re-attached later.
				 */
				if (total_size > copy_size) {
					vm_map_size_t   local_size = 0;
					vm_map_size_t   entry_size;

					nentries = 1;
					new_offset = copy->offset;
					copy_entry = vm_map_copy_first_entry(copy);
					while (copy_entry !=
					    vm_map_copy_to_entry(copy)) {
						entry_size = copy_entry->vme_end -
						    copy_entry->vme_start;
						if ((local_size < copy_size) &&
						    ((local_size + entry_size)
						    >= copy_size)) {
							vm_map_copy_clip_end(copy,
							    copy_entry,
							    copy_entry->vme_start +
							    (copy_size - local_size));
							entry_size = copy_entry->vme_end -
							    copy_entry->vme_start;
							local_size += entry_size;
							new_offset += entry_size;
						}
						if (local_size >= copy_size) {
							next_copy = copy_entry->vme_next;
							copy_entry->vme_next =
							    vm_map_copy_to_entry(copy);
							previous_prev =
							    copy->cpy_hdr.links.prev;
							copy->cpy_hdr.links.prev = copy_entry;
							copy->size = copy_size;
							remaining_entries =
							    copy->cpy_hdr.nentries;
							remaining_entries -= nentries;
							copy->cpy_hdr.nentries = nentries;
							break;
						} else {
							local_size += entry_size;
							new_offset += entry_size;
							nentries++;
						}
						copy_entry = copy_entry->vme_next;
					}
				}

				/* recurse into the submap with the appropriate pmap */
				if ((entry->use_pmap) && (pmap == NULL)) {
					kr = vm_map_copy_overwrite_nested(
						VME_SUBMAP(entry),
						sub_start,
						copy,
						interruptible,
						VME_SUBMAP(entry)->pmap,
						TRUE);
				} else if (pmap != NULL) {
					kr = vm_map_copy_overwrite_nested(
						VME_SUBMAP(entry),
						sub_start,
						copy,
						interruptible, pmap,
						TRUE);
				} else {
					kr = vm_map_copy_overwrite_nested(
						VME_SUBMAP(entry),
						sub_start,
						copy,
						interruptible,
						dst_map->pmap,
						TRUE);
				}
				if (kr != KERN_SUCCESS) {
					/* re-attach the remainder before failing */
					if (next_copy != NULL) {
						copy->cpy_hdr.nentries +=
						    remaining_entries;
						copy->cpy_hdr.links.prev->vme_next =
						    next_copy;
						copy->cpy_hdr.links.prev
						        = previous_prev;
						copy->size = total_size;
					}
					return kr;
				}
				if (dst_end <= local_end) {
					return KERN_SUCCESS;
				}
				/* otherwise copy no longer exists, it was */
				/* destroyed after successful copy_overwrite */
				copy = vm_map_copy_allocate();
				copy->type = VM_MAP_COPY_ENTRY_LIST;
				copy->offset = new_offset;
				copy->cpy_hdr.page_shift = copy_page_shift;

				/*
				 * XXX FBDP
				 * this does not seem to deal with
				 * the VM map store (R&B tree)
				 */

				total_size -= copy_size;
				copy_size = 0;
				/* put back remainder of copy in container */
				if (next_copy != NULL) {
					copy->cpy_hdr.nentries = remaining_entries;
					copy->cpy_hdr.links.next = next_copy;
					copy->cpy_hdr.links.prev = previous_prev;
					copy->size = total_size;
					next_copy->vme_prev =
					    vm_map_copy_to_entry(copy);
					next_copy = NULL;
				}
				base_addr = local_end;
				vm_map_lock(dst_map);
				if (!vm_map_lookup_entry(dst_map,
				    local_end, &tmp_entry)) {
					vm_map_unlock(dst_map);
					return KERN_INVALID_ADDRESS;
				}
				entry = tmp_entry;
				continue;
			}
			if (dst_end <= entry->vme_end) {
				copy_size = dst_end - base_addr;
				break;
			}

			if ((next == vm_map_to_entry(dst_map)) ||
			    (next->vme_start != entry->vme_end)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}

			entry = next;
		}/* for */

		next_copy = NULL;
		nentries = 1;

		/* adjust the copy object */
		/* (same truncate-to-chunk surgery as above, for the
		 *  non-submap chunk about to be overwritten) */
		if (total_size > copy_size) {
			vm_map_size_t   local_size = 0;
			vm_map_size_t   entry_size;

			new_offset = copy->offset;
			copy_entry = vm_map_copy_first_entry(copy);
			while (copy_entry != vm_map_copy_to_entry(copy)) {
				entry_size = copy_entry->vme_end -
				    copy_entry->vme_start;
				if ((local_size < copy_size) &&
				    ((local_size + entry_size)
				    >= copy_size)) {
					vm_map_copy_clip_end(copy, copy_entry,
					    copy_entry->vme_start +
					    (copy_size - local_size));
					entry_size = copy_entry->vme_end -
					    copy_entry->vme_start;
					local_size += entry_size;
					new_offset += entry_size;
				}
				if (local_size >= copy_size) {
					next_copy = copy_entry->vme_next;
					copy_entry->vme_next =
					    vm_map_copy_to_entry(copy);
					previous_prev =
					    copy->cpy_hdr.links.prev;
					copy->cpy_hdr.links.prev = copy_entry;
					copy->size = copy_size;
					remaining_entries =
					    copy->cpy_hdr.nentries;
					remaining_entries -= nentries;
					copy->cpy_hdr.nentries = nentries;
					break;
				} else {
					local_size += entry_size;
					new_offset += entry_size;
					nentries++;
				}
				copy_entry = copy_entry->vme_next;
			}
		}

		if (aligned) {
			pmap_t  local_pmap;

			if (pmap) {
				local_pmap = pmap;
			} else {
				local_pmap = dst_map->pmap;
			}

			if ((kr =  vm_map_copy_overwrite_aligned(
				    dst_map, tmp_entry, copy,
				    base_addr, local_pmap)) != KERN_SUCCESS) {
				/* re-attach the remainder before failing */
				if (next_copy != NULL) {
					copy->cpy_hdr.nentries +=
					    remaining_entries;
					copy->cpy_hdr.links.prev->vme_next =
					    next_copy;
					copy->cpy_hdr.links.prev =
					    previous_prev;
					copy->size += copy_size;
				}
				return kr;
			}
			vm_map_unlock(dst_map);
		} else {
			/*
			 * Performance gain:
			 *
			 * if the copy and dst address are misaligned but the same
			 * offset within the page we can copy_not_aligned the
			 * misaligned parts and copy aligned the rest.  If they are
			 * aligned but len is unaligned we simply need to copy
			 * the end bit unaligned.  We'll need to split the misaligned
			 * bits of the region in this case !
			 */
			/* ALWAYS UNLOCKS THE dst_map MAP */
			kr = vm_map_copy_overwrite_unaligned(
				dst_map,
				tmp_entry,
				copy,
				base_addr,
				discard_on_success);
			if (kr != KERN_SUCCESS) {
				/* re-attach the remainder before failing */
				if (next_copy != NULL) {
					copy->cpy_hdr.nentries +=
					    remaining_entries;
					copy->cpy_hdr.links.prev->vme_next =
					    next_copy;
					copy->cpy_hdr.links.prev =
					    previous_prev;
					copy->size += copy_size;
				}
				return kr;
			}
		}
		total_size -= copy_size;
		if (total_size == 0) {
			break;
		}
		base_addr += copy_size;
		copy_size = 0;
		copy->offset = new_offset;
		/* restore the remainder of the copy for the next chunk */
		if (next_copy != NULL) {
			copy->cpy_hdr.nentries = remaining_entries;
			copy->cpy_hdr.links.next = next_copy;
			copy->cpy_hdr.links.prev = previous_prev;
			next_copy->vme_prev = vm_map_copy_to_entry(copy);
			copy->size = total_size;
		}
		vm_map_lock(dst_map);
		while (TRUE) {
			if (!vm_map_lookup_entry(dst_map,
			    base_addr, &tmp_entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			if (tmp_entry->in_transition) {
				/*
				 * NOTE(review): this sets needs_wakeup on
				 * "entry" although it is "tmp_entry" that is
				 * in transition -- looks suspicious; confirm
				 * the intended target of the wakeup.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);
			} else {
				break;
			}
		}
		vm_map_clip_start(dst_map,
		    tmp_entry,
		    vm_map_trunc_page(base_addr,
		    VM_MAP_PAGE_MASK(dst_map)));

		entry = tmp_entry;
	} /* while */

	/*
	 *	Throw away the vm_map_copy object
	 */
	if (discard_on_success) {
		vm_map_copy_discard(copy);
	}

	return KERN_SUCCESS;
}/* vm_map_copy_overwrite */
9415 
/*
 *	vm_map_copy_overwrite:
 *
 *	Public entry point.  When the copy is an entry list, the call
 *	is not interruptible, and the region is large enough to be
 *	worth optimizing, the operation is split into up to three
 *	parts: an unaligned "head", a page-aligned middle, and an
 *	unaligned "tail", so the middle can go through the aligned
 *	(entry-replacement) path of vm_map_copy_overwrite_nested().
 *	Otherwise the whole range is handled by a single "blunt"
 *	nested call.
 *
 *	On success, "copy" (and any head/tail fragments) is discarded;
 *	on failure the original copy map is re-assembled from the
 *	fragments so the caller still owns a valid "copy".
 *
 *	"copy_size" is the size the caller validated; it is asserted
 *	against copy->size to defend against TOCTOU changes to the
 *	copy object (see comment near the end).
 */
kern_return_t
vm_map_copy_overwrite(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_copy_t   copy,
	vm_map_size_t   copy_size,
	boolean_t       interruptible)
{
	vm_map_size_t   head_size, tail_size;
	vm_map_copy_t   head_copy, tail_copy;
	vm_map_offset_t head_addr, tail_addr;
	vm_map_entry_t  entry;
	kern_return_t   kr;
	vm_map_offset_t effective_page_mask, effective_page_size;
	uint16_t        copy_page_shift;

	head_size = 0;
	tail_size = 0;
	head_copy = NULL;
	tail_copy = NULL;
	head_addr = 0;
	tail_addr = 0;

	/*
	 *	Check for null copy object.
	 */
	if (copy == VM_MAP_COPY_NULL) {
		return KERN_SUCCESS;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	if (interruptible ||
	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * We can't split the "copy" map if we're interruptible
		 * or if we don't have a "copy" map...
		 */
blunt_copy:
		return vm_map_copy_overwrite_nested(dst_map,
		           dst_addr,
		           copy,
		           interruptible,
		           (pmap_t) NULL,
		           TRUE);
	}

	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
	if (copy_page_shift < PAGE_SHIFT ||
	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		goto blunt_copy;
	}

	/*
	 * NOTE(review): the "true" branch below appears unreachable --
	 * the check just above already jumped to blunt_copy whenever
	 * VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT.  Confirm intent.
	 */
	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
	} else {
		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
		    effective_page_mask);
	}
	effective_page_size = effective_page_mask + 1;

	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
		/*
		 * Too small to bother with optimizing...
		 */
		goto blunt_copy;
	}

	if ((dst_addr & effective_page_mask) !=
	    (copy->offset & effective_page_mask)) {
		/*
		 * Incompatible mis-alignment of source and destination...
		 */
		goto blunt_copy;
	}

	/*
	 * Proper alignment or identical mis-alignment at the beginning.
	 * Let's try and do a small unaligned copy first (if needed)
	 * and then an aligned copy for the rest.
	 */
	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
		head_addr = dst_addr;
		head_size = (effective_page_size -
		    (copy->offset & effective_page_mask));
		head_size = MIN(head_size, copy_size);
	}
	if (!vm_map_page_aligned(copy->offset + copy_size,
	    effective_page_mask)) {
		/*
		 * Mis-alignment at the end.
		 * Do an aligned copy up to the last page and
		 * then an unaligned copy for the remaining bytes.
		 */
		tail_size = ((copy->offset + copy_size) &
		    effective_page_mask);
		tail_size = MIN(tail_size, copy_size);
		tail_addr = dst_addr + copy_size - tail_size;
		assert(tail_addr >= head_addr + head_size);
	}
	assert(head_size + tail_size <= copy_size);

	if (head_size + tail_size == copy_size) {
		/*
		 * It's all unaligned, no optimization possible...
		 */
		goto blunt_copy;
	}

	/*
	 * Can't optimize if there are any submaps in the
	 * destination due to the way we free the "copy" map
	 * progressively in vm_map_copy_overwrite_nested()
	 * in that case.
	 */
	vm_map_lock_read(dst_map);
	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
		vm_map_unlock_read(dst_map);
		goto blunt_copy;
	}
	/*
	 * NOTE(review): the sentinel compared here is the copy map's
	 * (vm_map_copy_to_entry(copy)), not vm_map_to_entry(dst_map),
	 * even though "entry" walks dst_map; the vme_start bound
	 * terminates the loop in practice -- confirm intent.
	 */
	for (;
	    (entry != vm_map_copy_to_entry(copy) &&
	    entry->vme_start < dst_addr + copy_size);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			vm_map_unlock_read(dst_map);
			goto blunt_copy;
		}
	}
	vm_map_unlock_read(dst_map);

	if (head_size) {
		/*
		 * Unaligned copy of the first "head_size" bytes, to reach
		 * a page boundary.
		 */

		/*
		 * Extract "head_copy" out of "copy".
		 */
		head_copy = vm_map_copy_allocate();
		head_copy->type = VM_MAP_COPY_ENTRY_LIST;
		head_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		vm_map_store_init(&head_copy->cpy_hdr);
		head_copy->cpy_hdr.page_shift = copy_page_shift;

		entry = vm_map_copy_first_entry(copy);
		if (entry->vme_end < copy->offset + head_size) {
			head_size = entry->vme_end - copy->offset;
		}

		head_copy->offset = copy->offset;
		head_copy->size = head_size;
		copy->offset += head_size;
		copy->size -= head_size;
		copy_size -= head_size;
		assert(copy_size > 0);

		vm_map_copy_clip_end(copy, entry, copy->offset);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(head_copy,
		    vm_map_copy_to_entry(head_copy),
		    entry);

		/*
		 * Do the unaligned copy.
		 */
		kr = vm_map_copy_overwrite_nested(dst_map,
		    head_addr,
		    head_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr != KERN_SUCCESS) {
			goto done;
		}
	}

	if (tail_size) {
		/*
		 * Extract "tail_copy" out of "copy".
		 */
		tail_copy = vm_map_copy_allocate();
		tail_copy->type = VM_MAP_COPY_ENTRY_LIST;
		tail_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		vm_map_store_init(&tail_copy->cpy_hdr);
		tail_copy->cpy_hdr.page_shift = copy_page_shift;

		tail_copy->offset = copy->offset + copy_size - tail_size;
		tail_copy->size = tail_size;

		copy->size -= tail_size;
		copy_size -= tail_size;
		assert(copy_size > 0);

		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(tail_copy,
		    vm_map_copy_last_entry(tail_copy),
		    entry);
	}

	/*
	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
	 * we want to avoid TOCTOU issues w.r.t copy->size but
	 * we don't need to change vm_map_copy_overwrite_nested()
	 * and all other vm_map_copy_overwrite variants.
	 *
	 * So we assign the original copy_size that was passed into
	 * this routine back to copy.
	 *
	 * This use of local 'copy_size' passed into this routine is
	 * to try and protect against TOCTOU attacks where the kernel
	 * has been exploited. We don't expect this to be an issue
	 * during normal system operation.
	 */
	assertf(copy->size == copy_size,
	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
	copy->size = copy_size;

	/*
	 * Copy most (or possibly all) of the data.
	 */
	kr = vm_map_copy_overwrite_nested(dst_map,
	    dst_addr + head_size,
	    copy,
	    interruptible,
	    (pmap_t) NULL,
	    FALSE);
	if (kr != KERN_SUCCESS) {
		goto done;
	}

	if (tail_size) {
		kr = vm_map_copy_overwrite_nested(dst_map,
		    tail_addr,
		    tail_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
	}

done:
	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
	if (kr == KERN_SUCCESS) {
		/*
		 * Discard all the copy maps.
		 */
		if (head_copy) {
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		vm_map_copy_discard(copy);
		if (tail_copy) {
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	} else {
		/*
		 * Re-assemble the original copy map.
		 */
		if (head_copy) {
			entry = vm_map_copy_first_entry(head_copy);
			vm_map_copy_entry_unlink(head_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_to_entry(copy),
			    entry);
			copy->offset -= head_size;
			copy->size += head_size;
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		if (tail_copy) {
			entry = vm_map_copy_last_entry(tail_copy);
			vm_map_copy_entry_unlink(tail_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    entry);
			copy->size += tail_size;
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	}
	return kr;
}
9710 
9711 
9712 /*
9713  *	Routine: vm_map_copy_overwrite_unaligned	[internal use only]
9714  *
 *	Description:
9716  *	Physically copy unaligned data
9717  *
9718  *	Implementation:
9719  *	Unaligned parts of pages have to be physically copied.  We use
 *	a modified form of vm_fault_copy (which understands non-aligned
 *	page offsets and sizes) to do the copy.  We attempt to copy as
 *	much memory in one go as possible, however vm_fault_copy copies
9723  *	within 1 memory object so we have to find the smaller of "amount left"
9724  *	"source object data size" and "target object data size".  With
9725  *	unaligned data we don't need to split regions, therefore the source
9726  *	(copy) object should be one map entry, the target range may be split
9727  *	over multiple map entries however.  In any event we are pessimistic
9728  *	about these assumptions.
9729  *
9730  *	Callers of this function must call vm_map_copy_require on
9731  *	previously created vm_map_copy_t or pass a newly created
9732  *	one to ensure that it hasn't been forged.
9733  *
9734  *	Assumptions:
9735  *	dst_map is locked on entry and is return locked on success,
9736  *	unlocked on error.
9737  */
9738 
static kern_return_t
vm_map_copy_overwrite_unaligned(
	vm_map_t        dst_map,
	vm_map_entry_t  entry,
	vm_map_copy_t   copy,
	vm_map_offset_t start,
	boolean_t       discard_on_success)
{
	vm_map_entry_t          copy_entry;
	vm_map_entry_t          copy_entry_next;
	vm_map_version_t        version;
	vm_object_t             dst_object;
	vm_object_offset_t      dst_offset;
	vm_object_offset_t      src_offset;
	vm_object_offset_t      entry_offset;
	vm_map_offset_t         entry_end;
	vm_map_size_t           src_size,
	    dst_size,
	    copy_size,
	    amount_left;
	kern_return_t           kr = KERN_SUCCESS;


	copy_entry = vm_map_copy_first_entry(copy);

	/* Caller enters with dst_map write-locked; a read lock suffices here. */
	vm_map_lock_write_to_read(dst_map);

	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
	amount_left = copy->size;
/*
 *	unaligned so we never clipped this entry, we need the offset into
 *	the vm_object not just the data.  "src_offset" starts as the
 *	sub-page remainder of copy->offset within the first copy entry.
 */
	while (amount_left > 0) {
		if (entry == vm_map_to_entry(dst_map)) {
			/* ran off the end of the destination map: hole */
			vm_map_unlock_read(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/* "start" must be within the current map entry */
		assert((start >= entry->vme_start) && (start < entry->vme_end));

		dst_offset = start - entry->vme_start;

		dst_size = entry->vme_end - start;

		src_size = copy_entry->vme_end -
		    (copy_entry->vme_start + src_offset);

		if (dst_size < src_size) {
/*
 *			we can only copy dst_size bytes before
 *			we have to get the next destination entry
 */
			copy_size = dst_size;
		} else {
/*
 *			we can only copy src_size bytes before
 *			we have to get the next source copy entry
 */
			copy_size = src_size;
		}

		if (copy_size > amount_left) {
			copy_size = amount_left;
		}
/*
 *		Entry needs copy: create a shadow object for the
 *		copy-on-write region before writing through it.
 */
		if (entry->needs_copy &&
		    ((entry->protection & VM_PROT_WRITE) != 0)) {
			if (vm_map_lock_read_to_write(dst_map)) {
				/* lost the lock during upgrade: entry may be
				 * stale, re-lookup under a fresh read lock */
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			VME_OBJECT_SHADOW(entry,
			    (vm_map_size_t)(entry->vme_end
			    - entry->vme_start));
			entry->needs_copy = FALSE;
			vm_map_lock_write_to_read(dst_map);
		}
		dst_object = VME_OBJECT(entry);
/*
 *		unlike with the virtual (aligned) copy we're going
 *		to fault on it therefore we need a target object.
 */
		if (dst_object == VM_OBJECT_NULL) {
			if (vm_map_lock_read_to_write(dst_map)) {
				/* same upgrade race as above */
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			dst_object = vm_object_allocate((vm_map_size_t)
			    entry->vme_end - entry->vme_start);
			VME_OBJECT_SET(entry, dst_object);
			VME_OFFSET_SET(entry, 0);
			assert(entry->use_pmap);
			vm_map_lock_write_to_read(dst_map);
		}
/*
 *		Take an object reference and unlock map. The "entry" may
 *		disappear or change when the map is unlocked.  Snapshot
 *		everything we need from it first.
 */
		vm_object_reference(dst_object);
		version.main_timestamp = dst_map->timestamp;
		entry_offset = VME_OFFSET(entry);
		entry_end = entry->vme_end;
		vm_map_unlock_read(dst_map);
/*
 *		Copy as much as possible in one pass
 */
		kr = vm_fault_copy(
			VME_OBJECT(copy_entry),
			VME_OFFSET(copy_entry) + src_offset,
			&copy_size,
			dst_object,
			entry_offset + dst_offset,
			dst_map,
			&version,
			THREAD_UNINT );

		/* vm_fault_copy updated copy_size to what it actually moved */
		start += copy_size;
		src_offset += copy_size;
		amount_left -= copy_size;
/*
 *		Release the object reference
 */
		vm_object_deallocate(dst_object);
/*
 *		If a hard error occurred, return it now
 */
		if (kr != KERN_SUCCESS) {
			return kr;
		}

		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
		    || amount_left == 0) {
/*
 *			all done with this copy entry, dispose.
 */
			copy_entry_next = copy_entry->vme_next;

			if (discard_on_success) {
				vm_map_copy_entry_unlink(copy, copy_entry);
				assert(!copy_entry->is_sub_map);
				vm_object_deallocate(VME_OBJECT(copy_entry));
				vm_map_copy_entry_dispose(copy_entry);
			}

			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
			    amount_left) {
/*
 *				not finished copying but run out of source
 */
				return KERN_INVALID_ADDRESS;
			}

			copy_entry = copy_entry_next;

			src_offset = 0;
		}

		if (amount_left == 0) {
			return KERN_SUCCESS;
		}

		vm_map_lock_read(dst_map);
		if (version.main_timestamp == dst_map->timestamp) {
			if (start == entry_end) {
/*
 *				destination region is split.  Use the version
 *				information to avoid a lookup in the normal
 *				case.
 */
				entry = entry->vme_next;
/*
 *				should be contiguous. Fail if we encounter
 *				a hole in the destination.
 */
				if (start != entry->vme_start) {
					vm_map_unlock_read(dst_map);
					return KERN_INVALID_ADDRESS;
				}
			}
		} else {
/*
 *			Map version check failed.
 *			we must lookup the entry because somebody
 *			might have changed the map behind our backs.
 */
RetryLookup:
			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
				vm_map_unlock_read(dst_map);
				return KERN_INVALID_ADDRESS;
			}
		}
	}/* while */

	return KERN_SUCCESS;
}/* vm_map_copy_overwrite_unaligned */
9939 
9940 /*
9941  *	Routine: vm_map_copy_overwrite_aligned	[internal use only]
9942  *
9943  *	Description:
9944  *	Does all the vm_trickery possible for whole pages.
9945  *
9946  *	Implementation:
9947  *
9948  *	If there are no permanent objects in the destination,
9949  *	and the source and destination map entry zones match,
9950  *	and the destination map entry is not shared,
9951  *	then the map entries can be deleted and replaced
9952  *	with those from the copy.  The following code is the
9953  *	basic idea of what to do, but there are lots of annoying
9954  *	little details about getting protection and inheritance
9955  *	right.  Should add protection, inheritance, and sharing checks
9956  *	to the above pass and make sure that no wiring is involved.
9957  *
9958  *	Callers of this function must call vm_map_copy_require on
9959  *	previously created vm_map_copy_t or pass a newly created
9960  *	one to ensure that it hasn't been forged.
9961  */
9962 
/*
 * Statistics: number of times vm_map_copy_overwrite_aligned() abandoned
 * the optimized entry-substitution path, broken down by reason.
 */
int vm_map_copy_overwrite_aligned_src_not_internal = 0;
int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
int vm_map_copy_overwrite_aligned_src_large = 0;
9966 
static kern_return_t
vm_map_copy_overwrite_aligned(
	vm_map_t        dst_map,
	vm_map_entry_t  tmp_entry,
	vm_map_copy_t   copy,
	vm_map_offset_t start,
	__unused pmap_t pmap)
{
	vm_object_t     object;
	vm_map_entry_t  copy_entry;
	vm_map_size_t   copy_size;
	vm_map_size_t   size;
	vm_map_entry_t  entry;

	/* Consume the copy's entries front-to-back until the chain is empty. */
	while ((copy_entry = vm_map_copy_first_entry(copy))
	    != vm_map_copy_to_entry(copy)) {
		copy_size = (copy_entry->vme_end - copy_entry->vme_start);

		entry = tmp_entry;
		if (entry->is_sub_map) {
			/* unnested when clipped earlier */
			assert(!entry->use_pmap);
		}
		if (entry == vm_map_to_entry(dst_map)) {
			/* destination exhausted before the copy was */
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}
		size = (entry->vme_end - entry->vme_start);
		/*
		 *	Make sure that no holes popped up in the
		 *	address map, and that the protection is
		 *	still valid, in case the map was unlocked
		 *	earlier.
		 */

		if ((entry->vme_start != start) || ((entry->is_sub_map)
		    && !entry->needs_copy)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}
		assert(entry != vm_map_to_entry(dst_map));

		/*
		 *	Check protection again
		 */

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 *	Adjust to source size first: clip the destination
		 *	entry so it is no larger than this copy entry.
		 */

		if (copy_size < size) {
			if (entry->map_aligned &&
			    !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
			    VM_MAP_PAGE_MASK(dst_map))) {
				/* no longer map-aligned */
				entry->map_aligned = FALSE;
			}
			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
			size = copy_size;
		}

		/*
		 *	Adjust to destination size: clip the copy entry so it
		 *	is no larger than the destination entry.
		 */

		if (size < copy_size) {
			vm_map_copy_clip_end(copy, copy_entry,
			    copy_entry->vme_start + size);
			copy_size = size;
		}

		/* after the two clips, both entries cover exactly "size" */
		assert((entry->vme_end - entry->vme_start) == size);
		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
		assert((copy_entry->vme_end - copy_entry->vme_start) == size);

		/*
		 *	If the destination contains temporary unshared memory,
		 *	we can perform the copy by throwing it away and
		 *	installing the source data.
		 */

		object = VME_OBJECT(entry);
		if ((!entry->is_shared &&
		    ((object == VM_OBJECT_NULL) ||
		    (object->internal && !object->true_share))) ||
		    entry->needs_copy) {
			vm_object_t     old_object = VME_OBJECT(entry);
			vm_object_offset_t      old_offset = VME_OFFSET(entry);
			vm_object_offset_t      offset;

			/*
			 * Ensure that the source and destination aren't
			 * identical: if they are, the copy is a no-op and
			 * we just drop this copy entry and move on.
			 */
			if (old_object == VME_OBJECT(copy_entry) &&
			    old_offset == VME_OFFSET(copy_entry)) {
				vm_map_copy_entry_unlink(copy, copy_entry);
				vm_map_copy_entry_dispose(copy_entry);

				if (old_object != VM_OBJECT_NULL) {
					vm_object_deallocate(old_object);
				}

				start = tmp_entry->vme_end;
				tmp_entry = tmp_entry->vme_next;
				continue;
			}

#if XNU_TARGET_OS_OSX
#define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
#define __TRADEOFF1_COPY_SIZE (128 * 1024)      /* 128 KB */
			if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
			    VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
			    copy_size <= __TRADEOFF1_COPY_SIZE) {
				/*
				 * Virtual vs. Physical copy tradeoff #1.
				 *
				 * Copying only a few pages out of a large
				 * object:  do a physical copy instead of
				 * a virtual copy, to avoid possibly keeping
				 * the entire large object alive because of
				 * those few copy-on-write pages.
				 */
				vm_map_copy_overwrite_aligned_src_large++;
				goto slow_copy;
			}
#endif /* XNU_TARGET_OS_OSX */

			if ((dst_map->pmap != kernel_pmap) &&
			    (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
			    (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
				vm_object_t new_object, new_shadow;

				/*
				 * We're about to map something over a mapping
				 * established by malloc()...  Walk the source
				 * object's shadow chain to check that it is
				 * all anonymous (internal) memory before we
				 * allow the substitution.
				 */
				new_object = VME_OBJECT(copy_entry);
				if (new_object != VM_OBJECT_NULL) {
					vm_object_lock_shared(new_object);
				}
				while (new_object != VM_OBJECT_NULL &&
#if XNU_TARGET_OS_OSX
				    !new_object->true_share &&
				    new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
#endif /* XNU_TARGET_OS_OSX */
				    new_object->internal) {
					new_shadow = new_object->shadow;
					if (new_shadow == VM_OBJECT_NULL) {
						break;
					}
					/* lock the shadow before releasing its shadower */
					vm_object_lock_shared(new_shadow);
					vm_object_unlock(new_object);
					new_object = new_shadow;
				}
				if (new_object != VM_OBJECT_NULL) {
					if (!new_object->internal) {
						/*
						 * The new mapping is backed
						 * by an external object.  We
						 * don't want malloc'ed memory
						 * to be replaced with such a
						 * non-anonymous mapping, so
						 * let's go off the optimized
						 * path...
						 */
						vm_map_copy_overwrite_aligned_src_not_internal++;
						vm_object_unlock(new_object);
						goto slow_copy;
					}
#if XNU_TARGET_OS_OSX
					if (new_object->true_share ||
					    new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
						/*
						 * Same if there's a "true_share"
						 * object in the shadow chain, or
						 * an object with a non-default
						 * (SYMMETRIC) copy strategy.
						 */
						vm_map_copy_overwrite_aligned_src_not_symmetric++;
						vm_object_unlock(new_object);
						goto slow_copy;
					}
#endif /* XNU_TARGET_OS_OSX */
					vm_object_unlock(new_object);
				}
				/*
				 * The new mapping is still backed by
				 * anonymous (internal) memory, so it's
				 * OK to substitute it for the original
				 * malloc() mapping.
				 */
			}

			/*
			 * Tear down the old backing: unnest/clean submaps or
			 * remove the old object's pmap mappings, then drop
			 * our reference on whatever backed the entry.
			 */
			if (old_object != VM_OBJECT_NULL) {
				if (entry->is_sub_map) {
					if (entry->use_pmap) {
#ifndef NO_NESTED_PMAP
						pmap_unnest(dst_map->pmap,
						    (addr64_t)entry->vme_start,
						    entry->vme_end - entry->vme_start);
#endif  /* NO_NESTED_PMAP */
						if (dst_map->mapped_in_other_pmaps) {
							/* clean up parent */
							/* map/maps */
							vm_map_submap_pmap_clean(
								dst_map, entry->vme_start,
								entry->vme_end,
								VME_SUBMAP(entry),
								VME_OFFSET(entry));
						}
					} else {
						vm_map_submap_pmap_clean(
							dst_map, entry->vme_start,
							entry->vme_end,
							VME_SUBMAP(entry),
							VME_OFFSET(entry));
					}
					vm_map_deallocate(VME_SUBMAP(entry));
				} else {
					if (dst_map->mapped_in_other_pmaps) {
						vm_object_pmap_protect_options(
							VME_OBJECT(entry),
							VME_OFFSET(entry),
							entry->vme_end
							- entry->vme_start,
							PMAP_NULL,
							PAGE_SIZE,
							entry->vme_start,
							VM_PROT_NONE,
							PMAP_OPTIONS_REMOVE);
					} else {
						pmap_remove_options(
							dst_map->pmap,
							(addr64_t)(entry->vme_start),
							(addr64_t)(entry->vme_end),
							PMAP_OPTIONS_REMOVE);
					}
					vm_object_deallocate(old_object);
				}
			}

			if (entry->iokit_acct) {
				/* keep using iokit accounting */
				entry->use_pmap = FALSE;
			} else {
				/* use pmap accounting */
				entry->use_pmap = TRUE;
			}
			/* install the copy entry's object in the destination entry */
			entry->is_sub_map = FALSE;
			VME_OBJECT_SET(entry, VME_OBJECT(copy_entry));
			object = VME_OBJECT(entry);
			entry->needs_copy = copy_entry->needs_copy;
			entry->wired_count = 0;
			entry->user_wired_count = 0;
			offset = VME_OFFSET(copy_entry);
			VME_OFFSET_SET(entry, offset);

			/* the copy entry's object reference moved to "entry" */
			vm_map_copy_entry_unlink(copy, copy_entry);
			vm_map_copy_entry_dispose(copy_entry);

			/*
			 * we could try to push pages into the pmap at this point, BUT
			 * this optimization only saved on average 2 us per page if ALL
			 * the pages in the source were currently mapped
			 * and ALL the pages in the dest were touched, if there were fewer
			 * than 2/3 of the pages touched, this optimization actually cost more cycles
			 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
			 */

			/*
			 *	Set up for the next iteration.  The map
			 *	has not been unlocked, so the next
			 *	address should be at the end of this
			 *	entry, and the next map entry should be
			 *	the one following it.
			 */

			start = tmp_entry->vme_end;
			tmp_entry = tmp_entry->vme_next;
		} else {
			vm_map_version_t        version;
			vm_object_t             dst_object;
			vm_object_offset_t      dst_offset;
			kern_return_t           r;

slow_copy:
			/* physical (page-by-page) copy fallback */
			if (entry->needs_copy) {
				VME_OBJECT_SHADOW(entry,
				    (entry->vme_end -
				    entry->vme_start));
				entry->needs_copy = FALSE;
			}

			dst_object = VME_OBJECT(entry);
			dst_offset = VME_OFFSET(entry);

			/*
			 *	Take an object reference, and record
			 *	the map version information so that the
			 *	map can be safely unlocked.
			 */

			if (dst_object == VM_OBJECT_NULL) {
				/*
				 * We would usually have just taken the
				 * optimized path above if the destination
				 * object has not been allocated yet.  But we
				 * now disable that optimization if the copy
				 * entry's object is not backed by anonymous
				 * memory to avoid replacing malloc'ed
				 * (i.e. re-usable) anonymous memory with a
				 * not-so-anonymous mapping.
				 * So we have to handle this case here and
				 * allocate a new VM object for this map entry.
				 */
				dst_object = vm_object_allocate(
					entry->vme_end - entry->vme_start);
				dst_offset = 0;
				VME_OBJECT_SET(entry, dst_object);
				VME_OFFSET_SET(entry, dst_offset);
				assert(entry->use_pmap);
			}

			vm_object_reference(dst_object);

			/* account for unlock bumping up timestamp */
			version.main_timestamp = dst_map->timestamp + 1;

			vm_map_unlock(dst_map);

			/*
			 *	Copy as much as possible in one pass
			 */

			copy_size = size;
			r = vm_fault_copy(
				VME_OBJECT(copy_entry),
				VME_OFFSET(copy_entry),
				&copy_size,
				dst_object,
				dst_offset,
				dst_map,
				&version,
				THREAD_UNINT );

			/*
			 *	Release the object reference
			 */

			vm_object_deallocate(dst_object);

			/*
			 *	If a hard error occurred, return it now
			 */

			if (r != KERN_SUCCESS) {
				return r;
			}

			if (copy_size != 0) {
				/*
				 *	Dispose of the copied region
				 *	(vm_fault_copy may have moved fewer
				 *	bytes than requested; only that much
				 *	of the copy entry is consumed).
				 */

				vm_map_copy_clip_end(copy, copy_entry,
				    copy_entry->vme_start + copy_size);
				vm_map_copy_entry_unlink(copy, copy_entry);
				vm_object_deallocate(VME_OBJECT(copy_entry));
				vm_map_copy_entry_dispose(copy_entry);
			}

			/*
			 *	Pick up in the destination map where we left off.
			 *
			 *	Use the version information to avoid a lookup
			 *	in the normal case.
			 */

			start += copy_size;
			vm_map_lock(dst_map);
			if (version.main_timestamp == dst_map->timestamp &&
			    copy_size != 0) {
				/* We can safely use saved tmp_entry value */

				if (tmp_entry->map_aligned &&
				    !VM_MAP_PAGE_ALIGNED(
					    start,
					    VM_MAP_PAGE_MASK(dst_map))) {
					/* no longer map-aligned */
					tmp_entry->map_aligned = FALSE;
				}
				vm_map_clip_end(dst_map, tmp_entry, start);
				tmp_entry = tmp_entry->vme_next;
			} else {
				/* Must do lookup of tmp_entry */

				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
					vm_map_unlock(dst_map);
					return KERN_INVALID_ADDRESS;
				}
				if (tmp_entry->map_aligned &&
				    !VM_MAP_PAGE_ALIGNED(
					    start,
					    VM_MAP_PAGE_MASK(dst_map))) {
					/* no longer map-aligned */
					tmp_entry->map_aligned = FALSE;
				}
				vm_map_clip_start(dst_map, tmp_entry, start);
			}
		}
	}/* while */

	return KERN_SUCCESS;
}/* vm_map_copy_overwrite_aligned */
10392 
10393 /*
10394  *	Routine: vm_map_copyin_kernel_buffer [internal use only]
10395  *
10396  *	Description:
10397  *		Copy in data to a kernel buffer from space in the
10398  *		source map. The original space may be optionally
10399  *		deallocated.
10400  *
10401  *		If successful, returns a new copy object.
10402  */
10403 static kern_return_t
vm_map_copyin_kernel_buffer(vm_map_t src_map,vm_map_offset_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)10404 vm_map_copyin_kernel_buffer(
10405 	vm_map_t        src_map,
10406 	vm_map_offset_t src_addr,
10407 	vm_map_size_t   len,
10408 	boolean_t       src_destroy,
10409 	vm_map_copy_t   *copy_result)
10410 {
10411 	kern_return_t kr;
10412 	vm_map_copy_t copy;
10413 
10414 	if (len > msg_ool_size_small) {
10415 		return KERN_INVALID_ARGUMENT;
10416 	}
10417 
10418 	copy = zalloc_flags(vm_map_copy_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
10419 	copy->cpy_kdata = kalloc_data(len, Z_WAITOK);
10420 	if (copy->cpy_kdata == NULL) {
10421 		zfree(vm_map_copy_zone, copy);
10422 		return KERN_RESOURCE_SHORTAGE;
10423 	}
10424 
10425 	copy->type = VM_MAP_COPY_KERNEL_BUFFER;
10426 	copy->size = len;
10427 	copy->offset = 0;
10428 
10429 	kr = copyinmap(src_map, src_addr, copy->cpy_kdata, (vm_size_t)len);
10430 	if (kr != KERN_SUCCESS) {
10431 		kfree_data(copy->cpy_kdata, len);
10432 		zfree(vm_map_copy_zone, copy);
10433 		return kr;
10434 	}
10435 
10436 	if (src_destroy) {
10437 		vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
10438 
10439 		if (src_map == kernel_map) {
10440 			flags |= VM_MAP_REMOVE_KUNWIRE;
10441 		}
10442 
10443 		(void)vm_map_remove_flags(src_map,
10444 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
10445 		    vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
10446 		    flags);
10447 	}
10448 
10449 	*copy_result = copy;
10450 	return KERN_SUCCESS;
10451 }
10452 
10453 /*
10454  *	Routine: vm_map_copyout_kernel_buffer	[internal use only]
10455  *
10456  *	Description:
10457  *		Copy out data from a kernel buffer into space in the
 *		destination map. The space may be optionally dynamically
10459  *		allocated.
10460  *
10461  *		If successful, consumes the copy object.
10462  *		Otherwise, the caller is responsible for it.
10463  *
10464  *		Callers of this function must call vm_map_copy_require on
10465  *		previously created vm_map_copy_t or pass a newly created
10466  *		one to ensure that it hasn't been forged.
10467  */
/* Number of copyout() failures seen on the cross-map path below. */
static int vm_map_copyout_kernel_buffer_failures = 0;
static kern_return_t
vm_map_copyout_kernel_buffer(
	vm_map_t                map,
	vm_map_address_t        *addr,  /* IN/OUT */
	vm_map_copy_t           copy,
	vm_map_size_t           copy_size,
	boolean_t               overwrite,
	boolean_t               consume_on_success)
{
	kern_return_t kr = KERN_SUCCESS;
	thread_t thread = current_thread();

	assert(copy->size == copy_size);

	/*
	 * check for corrupted vm_map_copy structure
	 */
	if (copy_size > msg_ool_size_small || copy->offset) {
		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
		    (long long)copy->size, (long long)copy->offset);
	}

	if (!overwrite) {
		/*
		 * Allocate space in the target map for the data
		 */
		*addr = 0;
		kr = vm_map_enter(map,
		    addr,
		    vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(map)),
		    (vm_map_offset_t) 0,
		    VM_FLAGS_ANYWHERE,
		    VM_MAP_KERNEL_FLAGS_NONE,
		    VM_KERN_MEMORY_NONE,
		    VM_OBJECT_NULL,
		    (vm_object_offset_t) 0,
		    FALSE,
		    VM_PROT_DEFAULT,
		    VM_PROT_ALL,
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
#if KASAN
		if (map->pmap == kernel_pmap) {
			kasan_notify_address(*addr, copy->size);
		}
#endif
	}

	/*
	 * Copyout the data from the kernel buffer to the target map.
	 */
	if (thread->map == map) {
		/*
		 * If the target map is the current map, just do
		 * the copy.
		 */
		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_t oldmap;

		/*
		 * If the target map is another map, assume the
		 * target's address space identity for the duration
		 * of the copy.
		 */
		vm_map_reference(map);
		oldmap = vm_map_switch(map);

		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			vm_map_copyout_kernel_buffer_failures++;
			kr = KERN_INVALID_ADDRESS;
		}

		/* restore the original address space identity */
		(void) vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}

	if (kr != KERN_SUCCESS) {
		/* the copy failed, clean up */
		if (!overwrite) {
			/*
			 * Deallocate the space we allocated in the target map.
			 */
			vm_map_remove(map,
			    vm_map_trunc_page(*addr,
			    VM_MAP_PAGE_MASK(map)),
			    vm_map_round_page((*addr +
			    vm_map_round_page(copy_size,
			    VM_MAP_PAGE_MASK(map))),
			    VM_MAP_PAGE_MASK(map)));
			*addr = 0;
		}
	} else {
		/* copy was successful, discard the copy structure */
		if (consume_on_success) {
			kfree_data(copy->cpy_kdata, copy_size);
			zfree(vm_map_copy_zone, copy);
		}
	}

	return kr;
}
10578 
10579 /*
10580  *	Routine:	vm_map_copy_insert      [internal use only]
10581  *
10582  *	Description:
10583  *		Link a copy chain ("copy") into a map at the
10584  *		specified location (after "where").
10585  *
10586  *		Callers of this function must call vm_map_copy_require on
10587  *		previously created vm_map_copy_t or pass a newly created
10588  *		one to ensure that it hasn't been forged.
10589  *	Side effects:
10590  *		The copy chain is destroyed.
10591  */
10592 static void
vm_map_copy_insert(vm_map_t map,vm_map_entry_t after_where,vm_map_copy_t copy)10593 vm_map_copy_insert(
10594 	vm_map_t        map,
10595 	vm_map_entry_t  after_where,
10596 	vm_map_copy_t   copy)
10597 {
10598 	vm_map_entry_t  entry;
10599 
10600 	while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
10601 		entry = vm_map_copy_first_entry(copy);
10602 		vm_map_copy_entry_unlink(copy, entry);
10603 		vm_map_store_entry_link(map, after_where, entry,
10604 		    VM_MAP_KERNEL_FLAGS_NONE);
10605 		after_where = entry;
10606 	}
10607 	zfree(vm_map_copy_zone, copy);
10608 }
10609 
10610 /*
10611  * Callers of this function must call vm_map_copy_require on
10612  * previously created vm_map_copy_t or pass a newly created
10613  * one to ensure that it hasn't been forged.
10614  */
10615 void
vm_map_copy_remap(vm_map_t map,vm_map_entry_t where,vm_map_copy_t copy,vm_map_offset_t adjustment,vm_prot_t cur_prot,vm_prot_t max_prot,vm_inherit_t inheritance)10616 vm_map_copy_remap(
10617 	vm_map_t        map,
10618 	vm_map_entry_t  where,
10619 	vm_map_copy_t   copy,
10620 	vm_map_offset_t adjustment,
10621 	vm_prot_t       cur_prot,
10622 	vm_prot_t       max_prot,
10623 	vm_inherit_t    inheritance)
10624 {
10625 	vm_map_entry_t  copy_entry, new_entry;
10626 
10627 	for (copy_entry = vm_map_copy_first_entry(copy);
10628 	    copy_entry != vm_map_copy_to_entry(copy);
10629 	    copy_entry = copy_entry->vme_next) {
10630 		/* get a new VM map entry for the map */
10631 		new_entry = vm_map_entry_create(map);
10632 		/* copy the "copy entry" to the new entry */
10633 		vm_map_entry_copy(map, new_entry, copy_entry);
10634 		/* adjust "start" and "end" */
10635 		new_entry->vme_start += adjustment;
10636 		new_entry->vme_end += adjustment;
10637 		/* clear some attributes */
10638 		new_entry->inheritance = inheritance;
10639 		new_entry->protection = cur_prot;
10640 		new_entry->max_protection = max_prot;
10641 		new_entry->behavior = VM_BEHAVIOR_DEFAULT;
10642 		/* take an extra reference on the entry's "object" */
10643 		if (new_entry->is_sub_map) {
10644 			assert(!new_entry->use_pmap); /* not nested */
10645 			vm_map_lock(VME_SUBMAP(new_entry));
10646 			vm_map_reference(VME_SUBMAP(new_entry));
10647 			vm_map_unlock(VME_SUBMAP(new_entry));
10648 		} else {
10649 			vm_object_reference(VME_OBJECT(new_entry));
10650 		}
10651 		/* insert the new entry in the map */
10652 		vm_map_store_entry_link(map, where, new_entry,
10653 		    VM_MAP_KERNEL_FLAGS_NONE);
10654 		/* continue inserting the "copy entries" after the new entry */
10655 		where = new_entry;
10656 	}
10657 }
10658 
10659 
10660 /*
10661  * Returns true if *size matches (or is in the range of) copy->size.
10662  * Upon returning true, the *size field is updated with the actual size of the
10663  * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
10664  */
10665 boolean_t
vm_map_copy_validate_size(vm_map_t dst_map,vm_map_copy_t copy,vm_map_size_t * size)10666 vm_map_copy_validate_size(
10667 	vm_map_t                dst_map,
10668 	vm_map_copy_t           copy,
10669 	vm_map_size_t           *size)
10670 {
10671 	if (copy == VM_MAP_COPY_NULL) {
10672 		return FALSE;
10673 	}
10674 
10675 	/*
10676 	 * Assert that the vm_map_copy is coming from the right
10677 	 * zone and hasn't been forged
10678 	 */
10679 	vm_map_copy_require(copy);
10680 
10681 	vm_map_size_t copy_sz = copy->size;
10682 	vm_map_size_t sz = *size;
10683 	switch (copy->type) {
10684 	case VM_MAP_COPY_OBJECT:
10685 	case VM_MAP_COPY_KERNEL_BUFFER:
10686 		if (sz == copy_sz) {
10687 			return TRUE;
10688 		}
10689 		break;
10690 	case VM_MAP_COPY_ENTRY_LIST:
10691 		/*
10692 		 * potential page-size rounding prevents us from exactly
10693 		 * validating this flavor of vm_map_copy, but we can at least
10694 		 * assert that it's within a range.
10695 		 */
10696 		if (copy_sz >= sz &&
10697 		    copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
10698 			*size = copy_sz;
10699 			return TRUE;
10700 		}
10701 		break;
10702 	default:
10703 		break;
10704 	}
10705 	return FALSE;
10706 }
10707 
10708 /*
10709  *	Routine:	vm_map_copyout_size
10710  *
10711  *	Description:
10712  *		Copy out a copy chain ("copy") into newly-allocated
10713  *		space in the destination map. Uses a prevalidated
10714  *		size for the copy object (vm_map_copy_validate_size).
10715  *
10716  *		If successful, consumes the copy object.
10717  *		Otherwise, the caller is responsible for it.
10718  */
10719 kern_return_t
vm_map_copyout_size(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_t copy_size)10720 vm_map_copyout_size(
10721 	vm_map_t                dst_map,
10722 	vm_map_address_t        *dst_addr,      /* OUT */
10723 	vm_map_copy_t           copy,
10724 	vm_map_size_t           copy_size)
10725 {
10726 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
10727 	           TRUE,                     /* consume_on_success */
10728 	           VM_PROT_DEFAULT,
10729 	           VM_PROT_ALL,
10730 	           VM_INHERIT_DEFAULT);
10731 }
10732 
10733 /*
10734  *	Routine:	vm_map_copyout
10735  *
10736  *	Description:
10737  *		Copy out a copy chain ("copy") into newly-allocated
10738  *		space in the destination map.
10739  *
10740  *		If successful, consumes the copy object.
10741  *		Otherwise, the caller is responsible for it.
10742  */
10743 kern_return_t
vm_map_copyout(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy)10744 vm_map_copyout(
10745 	vm_map_t                dst_map,
10746 	vm_map_address_t        *dst_addr,      /* OUT */
10747 	vm_map_copy_t           copy)
10748 {
10749 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
10750 	           TRUE,                     /* consume_on_success */
10751 	           VM_PROT_DEFAULT,
10752 	           VM_PROT_ALL,
10753 	           VM_INHERIT_DEFAULT);
10754 }
10755 
/*
 *	Routine:	vm_map_copyout_internal
 *
 *	Description:
 *		Copy out a copy chain ("copy") into newly-allocated
 *		space in the destination map, applying the given
 *		protections and inheritance.
 *
 *		If "consume_on_success" is TRUE, the copy object is
 *		consumed on success; otherwise its entries are cloned
 *		into the destination map (via vm_map_copy_remap) and
 *		the copy object remains the caller's responsibility.
 *
 *		On success, *dst_addr holds the address of the copied
 *		data in "dst_map".
 */
kern_return_t
vm_map_copyout_internal(
	vm_map_t                dst_map,
	vm_map_address_t        *dst_addr,      /* OUT */
	vm_map_copy_t           copy,
	vm_map_size_t           copy_size,
	boolean_t               consume_on_success,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance)
{
	vm_map_size_t           size;
	vm_map_size_t           adjustment;
	vm_map_offset_t         start;
	vm_object_offset_t      vm_copy_start;
	vm_map_entry_t          last;
	vm_map_entry_t          entry;
	vm_map_copy_t           original_copy;
	kern_return_t           kr;
	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;

	/*
	 *	Check for null copy object.
	 */

	if (copy == VM_MAP_COPY_NULL) {
		*dst_addr = 0;
		return KERN_SUCCESS;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/* the caller-supplied size must agree with the copy object */
	if (copy->size != copy_size) {
		*dst_addr = 0;
		return KERN_FAILURE;
	}

	/*
	 *	Check for special copy object, created
	 *	by vm_map_copyin_object.
	 */

	if (copy->type == VM_MAP_COPY_OBJECT) {
		vm_object_t             object = copy->cpy_object;
		vm_object_offset_t      offset;

		/* map the whole page range covering the (possibly
		 * unaligned) copy, then point *dst_addr inside it */
		offset = vm_object_trunc_page(copy->offset);
		size = vm_map_round_page((copy_size +
		    (vm_map_size_t)(copy->offset -
		    offset)),
		    VM_MAP_PAGE_MASK(dst_map));
		*dst_addr = 0;
		kr = vm_map_enter(dst_map, dst_addr, size,
		    (vm_map_offset_t) 0, VM_FLAGS_ANYWHERE,
		    VM_MAP_KERNEL_FLAGS_NONE,
		    VM_KERN_MEMORY_NONE,
		    object, offset, FALSE,
		    VM_PROT_DEFAULT, VM_PROT_ALL,
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		/* Account for non-pagealigned copy object */
		*dst_addr += (vm_map_offset_t)(copy->offset - offset);
		if (consume_on_success) {
			/* the object reference was donated to vm_map_enter;
			 * only the copy wrapper itself is freed here */
			zfree(vm_map_copy_zone, copy);
		}
		return KERN_SUCCESS;
	}

	/*
	 *	Check for special kernel buffer allocated
	 *	by new_ipc_kmsg_copyin.
	 */

	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		return vm_map_copyout_kernel_buffer(dst_map, dst_addr,
		           copy, copy_size, FALSE,
		           consume_on_success);
	}

	/*
	 * Remember the caller's copy object: the page-size adjustment
	 * below may replace "copy" with a newly built one, and the
	 * consume/no-consume paths at the end need to know which one
	 * to discard.
	 */
	original_copy = copy;
	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
		vm_map_copy_t target_copy;
		vm_map_offset_t overmap_start, overmap_end, trimmed_start;

		/* rebuild the copy with the destination map's page size */
		target_copy = VM_MAP_COPY_NULL;
		DEBUG4K_ADJUST("adjusting...\n");
		kr = vm_map_copy_adjust_to_target(
			copy,
			0, /* offset */
			copy->size, /* size */
			dst_map,
			TRUE, /* copy */
			&target_copy,
			&overmap_start,
			&overmap_end,
			&trimmed_start);
		if (kr != KERN_SUCCESS) {
			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
			return kr;
		}
		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
		if (target_copy != copy) {
			copy = target_copy;
		}
		copy_size = copy->size;
	}

	/*
	 *	Find space for the data
	 */

	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
	    VM_MAP_COPY_PAGE_MASK(copy));
	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
	    VM_MAP_COPY_PAGE_MASK(copy))
	    - vm_copy_start;


	/* kernel-map allocations are placed in the data range */
	if (dst_map == kernel_map) {
		vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
	}

	/* the map stays locked until the entries have been linked in */
	vm_map_lock(dst_map);
	kr = vm_map_locate_space(dst_map, size, 0, vmk_flags,
	    &start, &last);
	if (kr != KERN_SUCCESS) {
		vm_map_unlock(dst_map);
		return kr;
	}

	/* delta to rebase the copy's addresses to the chosen hole */
	adjustment = start - vm_copy_start;
	if (!consume_on_success) {
		/*
		 * We're not allowed to consume "copy", so we'll have to
		 * copy its map entries into the destination map below.
		 * No need to re-allocate map entries from the correct
		 * (pageable or not) zone, since we'll get new map entries
		 * during the transfer.
		 * We'll also adjust the map entries's "start" and "end"
		 * during the transfer, to keep "copy"'s entries consistent
		 * with its "offset".
		 */
		goto after_adjustments;
	}

	/*
	 *	Since we're going to just drop the map
	 *	entries from the copy into the destination
	 *	map, they must come from the same pool.
	 */

	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
		/*
		 * Mismatches occur when dealing with the default
		 * pager.
		 */
		vm_map_entry_t  next, new;

		/*
		 * Find the zone that the copies were allocated from
		 */

		entry = vm_map_copy_first_entry(copy);

		/*
		 * Reinitialize the copy so that vm_map_copy_entry_link
		 * will work.
		 */
		vm_map_store_copy_reset(copy, entry);
		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;

		/*
		 * Copy each entry.
		 */
		while (entry != vm_map_copy_to_entry(copy)) {
			new = vm_map_copy_entry_create(copy);
			vm_map_entry_copy_full(new, entry);
			new->vme_no_copy_on_read = FALSE;
			assert(!new->iokit_acct);
			if (new->is_sub_map) {
				/* clr address space specifics */
				new->use_pmap = FALSE;
			}
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    new);
			next = entry->vme_next;
			/* the old entry came from the wrong zone; drop it */
			vm_map_entry_dispose(entry);
			entry = next;
		}
	}

	/*
	 *	Adjust the addresses in the copy chain, and
	 *	reset the region attributes.
	 */

	for (entry = vm_map_copy_first_entry(copy);
	    entry != vm_map_copy_to_entry(copy);
	    entry = entry->vme_next) {
		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
			/*
			 * We're injecting this copy entry into a map that
			 * has the standard page alignment, so clear
			 * "map_aligned" (which might have been inherited
			 * from the original map entry).
			 */
			entry->map_aligned = FALSE;
		}

		entry->vme_start += adjustment;
		entry->vme_end += adjustment;

		if (entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
			    VM_MAP_PAGE_MASK(dst_map)));
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
			    VM_MAP_PAGE_MASK(dst_map)));
		}

		/* reset region attributes to the copyout defaults */
		entry->inheritance = VM_INHERIT_DEFAULT;
		entry->protection = VM_PROT_DEFAULT;
		entry->max_protection = VM_PROT_ALL;
		entry->behavior = VM_BEHAVIOR_DEFAULT;

		/*
		 * If the entry is now wired,
		 * map the pages into the destination map.
		 */
		if (entry->wired_count != 0) {
			vm_map_offset_t va;
			vm_object_offset_t       offset;
			vm_object_t object;
			vm_prot_t prot;
			int     type_of_fault;

			/* TODO4K would need to use actual page size */
			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);

			object = VME_OBJECT(entry);
			offset = VME_OFFSET(entry);
			va = entry->vme_start;

			pmap_pageable(dst_map->pmap,
			    entry->vme_start,
			    entry->vme_end,
			    TRUE);

			/* enter every page of the wired range into the pmap */
			while (va < entry->vme_end) {
				vm_page_t       m;
				struct vm_object_fault_info fault_info = {};

				/*
				 * Look up the page in the object.
				 * Assert that the page will be found in the
				 * top object:
				 * either
				 *	the object was newly created by
				 *	vm_object_copy_slowly, and has
				 *	copies of all of the pages from
				 *	the source object
				 * or
				 *	the object was moved from the old
				 *	map entry; because the old map
				 *	entry was wired, all of the pages
				 *	were in the top-level object.
				 *	(XXX not true if we wire pages for
				 *	 reading)
				 */
				vm_object_lock(object);

				m = vm_page_lookup(object, offset);
				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
				    m->vmp_absent) {
					panic("vm_map_copyout: wiring %p", m);
				}

				prot = entry->protection;

				if (override_nx(dst_map, VME_ALIAS(entry)) &&
				    prot) {
					prot |= VM_PROT_EXECUTE;
				}

				type_of_fault = DBG_CACHE_HIT_FAULT;

				fault_info.user_tag = VME_ALIAS(entry);
				fault_info.pmap_options = 0;
				if (entry->iokit_acct ||
				    (!entry->is_sub_map && !entry->use_pmap)) {
					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
				}

				vm_fault_enter(m,
				    dst_map->pmap,
				    va,
				    PAGE_SIZE, 0,
				    prot,
				    prot,
				    VM_PAGE_WIRED(m),
				    FALSE,            /* change_wiring */
				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
				    &fault_info,
				    NULL,             /* need_retry */
				    &type_of_fault);

				vm_object_unlock(object);

				offset += PAGE_SIZE_64;
				va += PAGE_SIZE;
			}
		}
	}

after_adjustments:

	/*
	 *	Correct the page alignment for the result
	 */

	*dst_addr = start + (copy->offset - vm_copy_start);

#if KASAN
	kasan_notify_address(*dst_addr, size);
#endif

	/*
	 *	Update the hints and the map size
	 */

	if (consume_on_success) {
		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
	} else {
		SAVE_HINT_MAP_WRITE(dst_map, last);
	}

	dst_map->size += size;

	/*
	 *	Link in the copy
	 */

	if (consume_on_success) {
		vm_map_copy_insert(dst_map, last, copy);
		if (copy != original_copy) {
			/*
			 * "copy" is the page-size-adjusted replacement;
			 * the caller's original copy is no longer needed.
			 */
			vm_map_copy_discard(original_copy);
			original_copy = VM_MAP_COPY_NULL;
		}
	} else {
		vm_map_copy_remap(dst_map, last, copy, adjustment,
		    cur_protection, max_protection,
		    inheritance);
		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
			/*
			 * Not consuming: discard the adjusted replacement
			 * and leave the caller with the original copy.
			 */
			vm_map_copy_discard(copy);
			copy = original_copy;
		}
	}


	vm_map_unlock(dst_map);

	/*
	 * XXX	If wiring_required, call vm_map_pageable
	 */

	return KERN_SUCCESS;
}
11129 
11130 /*
11131  *	Routine:	vm_map_copyin
11132  *
11133  *	Description:
11134  *		see vm_map_copyin_common.  Exported via Unsupported.exports.
11135  *
11136  */
11137 
11138 #undef vm_map_copyin
11139 
11140 kern_return_t
vm_map_copyin(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)11141 vm_map_copyin(
11142 	vm_map_t                        src_map,
11143 	vm_map_address_t        src_addr,
11144 	vm_map_size_t           len,
11145 	boolean_t                       src_destroy,
11146 	vm_map_copy_t           *copy_result)   /* OUT */
11147 {
11148 	return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11149 	           FALSE, copy_result, FALSE);
11150 }
11151 
11152 /*
11153  *	Routine:	vm_map_copyin_common
11154  *
11155  *	Description:
11156  *		Copy the specified region (src_addr, len) from the
11157  *		source address space (src_map), possibly removing
11158  *		the region from the source address space (src_destroy).
11159  *
11160  *	Returns:
11161  *		A vm_map_copy_t object (copy_result), suitable for
11162  *		insertion into another address space (using vm_map_copyout),
11163  *		copying over another address space region (using
11164  *		vm_map_copy_overwrite).  If the copy is unused, it
11165  *		should be destroyed (using vm_map_copy_discard).
11166  *
11167  *	In/out conditions:
11168  *		The source map should not be locked on entry.
11169  */
11170 
/*
 * One level of the parent-map stack used by vm_map_copyin_internal()
 * while descending through nested submaps; "next" links toward the
 * outermost (base) map.
 */
typedef struct submap_map {
	vm_map_t        parent_map;     /* map we descended from */
	vm_map_offset_t base_start;     /* copy range start in parent_map */
	vm_map_offset_t base_end;       /* copy range end in parent_map */
	vm_map_size_t   base_len;       /* length covered at this level */
	struct submap_map *next;        /* next (outer) level of the stack */
} submap_map_t;
11178 
11179 kern_return_t
vm_map_copyin_common(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,__unused boolean_t src_volatile,vm_map_copy_t * copy_result,boolean_t use_maxprot)11180 vm_map_copyin_common(
11181 	vm_map_t        src_map,
11182 	vm_map_address_t src_addr,
11183 	vm_map_size_t   len,
11184 	boolean_t       src_destroy,
11185 	__unused boolean_t      src_volatile,
11186 	vm_map_copy_t   *copy_result,   /* OUT */
11187 	boolean_t       use_maxprot)
11188 {
11189 	int flags;
11190 
11191 	flags = 0;
11192 	if (src_destroy) {
11193 		flags |= VM_MAP_COPYIN_SRC_DESTROY;
11194 	}
11195 	if (use_maxprot) {
11196 		flags |= VM_MAP_COPYIN_USE_MAXPROT;
11197 	}
11198 	return vm_map_copyin_internal(src_map,
11199 	           src_addr,
11200 	           len,
11201 	           flags,
11202 	           copy_result);
11203 }
11204 kern_return_t
vm_map_copyin_internal(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,int flags,vm_map_copy_t * copy_result)11205 vm_map_copyin_internal(
11206 	vm_map_t        src_map,
11207 	vm_map_address_t src_addr,
11208 	vm_map_size_t   len,
11209 	int             flags,
11210 	vm_map_copy_t   *copy_result)   /* OUT */
11211 {
11212 	vm_map_entry_t  tmp_entry;      /* Result of last map lookup --
11213 	                                 * in multi-level lookup, this
11214 	                                 * entry contains the actual
11215 	                                 * vm_object/offset.
11216 	                                 */
11217 	vm_map_entry_t  new_entry = VM_MAP_ENTRY_NULL;  /* Map entry for copy */
11218 
11219 	vm_map_offset_t src_start;      /* Start of current entry --
11220 	                                 * where copy is taking place now
11221 	                                 */
11222 	vm_map_offset_t src_end;        /* End of entire region to be
11223 	                                 * copied */
11224 	vm_map_offset_t src_base;
11225 	vm_map_t        base_map = src_map;
11226 	boolean_t       map_share = FALSE;
11227 	submap_map_t    *parent_maps = NULL;
11228 
11229 	vm_map_copy_t   copy;           /* Resulting copy */
11230 	vm_map_address_t copy_addr;
11231 	vm_map_size_t   copy_size;
11232 	boolean_t       src_destroy;
11233 	boolean_t       use_maxprot;
11234 	boolean_t       preserve_purgeable;
11235 	boolean_t       entry_was_shared;
11236 	vm_map_entry_t  saved_src_entry;
11237 
11238 	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11239 		return KERN_INVALID_ARGUMENT;
11240 	}
11241 
11242 	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
11243 	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
11244 	preserve_purgeable =
11245 	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
11246 
11247 	/*
11248 	 *	Check for copies of zero bytes.
11249 	 */
11250 
11251 	if (len == 0) {
11252 		*copy_result = VM_MAP_COPY_NULL;
11253 		return KERN_SUCCESS;
11254 	}
11255 
11256 	/*
11257 	 *	Check that the end address doesn't overflow
11258 	 */
11259 	src_end = src_addr + len;
11260 	if (src_end < src_addr) {
11261 		return KERN_INVALID_ADDRESS;
11262 	}
11263 
11264 	/*
11265 	 *	Compute (page aligned) start and end of region
11266 	 */
11267 	src_start = vm_map_trunc_page(src_addr,
11268 	    VM_MAP_PAGE_MASK(src_map));
11269 	src_end = vm_map_round_page(src_end,
11270 	    VM_MAP_PAGE_MASK(src_map));
11271 
11272 	/*
11273 	 * If the copy is sufficiently small, use a kernel buffer instead
11274 	 * of making a virtual copy.  The theory being that the cost of
11275 	 * setting up VM (and taking C-O-W faults) dominates the copy costs
11276 	 * for small regions.
11277 	 */
11278 	if ((len <= msg_ool_size_small) &&
11279 	    !use_maxprot &&
11280 	    !preserve_purgeable &&
11281 	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
11282 	    /*
11283 	     * Since the "msg_ool_size_small" threshold was increased and
11284 	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
11285 	     * address space limits, we revert to doing a virtual copy if the
11286 	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
11287 	     * of the commpage would now fail when it used to work.
11288 	     */
11289 	    (src_start >= vm_map_min(src_map) &&
11290 	    src_start < vm_map_max(src_map) &&
11291 	    src_end >= vm_map_min(src_map) &&
11292 	    src_end < vm_map_max(src_map))) {
11293 		return vm_map_copyin_kernel_buffer(src_map, src_addr, len,
11294 		           src_destroy, copy_result);
11295 	}
11296 
11297 	/*
11298 	 *	Allocate a header element for the list.
11299 	 *
11300 	 *	Use the start and end in the header to
11301 	 *	remember the endpoints prior to rounding.
11302 	 */
11303 
11304 	copy = vm_map_copy_allocate();
11305 	copy->type = VM_MAP_COPY_ENTRY_LIST;
11306 	copy->cpy_hdr.entries_pageable = TRUE;
11307 	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
11308 
11309 	vm_map_store_init( &(copy->cpy_hdr));
11310 
11311 	copy->offset = src_addr;
11312 	copy->size = len;
11313 
11314 	new_entry = vm_map_copy_entry_create(copy);
11315 
11316 #define RETURN(x)                                               \
11317 	MACRO_BEGIN                                             \
11318 	vm_map_unlock(src_map);                                 \
11319 	if(src_map != base_map)                                 \
11320 	        vm_map_deallocate(src_map);                     \
11321 	if (new_entry != VM_MAP_ENTRY_NULL)                     \
11322 	        vm_map_copy_entry_dispose(new_entry);           \
11323 	vm_map_copy_discard(copy);                              \
11324 	{                                                       \
11325 	        submap_map_t	*_ptr;                          \
11326                                                                 \
11327 	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
11328 	                parent_maps=parent_maps->next;          \
11329 	                if (_ptr->parent_map != base_map)       \
11330 	                        vm_map_deallocate(_ptr->parent_map);    \
11331 	                kfree_type(submap_map_t, _ptr);         \
11332 	        }                                               \
11333 	}                                                       \
11334 	MACRO_RETURN(x);                                        \
11335 	MACRO_END
11336 
11337 	/*
11338 	 *	Find the beginning of the region.
11339 	 */
11340 
11341 	vm_map_lock(src_map);
11342 
11343 	/*
11344 	 * Lookup the original "src_addr" rather than the truncated
11345 	 * "src_start", in case "src_start" falls in a non-map-aligned
11346 	 * map entry *before* the map entry that contains "src_addr"...
11347 	 */
11348 	if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) {
11349 		RETURN(KERN_INVALID_ADDRESS);
11350 	}
11351 	if (!tmp_entry->is_sub_map) {
11352 		/*
11353 		 * ... but clip to the map-rounded "src_start" rather than
11354 		 * "src_addr" to preserve map-alignment.  We'll adjust the
11355 		 * first copy entry at the end, if needed.
11356 		 */
11357 		vm_map_clip_start(src_map, tmp_entry, src_start);
11358 	}
11359 	if (src_start < tmp_entry->vme_start) {
11360 		/*
11361 		 * Move "src_start" up to the start of the
11362 		 * first map entry to copy.
11363 		 */
11364 		src_start = tmp_entry->vme_start;
11365 	}
11366 	/* set for later submap fix-up */
11367 	copy_addr = src_start;
11368 
11369 	/*
11370 	 *	Go through entries until we get to the end.
11371 	 */
11372 
11373 	while (TRUE) {
11374 		vm_map_entry_t  src_entry = tmp_entry;  /* Top-level entry */
11375 		vm_map_size_t   src_size;               /* Size of source
11376 		                                         * map entry (in both
11377 		                                         * maps)
11378 		                                         */
11379 
11380 		vm_object_t             src_object;     /* Object to copy */
11381 		vm_object_offset_t      src_offset;
11382 
11383 		vm_object_t             new_copy_object;/* vm_object_copy_* result */
11384 
11385 		boolean_t       src_needs_copy;         /* Should source map
11386 		                                         * be made read-only
11387 		                                         * for copy-on-write?
11388 		                                         */
11389 
11390 		boolean_t       new_entry_needs_copy;   /* Will new entry be COW? */
11391 
11392 		boolean_t       was_wired;              /* Was source wired? */
11393 		boolean_t       saved_used_for_jit;     /* Saved used_for_jit. */
11394 		vm_map_version_t version;               /* Version before locks
11395 		                                         * dropped to make copy
11396 		                                         */
11397 		kern_return_t   result;                 /* Return value from
11398 		                                         * copy_strategically.
11399 		                                         */
11400 		while (tmp_entry->is_sub_map) {
11401 			vm_map_size_t submap_len;
11402 			submap_map_t *ptr;
11403 
11404 			ptr = kalloc_type(submap_map_t, Z_WAITOK);
11405 			ptr->next = parent_maps;
11406 			parent_maps = ptr;
11407 			ptr->parent_map = src_map;
11408 			ptr->base_start = src_start;
11409 			ptr->base_end = src_end;
11410 			submap_len = tmp_entry->vme_end - src_start;
11411 			if (submap_len > (src_end - src_start)) {
11412 				submap_len = src_end - src_start;
11413 			}
11414 			ptr->base_len = submap_len;
11415 
11416 			src_start -= tmp_entry->vme_start;
11417 			src_start += VME_OFFSET(tmp_entry);
11418 			src_end = src_start + submap_len;
11419 			src_map = VME_SUBMAP(tmp_entry);
11420 			vm_map_lock(src_map);
11421 			/* keep an outstanding reference for all maps in */
11422 			/* the parents tree except the base map */
11423 			vm_map_reference(src_map);
11424 			vm_map_unlock(ptr->parent_map);
11425 			if (!vm_map_lookup_entry(
11426 				    src_map, src_start, &tmp_entry)) {
11427 				RETURN(KERN_INVALID_ADDRESS);
11428 			}
11429 			map_share = TRUE;
11430 			if (!tmp_entry->is_sub_map) {
11431 				vm_map_clip_start(src_map, tmp_entry, src_start);
11432 			}
11433 			src_entry = tmp_entry;
11434 		}
11435 		/* we are now in the lowest level submap... */
11436 
11437 		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
11438 		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
11439 			/* This is not, supported for now.In future */
11440 			/* we will need to detect the phys_contig   */
11441 			/* condition and then upgrade copy_slowly   */
11442 			/* to do physical copy from the device mem  */
11443 			/* based object. We can piggy-back off of   */
11444 			/* the was wired boolean to set-up the      */
11445 			/* proper handling */
11446 			RETURN(KERN_PROTECTION_FAILURE);
11447 		}
11448 		/*
11449 		 *	Create a new address map entry to hold the result.
11450 		 *	Fill in the fields from the appropriate source entries.
11451 		 *	We must unlock the source map to do this if we need
11452 		 *	to allocate a map entry.
11453 		 */
11454 		if (new_entry == VM_MAP_ENTRY_NULL) {
11455 			version.main_timestamp = src_map->timestamp;
11456 			vm_map_unlock(src_map);
11457 
11458 			new_entry = vm_map_copy_entry_create(copy);
11459 
11460 			vm_map_lock(src_map);
11461 			if ((version.main_timestamp + 1) != src_map->timestamp) {
11462 				if (!vm_map_lookup_entry(src_map, src_start,
11463 				    &tmp_entry)) {
11464 					RETURN(KERN_INVALID_ADDRESS);
11465 				}
11466 				if (!tmp_entry->is_sub_map) {
11467 					vm_map_clip_start(src_map, tmp_entry, src_start);
11468 				}
11469 				continue; /* restart w/ new tmp_entry */
11470 			}
11471 		}
11472 
11473 		/*
11474 		 *	Verify that the region can be read.
11475 		 */
11476 		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
11477 		    !use_maxprot) ||
11478 		    (src_entry->max_protection & VM_PROT_READ) == 0) {
11479 			RETURN(KERN_PROTECTION_FAILURE);
11480 		}
11481 
11482 		/*
11483 		 *	Clip against the endpoints of the entire region.
11484 		 */
11485 
11486 		vm_map_clip_end(src_map, src_entry, src_end);
11487 
11488 		src_size = src_entry->vme_end - src_start;
11489 		src_object = VME_OBJECT(src_entry);
11490 		src_offset = VME_OFFSET(src_entry);
11491 		was_wired = (src_entry->wired_count != 0);
11492 
11493 		vm_map_entry_copy(src_map, new_entry, src_entry);
11494 		if (new_entry->is_sub_map) {
11495 			/* clr address space specifics */
11496 			new_entry->use_pmap = FALSE;
11497 		} else {
11498 			/*
11499 			 * We're dealing with a copy-on-write operation,
11500 			 * so the resulting mapping should not inherit the
11501 			 * original mapping's accounting settings.
11502 			 * "iokit_acct" should have been cleared in
11503 			 * vm_map_entry_copy().
11504 			 * "use_pmap" should be reset to its default (TRUE)
11505 			 * so that the new mapping gets accounted for in
11506 			 * the task's memory footprint.
11507 			 */
11508 			assert(!new_entry->iokit_acct);
11509 			new_entry->use_pmap = TRUE;
11510 		}
11511 
11512 		/*
11513 		 *	Attempt non-blocking copy-on-write optimizations.
11514 		 */
11515 
11516 		/*
11517 		 * If we are destroying the source, and the object
11518 		 * is internal, we could move the object reference
11519 		 * from the source to the copy.  The copy is
11520 		 * copy-on-write only if the source is.
11521 		 * We make another reference to the object, because
11522 		 * destroying the source entry will deallocate it.
11523 		 *
11524 		 * This memory transfer has to be atomic, (to prevent
11525 		 * the VM object from being shared or copied while
11526 		 * it's being moved here), so we could only do this
11527 		 * if we won't have to unlock the VM map until the
11528 		 * original mapping has been fully removed.
11529 		 */
11530 
11531 RestartCopy:
11532 		if ((src_object == VM_OBJECT_NULL ||
11533 		    (!was_wired && !map_share && !tmp_entry->is_shared
11534 		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
11535 		    vm_object_copy_quickly(
11536 			    VME_OBJECT(new_entry),
11537 			    src_offset,
11538 			    src_size,
11539 			    &src_needs_copy,
11540 			    &new_entry_needs_copy)) {
11541 			new_entry->needs_copy = new_entry_needs_copy;
11542 
11543 			/*
11544 			 *	Handle copy-on-write obligations
11545 			 */
11546 
11547 			if (src_needs_copy && !tmp_entry->needs_copy) {
11548 				vm_prot_t prot;
11549 
11550 				prot = src_entry->protection & ~VM_PROT_WRITE;
11551 
11552 				if (override_nx(src_map, VME_ALIAS(src_entry))
11553 				    && prot) {
11554 					prot |= VM_PROT_EXECUTE;
11555 				}
11556 
11557 				vm_object_pmap_protect(
11558 					src_object,
11559 					src_offset,
11560 					src_size,
11561 					(src_entry->is_shared ?
11562 					PMAP_NULL
11563 					: src_map->pmap),
11564 					VM_MAP_PAGE_SIZE(src_map),
11565 					src_entry->vme_start,
11566 					prot);
11567 
11568 				assert(tmp_entry->wired_count == 0);
11569 				tmp_entry->needs_copy = TRUE;
11570 			}
11571 
11572 			/*
11573 			 *	The map has never been unlocked, so it's safe
11574 			 *	to move to the next entry rather than doing
11575 			 *	another lookup.
11576 			 */
11577 
11578 			goto CopySuccessful;
11579 		}
11580 
11581 		entry_was_shared = tmp_entry->is_shared;
11582 
11583 		/*
11584 		 *	Take an object reference, so that we may
11585 		 *	release the map lock(s).
11586 		 */
11587 
11588 		assert(src_object != VM_OBJECT_NULL);
11589 		vm_object_reference(src_object);
11590 
11591 		/*
11592 		 *	Record the timestamp for later verification.
11593 		 *	Unlock the map.
11594 		 */
11595 
11596 		version.main_timestamp = src_map->timestamp;
11597 		vm_map_unlock(src_map); /* Increments timestamp once! */
11598 		saved_src_entry = src_entry;
11599 		tmp_entry = VM_MAP_ENTRY_NULL;
11600 		src_entry = VM_MAP_ENTRY_NULL;
11601 
11602 		/*
11603 		 *	Perform the copy
11604 		 */
11605 
11606 		if (was_wired ||
11607 		    (debug4k_no_cow_copyin &&
11608 		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
11609 CopySlowly:
11610 			vm_object_lock(src_object);
11611 			result = vm_object_copy_slowly(
11612 				src_object,
11613 				src_offset,
11614 				src_size,
11615 				THREAD_UNINT,
11616 				&new_copy_object);
11617 			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
11618 			saved_used_for_jit = new_entry->used_for_jit;
11619 			VME_OBJECT_SET(new_entry, new_copy_object);
11620 			new_entry->used_for_jit = saved_used_for_jit;
11621 			VME_OFFSET_SET(new_entry,
11622 			    src_offset - vm_object_trunc_page(src_offset));
11623 			new_entry->needs_copy = FALSE;
11624 		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
11625 		    (entry_was_shared || map_share)) {
11626 			vm_object_t new_object;
11627 
11628 			vm_object_lock_shared(src_object);
11629 			new_object = vm_object_copy_delayed(
11630 				src_object,
11631 				src_offset,
11632 				src_size,
11633 				TRUE);
11634 			if (new_object == VM_OBJECT_NULL) {
11635 				goto CopySlowly;
11636 			}
11637 
11638 			VME_OBJECT_SET(new_entry, new_object);
11639 			assert(new_entry->wired_count == 0);
11640 			new_entry->needs_copy = TRUE;
11641 			assert(!new_entry->iokit_acct);
11642 			assert(new_object->purgable == VM_PURGABLE_DENY);
11643 			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
11644 			result = KERN_SUCCESS;
11645 		} else {
11646 			vm_object_offset_t new_offset;
11647 			new_offset = VME_OFFSET(new_entry);
11648 			result = vm_object_copy_strategically(src_object,
11649 			    src_offset,
11650 			    src_size,
11651 			    &new_copy_object,
11652 			    &new_offset,
11653 			    &new_entry_needs_copy);
11654 			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
11655 			saved_used_for_jit = new_entry->used_for_jit;
11656 			VME_OBJECT_SET(new_entry, new_copy_object);
11657 			new_entry->used_for_jit = saved_used_for_jit;
11658 			if (new_offset != VME_OFFSET(new_entry)) {
11659 				VME_OFFSET_SET(new_entry, new_offset);
11660 			}
11661 
11662 			new_entry->needs_copy = new_entry_needs_copy;
11663 		}
11664 
11665 		if (result == KERN_SUCCESS &&
11666 		    ((preserve_purgeable &&
11667 		    src_object->purgable != VM_PURGABLE_DENY) ||
11668 		    new_entry->used_for_jit)) {
11669 			/*
11670 			 * Purgeable objects should be COPY_NONE, true share;
11671 		 * this should be propagated to the copy.
11672 			 *
11673 			 * Also force mappings the pmap specially protects to
11674 			 * be COPY_NONE; trying to COW these mappings would
11675 			 * change the effective protections, which could have
11676 			 * side effects if the pmap layer relies on the
11677 			 * specified protections.
11678 			 */
11679 
11680 			vm_object_t     new_object;
11681 
11682 			new_object = VME_OBJECT(new_entry);
11683 			assert(new_object != src_object);
11684 			vm_object_lock(new_object);
11685 			assert(new_object->ref_count == 1);
11686 			assert(new_object->shadow == VM_OBJECT_NULL);
11687 			assert(new_object->copy == VM_OBJECT_NULL);
11688 			assert(new_object->vo_owner == NULL);
11689 
11690 			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
11691 
11692 			if (preserve_purgeable &&
11693 			    src_object->purgable != VM_PURGABLE_DENY) {
11694 				new_object->true_share = TRUE;
11695 
11696 				/* start as non-volatile with no owner... */
11697 				new_object->purgable = VM_PURGABLE_NONVOLATILE;
11698 				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
11699 				/* ... and move to src_object's purgeable state */
11700 				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
11701 					int state;
11702 					state = src_object->purgable;
11703 					vm_object_purgable_control(
11704 						new_object,
11705 						VM_PURGABLE_SET_STATE_FROM_KERNEL,
11706 						&state);
11707 				}
11708 				/* no pmap accounting for purgeable objects */
11709 				new_entry->use_pmap = FALSE;
11710 			}
11711 
11712 			vm_object_unlock(new_object);
11713 			new_object = VM_OBJECT_NULL;
11714 		}
11715 
11716 		if (result != KERN_SUCCESS &&
11717 		    result != KERN_MEMORY_RESTART_COPY) {
11718 			vm_map_lock(src_map);
11719 			RETURN(result);
11720 		}
11721 
11722 		/*
11723 		 *	Throw away the extra reference
11724 		 */
11725 
11726 		vm_object_deallocate(src_object);
11727 
11728 		/*
11729 		 *	Verify that the map has not substantially
11730 		 *	changed while the copy was being made.
11731 		 */
11732 
11733 		vm_map_lock(src_map);
11734 
11735 		if ((version.main_timestamp + 1) == src_map->timestamp) {
11736 			/* src_map hasn't changed: src_entry is still valid */
11737 			src_entry = saved_src_entry;
11738 			goto VerificationSuccessful;
11739 		}
11740 
11741 		/*
11742 		 *	Simple version comparison failed.
11743 		 *
11744 		 *	Retry the lookup and verify that the
11745 		 *	same object/offset are still present.
11746 		 *
11747 		 *	[Note: a memory manager that colludes with
11748 		 *	the calling task can detect that we have
11749 		 *	cheated.  While the map was unlocked, the
11750 		 *	mapping could have been changed and restored.]
11751 		 */
11752 
11753 		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
11754 			if (result != KERN_MEMORY_RESTART_COPY) {
11755 				vm_object_deallocate(VME_OBJECT(new_entry));
11756 				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL);
11757 				/* reset accounting state */
11758 				new_entry->iokit_acct = FALSE;
11759 				new_entry->use_pmap = TRUE;
11760 			}
11761 			RETURN(KERN_INVALID_ADDRESS);
11762 		}
11763 
11764 		src_entry = tmp_entry;
11765 		vm_map_clip_start(src_map, src_entry, src_start);
11766 
11767 		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
11768 		    !use_maxprot) ||
11769 		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
11770 			goto VerificationFailed;
11771 		}
11772 
11773 		if (src_entry->vme_end < new_entry->vme_end) {
11774 			/*
11775 			 * This entry might have been shortened
11776 			 * (vm_map_clip_end) or been replaced with
11777 			 * an entry that ends closer to "src_start"
11778 			 * than before.
11779 			 * Adjust "new_entry" accordingly; copying
11780 			 * less memory would be correct but we also
11781 			 * redo the copy (see below) if the new entry
11782 			 * no longer points at the same object/offset.
11783 			 */
11784 			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
11785 			    VM_MAP_COPY_PAGE_MASK(copy)));
11786 			new_entry->vme_end = src_entry->vme_end;
11787 			src_size = new_entry->vme_end - src_start;
11788 		} else if (src_entry->vme_end > new_entry->vme_end) {
11789 			/*
11790 			 * This entry might have been extended
11791 			 * (vm_map_entry_simplify() or coalesce)
11792 			 * or been replaced with an entry that ends farther
11793 			 * from "src_start" than before.
11794 			 *
11795 			 * We've called vm_object_copy_*() only on
11796 			 * the previous <start:end> range, so we can't
11797 			 * just extend new_entry.  We have to re-do
11798 			 * the copy based on the new entry as if it was
11799 			 * pointing at a different object/offset (see
11800 			 * "Verification failed" below).
11801 			 */
11802 		}
11803 
11804 		if ((VME_OBJECT(src_entry) != src_object) ||
11805 		    (VME_OFFSET(src_entry) != src_offset) ||
11806 		    (src_entry->vme_end > new_entry->vme_end)) {
11807 			/*
11808 			 *	Verification failed.
11809 			 *
11810 			 *	Start over with this top-level entry.
11811 			 */
11812 
11813 VerificationFailed:     ;
11814 
11815 			vm_object_deallocate(VME_OBJECT(new_entry));
11816 			tmp_entry = src_entry;
11817 			continue;
11818 		}
11819 
11820 		/*
11821 		 *	Verification succeeded.
11822 		 */
11823 
11824 VerificationSuccessful:;
11825 
11826 		if (result == KERN_MEMORY_RESTART_COPY) {
11827 			goto RestartCopy;
11828 		}
11829 
11830 		/*
11831 		 *	Copy succeeded.
11832 		 */
11833 
11834 CopySuccessful: ;
11835 
11836 		/*
11837 		 *	Link in the new copy entry.
11838 		 */
11839 
11840 		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
11841 		    new_entry);
11842 
11843 		/*
11844 		 *	Determine whether the entire region
11845 		 *	has been copied.
11846 		 */
11847 		src_base = src_start;
11848 		src_start = new_entry->vme_end;
11849 		new_entry = VM_MAP_ENTRY_NULL;
11850 		while ((src_start >= src_end) && (src_end != 0)) {
11851 			submap_map_t    *ptr;
11852 
11853 			if (src_map == base_map) {
11854 				/* back to the top */
11855 				break;
11856 			}
11857 
11858 			ptr = parent_maps;
11859 			assert(ptr != NULL);
11860 			parent_maps = parent_maps->next;
11861 
11862 			/* fix up the damage we did in that submap */
11863 			vm_map_simplify_range(src_map,
11864 			    src_base,
11865 			    src_end);
11866 
11867 			vm_map_unlock(src_map);
11868 			vm_map_deallocate(src_map);
11869 			vm_map_lock(ptr->parent_map);
11870 			src_map = ptr->parent_map;
11871 			src_base = ptr->base_start;
11872 			src_start = ptr->base_start + ptr->base_len;
11873 			src_end = ptr->base_end;
11874 			if (!vm_map_lookup_entry(src_map,
11875 			    src_start,
11876 			    &tmp_entry) &&
11877 			    (src_end > src_start)) {
11878 				RETURN(KERN_INVALID_ADDRESS);
11879 			}
11880 			kfree_type(submap_map_t, ptr);
11881 			if (parent_maps == NULL) {
11882 				map_share = FALSE;
11883 			}
11884 			src_entry = tmp_entry->vme_prev;
11885 		}
11886 
11887 		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
11888 		    (src_start >= src_addr + len) &&
11889 		    (src_addr + len != 0)) {
11890 			/*
11891 			 * Stop copying now, even though we haven't reached
11892 			 * "src_end".  We'll adjust the end of the last copy
11893 			 * entry at the end, if needed.
11894 			 *
11895 		 * If src_map's alignment is different from the
11896 			 * system's page-alignment, there could be
11897 			 * extra non-map-aligned map entries between
11898 			 * the original (non-rounded) "src_addr + len"
11899 			 * and the rounded "src_end".
11900 			 * We do not want to copy those map entries since
11901 			 * they're not part of the copied range.
11902 			 */
11903 			break;
11904 		}
11905 
11906 		if ((src_start >= src_end) && (src_end != 0)) {
11907 			break;
11908 		}
11909 
11910 		/*
11911 		 *	Verify that there are no gaps in the region
11912 		 */
11913 
11914 		tmp_entry = src_entry->vme_next;
11915 		if ((tmp_entry->vme_start != src_start) ||
11916 		    (tmp_entry == vm_map_to_entry(src_map))) {
11917 			RETURN(KERN_INVALID_ADDRESS);
11918 		}
11919 	}
11920 
11921 	/*
11922 	 * If the source should be destroyed, do it now, since the
11923 	 * copy was successful.
11924 	 */
11925 	if (src_destroy) {
11926 		(void)vm_map_remove_and_unlock(src_map,
11927 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11928 		    src_end,
11929 		    ((src_map == kernel_map) ?
11930 		    VM_MAP_REMOVE_KUNWIRE :
11931 		    VM_MAP_REMOVE_NO_FLAGS));
11932 	} else {
11933 		/* fix up the damage we did in the base map */
11934 		vm_map_simplify_range(
11935 			src_map,
11936 			vm_map_trunc_page(src_addr,
11937 			VM_MAP_PAGE_MASK(src_map)),
11938 			vm_map_round_page(src_end,
11939 			VM_MAP_PAGE_MASK(src_map)));
11940 		vm_map_unlock(src_map);
11941 	}
11942 
11943 	tmp_entry = VM_MAP_ENTRY_NULL;
11944 
11945 	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
11946 	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
11947 		vm_map_offset_t original_start, original_offset, original_end;
11948 
11949 		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
11950 
11951 		/* adjust alignment of first copy_entry's "vme_start" */
11952 		tmp_entry = vm_map_copy_first_entry(copy);
11953 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
11954 			vm_map_offset_t adjustment;
11955 
11956 			original_start = tmp_entry->vme_start;
11957 			original_offset = VME_OFFSET(tmp_entry);
11958 
11959 			/* map-align the start of the first copy entry... */
11960 			adjustment = (tmp_entry->vme_start -
11961 			    vm_map_trunc_page(
11962 				    tmp_entry->vme_start,
11963 				    VM_MAP_PAGE_MASK(src_map)));
11964 			tmp_entry->vme_start -= adjustment;
11965 			VME_OFFSET_SET(tmp_entry,
11966 			    VME_OFFSET(tmp_entry) - adjustment);
11967 			copy_addr -= adjustment;
11968 			assert(tmp_entry->vme_start < tmp_entry->vme_end);
11969 			/* ... adjust for mis-aligned start of copy range */
11970 			adjustment =
11971 			    (vm_map_trunc_page(copy->offset,
11972 			    PAGE_MASK) -
11973 			    vm_map_trunc_page(copy->offset,
11974 			    VM_MAP_PAGE_MASK(src_map)));
11975 			if (adjustment) {
11976 				assert(page_aligned(adjustment));
11977 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
11978 				tmp_entry->vme_start += adjustment;
11979 				VME_OFFSET_SET(tmp_entry,
11980 				    (VME_OFFSET(tmp_entry) +
11981 				    adjustment));
11982 				copy_addr += adjustment;
11983 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
11984 			}
11985 
11986 			/*
11987 			 * Assert that the adjustments haven't exposed
11988 			 * more than was originally copied...
11989 			 */
11990 			assert(tmp_entry->vme_start >= original_start);
11991 			assert(VME_OFFSET(tmp_entry) >= original_offset);
11992 			/*
11993 			 * ... and that it did not adjust outside of a
11994 		 * single 16K page.
11995 			 */
11996 			assert(vm_map_trunc_page(tmp_entry->vme_start,
11997 			    VM_MAP_PAGE_MASK(src_map)) ==
11998 			    vm_map_trunc_page(original_start,
11999 			    VM_MAP_PAGE_MASK(src_map)));
12000 		}
12001 
12002 		/* adjust alignment of last copy_entry's "vme_end" */
12003 		tmp_entry = vm_map_copy_last_entry(copy);
12004 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12005 			vm_map_offset_t adjustment;
12006 
12007 			original_end = tmp_entry->vme_end;
12008 
12009 			/* map-align the end of the last copy entry... */
12010 			tmp_entry->vme_end =
12011 			    vm_map_round_page(tmp_entry->vme_end,
12012 			    VM_MAP_PAGE_MASK(src_map));
12013 			/* ... adjust for mis-aligned end of copy range */
12014 			adjustment =
12015 			    (vm_map_round_page((copy->offset +
12016 			    copy->size),
12017 			    VM_MAP_PAGE_MASK(src_map)) -
12018 			    vm_map_round_page((copy->offset +
12019 			    copy->size),
12020 			    PAGE_MASK));
12021 			if (adjustment) {
12022 				assert(page_aligned(adjustment));
12023 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12024 				tmp_entry->vme_end -= adjustment;
12025 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12026 			}
12027 
12028 			/*
12029 			 * Assert that the adjustments haven't exposed
12030 			 * more than was originally copied...
12031 			 */
12032 			assert(tmp_entry->vme_end <= original_end);
12033 			/*
12034 			 * ... and that it did not adjust outside of a
12035 		 * single 16K page.
12036 			 */
12037 			assert(vm_map_round_page(tmp_entry->vme_end,
12038 			    VM_MAP_PAGE_MASK(src_map)) ==
12039 			    vm_map_round_page(original_end,
12040 			    VM_MAP_PAGE_MASK(src_map)));
12041 		}
12042 	}
12043 
12044 	/* Fix-up start and end points in copy.  This is necessary */
12045 	/* when the various entries in the copy object were picked */
12046 	/* up from different sub-maps */
12047 
12048 	tmp_entry = vm_map_copy_first_entry(copy);
12049 	copy_size = 0; /* compute actual size */
12050 	while (tmp_entry != vm_map_copy_to_entry(copy)) {
12051 		assert(VM_MAP_PAGE_ALIGNED(
12052 			    copy_addr + (tmp_entry->vme_end -
12053 			    tmp_entry->vme_start),
12054 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12055 		assert(VM_MAP_PAGE_ALIGNED(
12056 			    copy_addr,
12057 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12058 
12059 		/*
12060 		 * The copy_entries will be injected directly into the
12061 		 * destination map and might not be "map aligned" there...
12062 		 */
12063 		tmp_entry->map_aligned = FALSE;
12064 
12065 		tmp_entry->vme_end = copy_addr +
12066 		    (tmp_entry->vme_end - tmp_entry->vme_start);
12067 		tmp_entry->vme_start = copy_addr;
12068 		assert(tmp_entry->vme_start < tmp_entry->vme_end);
12069 		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12070 		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12071 		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12072 	}
12073 
12074 	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12075 	    copy_size < copy->size) {
12076 		/*
12077 		 * The actual size of the VM map copy is smaller than what
12078 		 * was requested by the caller.  This must be because some
12079 		 * PAGE_SIZE-sized pages are missing at the end of the last
12080 		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12081 		 * The caller might not have been aware of those missing
12082 		 * pages and might not want to be aware of it, which is
12083 		 * fine as long as they don't try to access (and crash on)
12084 		 * those missing pages.
12085 		 * Let's adjust the size of the "copy", to avoid failing
12086 		 * in vm_map_copyout() or vm_map_copy_overwrite().
12087 		 */
12088 		assert(vm_map_round_page(copy_size,
12089 		    VM_MAP_PAGE_MASK(src_map)) ==
12090 		    vm_map_round_page(copy->size,
12091 		    VM_MAP_PAGE_MASK(src_map)));
12092 		copy->size = copy_size;
12093 	}
12094 
12095 	*copy_result = copy;
12096 	return KERN_SUCCESS;
12097 
12098 #undef  RETURN
12099 }
12100 
/*
 *	vm_map_copy_extract:
 *
 *	Extract the VM map entries covering [src_addr, src_addr + len)
 *	from "src_map" into a new entry-list vm_map_copy, without
 *	removing them from the source map (vm_map_remap_extract() does
 *	the actual work).  "do_copy" selects copy vs. share semantics
 *	for the extracted range.
 *
 *	"cur_prot"/"max_prot" are IN/OUT: on input, the protections the
 *	caller requires of the source range (VM_PROT_NONE = no
 *	requirement); on output, the protections actually found.
 *
 *	Returns KERN_SUCCESS and sets *copy_result on success
 *	(VM_MAP_COPY_NULL for a zero-length request), or an error from
 *	vm_map_remap_extract(), in which case the partial copy is
 *	discarded.
 */
kern_return_t
vm_map_copy_extract(
	vm_map_t                src_map,
	vm_map_address_t        src_addr,
	vm_map_size_t           len,
	boolean_t               do_copy,
	vm_map_copy_t           *copy_result,   /* OUT */
	vm_prot_t               *cur_prot,      /* IN/OUT */
	vm_prot_t               *max_prot,      /* IN/OUT */
	vm_inherit_t            inheritance,
	vm_map_kernel_flags_t   vmk_flags)
{
	vm_map_copy_t   copy;
	kern_return_t   kr;
	vm_prot_t required_cur_prot, required_max_prot;

	/*
	 *	Check for copies of zero bytes.
	 */

	if (len == 0) {
		*copy_result = VM_MAP_COPY_NULL;
		return KERN_SUCCESS;
	}

	/*
	 *	Check that the end address doesn't overflow
	 */
	if (src_addr + len < src_addr) {
		return KERN_INVALID_ADDRESS;
	}

	if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
		DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
	}

	/*
	 * Remember what the caller required, so we can assert below that
	 * vm_map_remap_extract() honored it; the pointed-to values are
	 * overwritten with the protections actually found.
	 */
	required_cur_prot = *cur_prot;
	required_max_prot = *max_prot;

	/*
	 *	Allocate a header element for the list.
	 *
	 *	Use the start and end in the header to
	 *	remember the endpoints prior to rounding.
	 */

	copy = vm_map_copy_allocate();
	copy->type = VM_MAP_COPY_ENTRY_LIST;
	copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;

	vm_map_store_init(&copy->cpy_hdr);

	copy->offset = 0;
	copy->size = len;

	kr = vm_map_remap_extract(src_map,
	    src_addr,
	    len,
	    do_copy,             /* copy */
	    &copy->cpy_hdr,
	    cur_prot,            /* IN/OUT */
	    max_prot,            /* IN/OUT */
	    inheritance,
	    vmk_flags);
	if (kr != KERN_SUCCESS) {
		/* extraction failed: release whatever was assembled */
		vm_map_copy_discard(copy);
		return kr;
	}
	if (required_cur_prot != VM_PROT_NONE) {
		/* the extracted range must grant at least what was required */
		assert((*cur_prot & required_cur_prot) == required_cur_prot);
		assert((*max_prot & required_max_prot) == required_max_prot);
	}

	*copy_result = copy;
	return KERN_SUCCESS;
}
12177 
12178 /*
12179  *	vm_map_copyin_object:
12180  *
12181  *	Create a copy object from an object.
12182  *	Our caller donates an object reference.
12183  */
12184 
12185 kern_return_t
vm_map_copyin_object(vm_object_t object,vm_object_offset_t offset,vm_object_size_t size,vm_map_copy_t * copy_result)12186 vm_map_copyin_object(
12187 	vm_object_t             object,
12188 	vm_object_offset_t      offset, /* offset of region in object */
12189 	vm_object_size_t        size,   /* size of region in object */
12190 	vm_map_copy_t   *copy_result)   /* OUT */
12191 {
12192 	vm_map_copy_t   copy;           /* Resulting copy */
12193 
12194 	/*
12195 	 *	We drop the object into a special copy object
12196 	 *	that contains the object directly.
12197 	 */
12198 
12199 	copy = vm_map_copy_allocate();
12200 	copy->type = VM_MAP_COPY_OBJECT;
12201 	copy->cpy_object = object;
12202 	copy->offset = offset;
12203 	copy->size = size;
12204 
12205 	*copy_result = copy;
12206 	return KERN_SUCCESS;
12207 }
12208 
/*
 *	vm_map_fork_share:
 *
 *	Fork-time handling for a shared entry: clone "old_entry" from
 *	"old_map" into "new_map" so that both entries reference the same
 *	backing object (or submap), and mark both entries is_shared.
 *	If the backing object still uses the symmetric copy strategy,
 *	a shadow object may be interposed first to preserve pending
 *	copy-on-write obligations, and the strategy is switched to
 *	MEMORY_OBJECT_COPY_DELAY.
 *	NOTE(review): callers appear to hold both map locks across this
 *	call — confirm against vm_map_fork().
 */
static void
vm_map_fork_share(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_object_t     object;
	vm_map_entry_t  new_entry;

	/*
	 *	New sharing code.  New map entry
	 *	references original object.  Internal
	 *	objects use asynchronous copy algorithm for
	 *	future copies.  First make sure we have
	 *	the right object.  If we need a shadow,
	 *	or someone else already has one, then
	 *	make a new shadow and share it.
	 */

	object = VME_OBJECT(old_entry);
	if (old_entry->is_sub_map) {
		assert(old_entry->wired_count == 0);
#ifndef NO_NESTED_PMAP
		if (old_entry->use_pmap) {
			kern_return_t   result;

			/*
			 * Share the submap's translations by nesting its
			 * pmap into the new map's pmap over this range.
			 */
			result = pmap_nest(new_map->pmap,
			    (VME_SUBMAP(old_entry))->pmap,
			    (addr64_t)old_entry->vme_start,
			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
			if (result) {
				panic("vm_map_fork_share: pmap_nest failed!");
			}
		}
#endif  /* NO_NESTED_PMAP */
	} else if (object == VM_OBJECT_NULL) {
		/* no backing object yet: create one so there is something to share */
		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
		    old_entry->vme_start));
		VME_OFFSET_SET(old_entry, 0);
		VME_OBJECT_SET(old_entry, object);
		old_entry->use_pmap = TRUE;
//		assert(!old_entry->needs_copy);
	} else if (object->copy_strategy !=
	    MEMORY_OBJECT_COPY_SYMMETRIC) {
		/*
		 *	We are already using an asymmetric
		 *	copy, and therefore we already have
		 *	the right object.
		 */

		assert(!old_entry->needs_copy);
	} else if (old_entry->needs_copy ||       /* case 1 */
	    object->shadowed ||                 /* case 2 */
	    (!object->true_share &&             /* case 3 */
	    !old_entry->is_shared &&
	    (object->vo_size >
	    (vm_map_size_t)(old_entry->vme_end -
	    old_entry->vme_start)))) {
		/*
		 *	We need to create a shadow.
		 *	There are three cases here.
		 *	In the first case, we need to
		 *	complete a deferred symmetrical
		 *	copy that we participated in.
		 *	In the second and third cases,
		 *	we need to create the shadow so
		 *	that changes that we make to the
		 *	object do not interfere with
		 *	any symmetrical copies which
		 *	have occurred (case 2) or which
		 *	might occur (case 3).
		 *
		 *	The first case is when we had
		 *	deferred shadow object creation
		 *	via the entry->needs_copy mechanism.
		 *	This mechanism only works when
		 *	only one entry points to the source
		 *	object, and we are about to create
		 *	a second entry pointing to the
		 *	same object. The problem is that
		 *	there is no way of mapping from
		 *	an object to the entries pointing
		 *	to it. (Deferred shadow creation
		 *	works with one entry because it occurs
		 *	at fault time, and we walk from the
		 *	entry to the object when handling
		 *	the fault.)
		 *
		 *	The second case is when the object
		 *	to be shared has already been copied
		 *	with a symmetric copy, but we point
		 *	directly to the object without
		 *	needs_copy set in our entry. (This
		 *	can happen because different ranges
		 *	of an object can be pointed to by
		 *	different entries. In particular,
		 *	a single entry pointing to an object
		 *	can be split by a call to vm_inherit,
		 *	which, combined with task_create, can
		 *	result in the different entries
		 *	having different needs_copy values.)
		 *	The shadowed flag in the object allows
		 *	us to detect this case. The problem
		 *	with this case is that if this object
		 *	has or will have shadows, then we
		 *	must not perform an asymmetric copy
		 *	of this object, since such a copy
		 *	allows the object to be changed, which
		 *	will break the previous symmetrical
		 *	copies (which rely upon the object
		 *	not changing). In a sense, the shadowed
		 *	flag says "don't change this object".
		 *	We fix this by creating a shadow
		 *	object for this object, and sharing
		 *	that. This works because we are free
		 *	to change the shadow object (and thus
		 *	to use an asymmetric copy strategy);
		 *	this is also semantically correct,
		 *	since this object is temporary, and
		 *	therefore a copy of the object is
		 *	as good as the object itself. (This
		 *	is not true for permanent objects,
		 *	since the pager needs to see changes,
		 *	which won't happen if the changes
		 *	are made to a copy.)
		 *
		 *	The third case is when the object
		 *	to be shared has parts sticking
		 *	outside of the entry we're working
		 *	with, and thus may in the future
		 *	be subject to a symmetrical copy.
		 *	(This is a preemptive version of
		 *	case 2.)
		 */
		VME_OBJECT_SHADOW(old_entry,
		    (vm_map_size_t) (old_entry->vme_end -
		    old_entry->vme_start));

		/*
		 *	If we're making a shadow for other than
		 *	copy on write reasons, then we have
		 *	to remove write permission.
		 */

		if (!old_entry->needs_copy &&
		    (old_entry->protection & VM_PROT_WRITE)) {
			vm_prot_t prot;

			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));

			prot = old_entry->protection & ~VM_PROT_WRITE;

			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));

			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}


			if (old_map->mapped_in_other_pmaps) {
				/*
				 * The map may be mapped in other pmaps too,
				 * so protect at the object level to reach
				 * every physical mapping of these pages.
				 */
				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					PMAP_NULL,
					PAGE_SIZE,
					old_entry->vme_start,
					prot);
			} else {
				pmap_protect(old_map->pmap,
				    old_entry->vme_start,
				    old_entry->vme_end,
				    prot);
			}
		}

		old_entry->needs_copy = FALSE;
		/* re-fetch: VME_OBJECT_SHADOW replaced the entry's object */
		object = VME_OBJECT(old_entry);
	}


	/*
	 *	If object was using a symmetric copy strategy,
	 *	change its copy strategy to the default
	 *	asymmetric copy strategy, which is copy_delay
	 *	in the non-norma case and copy_call in the
	 *	norma case. Bump the reference count for the
	 *	new entry.
	 */

	if (old_entry->is_sub_map) {
		vm_map_lock(VME_SUBMAP(old_entry));
		vm_map_reference(VME_SUBMAP(old_entry));
		vm_map_unlock(VME_SUBMAP(old_entry));
	} else {
		vm_object_lock(object);
		vm_object_reference_locked(object);
		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
		vm_object_unlock(object);
	}

	/*
	 *	Clone the entry, using object ref from above.
	 *	Mark both entries as shared.
	 */

	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
	vm_map_entry_copy(old_map, new_entry, old_entry);
	old_entry->is_shared = TRUE;
	new_entry->is_shared = TRUE;

	/*
	 * We're dealing with a shared mapping, so the resulting mapping
	 * should inherit some of the original mapping's accounting settings.
	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
	 * "use_pmap" should stay the same as before (if it hasn't been reset
	 * to TRUE when we cleared "iokit_acct").
	 */
	assert(!new_entry->iokit_acct);

	/*
	 *	If old entry's inheritance is VM_INHERIT_NONE,
	 *	the new entry is for corpse fork, remove the
	 *	write permission from the new entry.
	 */
	if (old_entry->inheritance == VM_INHERIT_NONE) {
		new_entry->protection &= ~VM_PROT_WRITE;
		new_entry->max_protection &= ~VM_PROT_WRITE;
	}

	/*
	 *	Insert the entry into the new map -- we
	 *	know we're inserting at the end of the new
	 *	map.
	 */

	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);

	/*
	 *	Update the physical map
	 */

	if (old_entry->is_sub_map) {
		/* Bill Angell pmap support goes here */
	} else {
		/* pre-seed the child's pmap with the parent's translations */
		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
		    old_entry->vme_end - old_entry->vme_start,
		    old_entry->vme_start);
	}
}
12463 
/*
 *	vm_map_fork_copy:
 *
 *	Fork-time handling for a VM_INHERIT_COPY entry: copy the range
 *	covered by "*old_entry_p" out of "old_map" (dropping the map
 *	lock around the copyin, then re-taking it) and insert the copy
 *	at the end of "new_map".
 *
 *	Returns TRUE if the region was copied.  Returns FALSE if the
 *	copyin failed; in that case the region is skipped.  In both
 *	cases "*old_entry_p" is updated to the entry where the caller's
 *	traversal of "old_map" should resume, and "old_map" is locked
 *	on return (it is expected locked on entry, since we unlock it
 *	first thing).
 */
static boolean_t
vm_map_fork_copy(
	vm_map_t        old_map,
	vm_map_entry_t  *old_entry_p,
	vm_map_t        new_map,
	int             vm_map_copyin_flags)
{
	vm_map_entry_t old_entry = *old_entry_p;
	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
	vm_map_offset_t start = old_entry->vme_start;
	vm_map_copy_t copy;
	/* remember where to insert in new_map before we drop the lock */
	vm_map_entry_t last = vm_map_last_entry(new_map);

	vm_map_unlock(old_map);
	/*
	 *	Use maxprot version of copyin because we
	 *	care about whether this memory can ever
	 *	be accessed, not just whether it's accessible
	 *	right now.
	 */
	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
	if (vm_map_copyin_internal(old_map, start, entry_size,
	    vm_map_copyin_flags, &copy)
	    != KERN_SUCCESS) {
		/*
		 *	The map might have changed while it
		 *	was unlocked, check it again.  Skip
		 *	any blank space or permanently
		 *	unreadable region.
		 */
		vm_map_lock(old_map);
		if (!vm_map_lookup_entry(old_map, start, &last) ||
		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
			last = last->vme_next;
		}
		*old_entry_p = last;

		/*
		 * XXX	For some error returns, want to
		 * XXX	skip to the next element.  Note
		 *	that INVALID_ADDRESS and
		 *	PROTECTION_FAILURE are handled above.
		 */

		return FALSE;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 *	Insert the copy into the new map
	 */
	vm_map_copy_insert(new_map, last, copy);

	/*
	 *	Pick up the traversal at the end of
	 *	the copied region.
	 */

	vm_map_lock(old_map);
	start += entry_size;
	if (!vm_map_lookup_entry(old_map, start, &last)) {
		/* "start" is in a hole: resume at the next entry */
		last = last->vme_next;
	} else {
		if (last->vme_start == start) {
			/*
			 * No need to clip here and we don't
			 * want to cause any unnecessary
			 * unnesting...
			 */
		} else {
			vm_map_clip_start(old_map, last, start);
		}
	}
	*old_entry_p = last;

	return TRUE;
}
12546 
/*
 *	vm_map_fork:
 *
 *	Create and return a new map based on the old
 *	map, according to the inheritance values on the
 *	regions in that map and the options.
 *
 *	The source map must not be locked.
 *
 *	Supported options (any other bit fails the call):
 *	    VM_MAP_FORK_SHARE_IF_INHERIT_NONE:
 *		share (rather than skip) readable VM_INHERIT_NONE
 *		entries that are not backed by a device pager.
 *	    VM_MAP_FORK_PRESERVE_PURGEABLE:
 *		preserve purgeability for entries copied through the
 *		slow vm_map_fork_copy() path.
 *	    VM_MAP_FORK_CORPSE_FOOTPRINT:
 *		collect footprint info from "old_map" into the new
 *		map for later corpse autopsy.
 *
 *	Returns the new map, or VM_MAP_NULL on unsupported options,
 *	pmap creation failure, or when a corpse-footprint fork is
 *	aborted because the system is shutting down.
 */
vm_map_t
vm_map_fork(
	ledger_t        ledger,
	vm_map_t        old_map,
	int             options)
{
	pmap_t          new_pmap;
	vm_map_t        new_map;
	vm_map_entry_t  old_entry;
	vm_map_size_t   new_size = 0, entry_size;
	vm_map_entry_t  new_entry;
	boolean_t       src_needs_copy;
	boolean_t       new_entry_needs_copy;
	boolean_t       pmap_is64bit;
	int             vm_map_copyin_flags;
	vm_inherit_t    old_entry_inheritance;
	int             map_create_options;
	kern_return_t   footprint_collect_kr;

	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
	    VM_MAP_FORK_PRESERVE_PURGEABLE |
	    VM_MAP_FORK_CORPSE_FOOTPRINT)) {
		/* unsupported option */
		return VM_MAP_NULL;
	}

	/* the child's pmap matches the parent's 64-bitness */
	pmap_is64bit =
#if defined(__i386__) || defined(__x86_64__)
	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
#elif defined(__arm64__)
	    old_map->pmap->is_64bit;
#elif defined(__arm__)
	    FALSE;
#else
#error Unknown architecture.
#endif

	unsigned int pmap_flags = 0;
	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
#if defined(HAS_APPLE_PAC)
	/* the child also inherits the parent's JOP-disable setting */
	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
#endif
#if PMAP_CREATE_FORCE_4K_PAGES
	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
	    PAGE_SIZE != FOURK_PAGE_SIZE) {
		/* parent uses 4K pages on a non-4K kernel: child does too */
		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
	}
#endif /* PMAP_CREATE_FORCE_4K_PAGES */
	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
	if (new_pmap == NULL) {
		return VM_MAP_NULL;
	}

	vm_map_reference(old_map);
	vm_map_lock(old_map);

	map_create_options = 0;
	if (old_map->hdr.entries_pageable) {
		map_create_options |= VM_MAP_CREATE_PAGEABLE;
	}
	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
		footprint_collect_kr = KERN_SUCCESS;
	}
	/* new map covers the same address range as the old one */
	new_map = vm_map_create_options(new_pmap,
	    old_map->min_offset,
	    old_map->max_offset,
	    map_create_options);
	/* inherit cs_enforcement */
	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
	vm_map_lock(new_map);
	vm_commit_pagezero_status(new_map);
	/* inherit the parent map's page size */
	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));

	/* ensure PMAP_CS structures are prepared for the fork */
	pmap_cs_fork_prepare(old_map->pmap, new_pmap);

	/*
	 * Walk every entry of the old map and share, copy or skip it
	 * into the new map according to its (possibly overridden)
	 * inheritance attribute.
	 */
	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
		/*
		 * Abort any corpse collection if the system is shutting down.
		 */
		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    get_system_inshutdown()) {
			vm_map_corpse_footprint_collect_done(new_map);
			vm_map_unlock(new_map);
			vm_map_unlock(old_map);
			vm_map_deallocate(new_map);
			vm_map_deallocate(old_map);
			printf("Aborting corpse map due to system shutdown\n");
			return VM_MAP_NULL;
		}

		entry_size = old_entry->vme_end - old_entry->vme_start;

		old_entry_inheritance = old_entry->inheritance;
		/*
		 * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option
		 * share VM_INHERIT_NONE entries that are not backed by a
		 * device pager.
		 */
		if (old_entry_inheritance == VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
		    (old_entry->protection & VM_PROT_READ) &&
		    !(!old_entry->is_sub_map &&
		    VME_OBJECT(old_entry) != NULL &&
		    VME_OBJECT(old_entry)->pager != NULL &&
		    is_device_pager_ops(
			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
			old_entry_inheritance = VM_INHERIT_SHARE;
		}

		if (old_entry_inheritance != VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    footprint_collect_kr == KERN_SUCCESS) {
			/*
			 * The corpse won't have old_map->pmap to query
			 * footprint information, so collect that data now
			 * and store it in new_map->vmmap_corpse_footprint
			 * for later autopsy.
			 */
			footprint_collect_kr =
			    vm_map_corpse_footprint_collect(old_map,
			    old_entry,
			    new_map);
		}

		switch (old_entry_inheritance) {
		case VM_INHERIT_NONE:
			/* not inherited by the child: skip */
			break;

		case VM_INHERIT_SHARE:
			vm_map_fork_share(old_map, old_entry, new_map);
			new_size += entry_size;
			break;

		case VM_INHERIT_COPY:

			/*
			 *	Inline the copy_quickly case;
			 *	upon failure, fall back on call
			 *	to vm_map_fork_copy.
			 */

			if (old_entry->is_sub_map) {
				break;
			}
			if ((old_entry->wired_count != 0) ||
			    ((VME_OBJECT(old_entry) != NULL) &&
			    (VME_OBJECT(old_entry)->true_share))) {
				/* wired or truly shared: must use the slow path */
				goto slow_vm_map_fork_copy;
			}

			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
			vm_map_entry_copy(old_map, new_entry, old_entry);
			if (old_entry->permanent) {
				/* inherit "permanent" on fork() */
				new_entry->permanent = TRUE;
			}

			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
				new_map->jit_entry_exists = TRUE;
			}

			if (new_entry->is_sub_map) {
				/* clear address space specifics */
				new_entry->use_pmap = FALSE;
			} else {
				/*
				 * We're dealing with a copy-on-write operation,
				 * so the resulting mapping should not inherit
				 * the original mapping's accounting settings.
				 * "iokit_acct" should have been cleared in
				 * vm_map_entry_copy().
				 * "use_pmap" should be reset to its default
				 * (TRUE) so that the new mapping gets
				 * accounted for in the task's memory footprint.
				 */
				assert(!new_entry->iokit_acct);
				new_entry->use_pmap = TRUE;
			}

			if (!vm_object_copy_quickly(
				    VME_OBJECT(new_entry),
				    VME_OFFSET(old_entry),
				    (old_entry->vme_end -
				    old_entry->vme_start),
				    &src_needs_copy,
				    &new_entry_needs_copy)) {
				vm_map_entry_dispose(new_entry);
				goto slow_vm_map_fork_copy;
			}

			/*
			 *	Handle copy-on-write obligations
			 */

			if (src_needs_copy && !old_entry->needs_copy) {
				vm_prot_t prot;

				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));

				/* write-protect the source so a later write faults and copies */
				prot = old_entry->protection & ~VM_PROT_WRITE;

				if (override_nx(old_map, VME_ALIAS(old_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));

				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					((old_entry->is_shared
					|| old_map->mapped_in_other_pmaps)
					? PMAP_NULL :
					old_map->pmap),
					VM_MAP_PAGE_SIZE(old_map),
					old_entry->vme_start,
					prot);

				assert(old_entry->wired_count == 0);
				old_entry->needs_copy = TRUE;
			}
			new_entry->needs_copy = new_entry_needs_copy;

			/*
			 *	Insert the entry at the end
			 *	of the map.
			 */

			vm_map_store_entry_link(new_map,
			    vm_map_last_entry(new_map),
			    new_entry,
			    VM_MAP_KERNEL_FLAGS_NONE);
			new_size += entry_size;
			break;

slow_vm_map_fork_copy:
			vm_map_copyin_flags = 0;
			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
				vm_map_copyin_flags |=
				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
			}
			if (vm_map_fork_copy(old_map,
			    &old_entry,
			    new_map,
			    vm_map_copyin_flags)) {
				new_size += entry_size;
			}
			/* vm_map_fork_copy() already advanced "old_entry" */
			continue;
		}
		old_entry = old_entry->vme_next;
	}

#if defined(__arm64__)
	pmap_insert_sharedpage(new_map->pmap);
#endif /* __arm64__ */

	new_map->size = new_size;

	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		vm_map_corpse_footprint_collect_done(new_map);
	}

	/* Propagate JIT entitlement for the pmap layer. */
	if (pmap_get_jit_entitled(old_map->pmap)) {
		/* Tell the pmap that it supports JIT. */
		pmap_set_jit_entitled(new_map->pmap);
	}

	vm_map_unlock(new_map);
	vm_map_unlock(old_map);
	vm_map_deallocate(old_map);

	return new_map;
}
12836 
/*
 * vm_map_exec:
 *
 *      Setup the "new_map" with the proper execution environment according
 *	to the type of executable (platform, 64bit, chroot environment).
 *	Map the comm page and shared region, etc...
 *
 *	Also reserves any platform-defined memory regions that user
 *	processes must never be able to allocate, by mapping permanent
 *	VM_PROT_NONE placeholder entries over them.
 *
 *	Returns KERN_SUCCESS (panics if a reserved region cannot be
 *	entered into the map).
 */
kern_return_t
vm_map_exec(
	vm_map_t        new_map,
	task_t          task,
	boolean_t       is64bit,
	void            *fsroot,
	cpu_type_t      cpu,
	cpu_subtype_t   cpu_subtype,
	boolean_t       reslide,
	boolean_t       is_driverkit)
{
	SHARED_REGION_TRACE_DEBUG(
		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
		(void *)VM_KERNEL_ADDRPERM(current_task()),
		(void *)VM_KERNEL_ADDRPERM(new_map),
		(void *)VM_KERNEL_ADDRPERM(task),
		(void *)VM_KERNEL_ADDRPERM(fsroot),
		cpu,
		cpu_subtype));
	/* best effort: return values deliberately ignored */
	(void) vm_commpage_enter(new_map, task, is64bit);

	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit);

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
		(void *)VM_KERNEL_ADDRPERM(current_task()),
		(void *)VM_KERNEL_ADDRPERM(new_map),
		(void *)VM_KERNEL_ADDRPERM(task),
		(void *)VM_KERNEL_ADDRPERM(fsroot),
		cpu,
		cpu_subtype));

	/*
	 * Some devices have region(s) of memory that shouldn't get allocated by
	 * user processes. The following code creates dummy vm_map_entry_t's for each
	 * of the regions that needs to be reserved to prevent any allocations in
	 * those regions.
	 */
	kern_return_t kr = KERN_FAILURE;
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	/* reservations can't be removed and may lie outside the map's usual
	 * limits (NOTE(review): "beyond_max" semantics inferred from the flag
	 * name — confirm against vm_map_enter()) */
	vmk_flags.vmkf_permanent = TRUE;
	vmk_flags.vmkf_beyond_max = TRUE;

	struct vm_reserved_region *regions = NULL;
	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));

	for (size_t i = 0; i < num_regions; ++i) {
		/* map a fixed, inaccessible placeholder over each reserved region */
		kr = vm_map_enter(
			new_map,
			&regions[i].vmrr_addr,
			regions[i].vmrr_size,
			(vm_map_offset_t)0,
			VM_FLAGS_FIXED,
			vmk_flags,
			VM_KERN_MEMORY_NONE,
			VM_OBJECT_NULL,
			(vm_object_offset_t)0,
			FALSE,
			VM_PROT_NONE,
			VM_PROT_NONE,
			VM_INHERIT_COPY);

		if (kr != KERN_SUCCESS) {
			panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
		}
	}

	new_map->reserved_regions = (num_regions ? TRUE : FALSE);

	return KERN_SUCCESS;
}
12916 
/*
 * Statistics for vm_map_lookup_locked(): per-strategy counts, total and
 * maximum copy sizes, plus restart/error tallies for the copy-on-write
 * paths it can take when resolving a write fault on a submap entry:
 *   "slowly"        - vm_object_copy_slowly() (wired submap entries),
 *   "strategically" - vm_object_copy_strategically() (non-symmetric
 *                     copy strategy),
 *   "shadow"        - symmetric COW set up via a shadow object.
 */
uint64_t vm_map_lookup_locked_copy_slowly_count = 0;
uint64_t vm_map_lookup_locked_copy_slowly_size = 0;
uint64_t vm_map_lookup_locked_copy_slowly_max = 0;
uint64_t vm_map_lookup_locked_copy_slowly_restart = 0;
uint64_t vm_map_lookup_locked_copy_slowly_error = 0;
uint64_t vm_map_lookup_locked_copy_strategically_count = 0;
uint64_t vm_map_lookup_locked_copy_strategically_size = 0;
uint64_t vm_map_lookup_locked_copy_strategically_max = 0;
uint64_t vm_map_lookup_locked_copy_strategically_restart = 0;
uint64_t vm_map_lookup_locked_copy_strategically_error = 0;
uint64_t vm_map_lookup_locked_copy_shadow_count = 0;
uint64_t vm_map_lookup_locked_copy_shadow_size = 0;
uint64_t vm_map_lookup_locked_copy_shadow_max = 0;
12930 /*
12931  *	vm_map_lookup_locked:
12932  *
12933  *	Finds the VM object, offset, and
12934  *	protection for a given virtual address in the
12935  *	specified map, assuming a page fault of the
12936  *	type specified.
12937  *
12938  *	Returns the (object, offset, protection) for
12939  *	this address, whether it is wired down, and whether
12940  *	this map has the only reference to the data in question.
12941  *	In order to later verify this lookup, a "version"
12942  *	is returned.
12943  *	If contended != NULL, *contended will be set to
12944  *	true iff the thread had to spin or block to acquire
12945  *	an exclusive lock.
12946  *
12947  *	The map MUST be locked by the caller and WILL be
12948  *	locked on exit.  In order to guarantee the
12949  *	existence of the returned object, it is returned
12950  *	locked.
12951  *
12952  *	If a lookup is requested with "write protection"
12953  *	specified, the map may be changed to perform virtual
12954  *	copying operations, although the data referenced will
12955  *	remain the same.
12956  */
12957 kern_return_t
vm_map_lookup_locked(vm_map_t * var_map,vm_map_offset_t vaddr,vm_prot_t fault_type,int object_lock_type,vm_map_version_t * out_version,vm_object_t * object,vm_object_offset_t * offset,vm_prot_t * out_prot,boolean_t * wired,vm_object_fault_info_t fault_info,vm_map_t * real_map,bool * contended)12958 vm_map_lookup_locked(
12959 	vm_map_t                *var_map,       /* IN/OUT */
12960 	vm_map_offset_t         vaddr,
12961 	vm_prot_t               fault_type,
12962 	int                     object_lock_type,
12963 	vm_map_version_t        *out_version,   /* OUT */
12964 	vm_object_t             *object,        /* OUT */
12965 	vm_object_offset_t      *offset,        /* OUT */
12966 	vm_prot_t               *out_prot,      /* OUT */
12967 	boolean_t               *wired,         /* OUT */
12968 	vm_object_fault_info_t  fault_info,     /* OUT */
12969 	vm_map_t                *real_map,      /* OUT */
12970 	bool                    *contended)     /* OUT */
12971 {
12972 	vm_map_entry_t                  entry;
12973 	vm_map_t                        map = *var_map;
12974 	vm_map_t                        old_map = *var_map;
12975 	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
12976 	vm_map_offset_t                 cow_parent_vaddr = 0;
12977 	vm_map_offset_t                 old_start = 0;
12978 	vm_map_offset_t                 old_end = 0;
12979 	vm_prot_t                       prot;
12980 	boolean_t                       mask_protections;
12981 	boolean_t                       force_copy;
12982 	boolean_t                       no_force_copy_if_executable;
12983 	boolean_t                       submap_needed_copy;
12984 	vm_prot_t                       original_fault_type;
12985 	vm_map_size_t                   fault_page_mask;
12986 
12987 	/*
12988 	 * VM_PROT_MASK means that the caller wants us to use "fault_type"
12989 	 * as a mask against the mapping's actual protections, not as an
12990 	 * absolute value.
12991 	 */
12992 	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
12993 	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
12994 	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
12995 	fault_type &= VM_PROT_ALL;
12996 	original_fault_type = fault_type;
12997 	if (contended) {
12998 		*contended = false;
12999 	}
13000 
13001 	*real_map = map;
13002 
13003 	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13004 	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13005 
13006 RetryLookup:
13007 	fault_type = original_fault_type;
13008 
13009 	/*
13010 	 *	If the map has an interesting hint, try it before calling
13011 	 *	full blown lookup routine.
13012 	 */
13013 	entry = map->hint;
13014 
13015 	if ((entry == vm_map_to_entry(map)) ||
13016 	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13017 		vm_map_entry_t  tmp_entry;
13018 
13019 		/*
13020 		 *	Entry was either not a valid hint, or the vaddr
13021 		 *	was not contained in the entry, so do a full lookup.
13022 		 */
13023 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13024 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13025 				vm_map_unlock(cow_sub_map_parent);
13026 			}
13027 			if ((*real_map != map)
13028 			    && (*real_map != cow_sub_map_parent)) {
13029 				vm_map_unlock(*real_map);
13030 			}
13031 			return KERN_INVALID_ADDRESS;
13032 		}
13033 
13034 		entry = tmp_entry;
13035 	}
13036 	if (map == old_map) {
13037 		old_start = entry->vme_start;
13038 		old_end = entry->vme_end;
13039 	}
13040 
13041 	/*
13042 	 *	Handle submaps.  Drop lock on upper map, submap is
13043 	 *	returned locked.
13044 	 */
13045 
13046 	submap_needed_copy = FALSE;
13047 submap_recurse:
13048 	if (entry->is_sub_map) {
13049 		vm_map_offset_t         local_vaddr;
13050 		vm_map_offset_t         end_delta;
13051 		vm_map_offset_t         start_delta;
13052 		vm_map_entry_t          submap_entry, saved_submap_entry;
13053 		vm_object_offset_t      submap_entry_offset;
13054 		vm_object_size_t        submap_entry_size;
13055 		vm_prot_t               subentry_protection;
13056 		vm_prot_t               subentry_max_protection;
13057 		boolean_t               subentry_no_copy_on_read;
13058 		boolean_t               mapped_needs_copy = FALSE;
13059 		vm_map_version_t        version;
13060 
13061 		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13062 		    "map %p (%d) entry %p submap %p (%d)\n",
13063 		    map, VM_MAP_PAGE_SHIFT(map), entry,
13064 		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
13065 
13066 		local_vaddr = vaddr;
13067 
13068 		if ((entry->use_pmap &&
13069 		    !((fault_type & VM_PROT_WRITE) ||
13070 		    force_copy))) {
13071 			/* if real_map equals map we unlock below */
13072 			if ((*real_map != map) &&
13073 			    (*real_map != cow_sub_map_parent)) {
13074 				vm_map_unlock(*real_map);
13075 			}
13076 			*real_map = VME_SUBMAP(entry);
13077 		}
13078 
13079 		if (entry->needs_copy &&
13080 		    ((fault_type & VM_PROT_WRITE) ||
13081 		    force_copy)) {
13082 			if (!mapped_needs_copy) {
13083 				if (vm_map_lock_read_to_write(map)) {
13084 					vm_map_lock_read(map);
13085 					*real_map = map;
13086 					goto RetryLookup;
13087 				}
13088 				vm_map_lock_read(VME_SUBMAP(entry));
13089 				*var_map = VME_SUBMAP(entry);
13090 				cow_sub_map_parent = map;
13091 				/* reset base to map before cow object */
13092 				/* this is the map which will accept   */
13093 				/* the new cow object */
13094 				old_start = entry->vme_start;
13095 				old_end = entry->vme_end;
13096 				cow_parent_vaddr = vaddr;
13097 				mapped_needs_copy = TRUE;
13098 			} else {
13099 				vm_map_lock_read(VME_SUBMAP(entry));
13100 				*var_map = VME_SUBMAP(entry);
13101 				if ((cow_sub_map_parent != map) &&
13102 				    (*real_map != map)) {
13103 					vm_map_unlock(map);
13104 				}
13105 			}
13106 		} else {
13107 			if (entry->needs_copy) {
13108 				submap_needed_copy = TRUE;
13109 			}
13110 			vm_map_lock_read(VME_SUBMAP(entry));
13111 			*var_map = VME_SUBMAP(entry);
13112 			/* leave map locked if it is a target */
13113 			/* cow sub_map above otherwise, just  */
13114 			/* follow the maps down to the object */
13115 			/* here we unlock knowing we are not  */
13116 			/* revisiting the map.  */
13117 			if ((*real_map != map) && (map != cow_sub_map_parent)) {
13118 				vm_map_unlock_read(map);
13119 			}
13120 		}
13121 
13122 		map = *var_map;
13123 
13124 		/* calculate the offset in the submap for vaddr */
13125 		local_vaddr = (local_vaddr - entry->vme_start) + VME_OFFSET(entry);
13126 		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
13127 		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
13128 		    (uint64_t)local_vaddr, (uint64_t)entry->vme_start, (uint64_t)fault_page_mask);
13129 
13130 RetrySubMap:
13131 		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
13132 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13133 				vm_map_unlock(cow_sub_map_parent);
13134 			}
13135 			if ((*real_map != map)
13136 			    && (*real_map != cow_sub_map_parent)) {
13137 				vm_map_unlock(*real_map);
13138 			}
13139 			*real_map = map;
13140 			return KERN_INVALID_ADDRESS;
13141 		}
13142 
13143 		/* find the attenuated shadow of the underlying object */
13144 		/* on our target map */
13145 
13146 		/* in english the submap object may extend beyond the     */
13147 		/* region mapped by the entry or, may only fill a portion */
13148 		/* of it.  For our purposes, we only care if the object   */
13149 		/* doesn't fill.  In this case the area which will        */
13150 		/* ultimately be clipped in the top map will only need    */
13151 		/* to be as big as the portion of the underlying entry    */
13152 		/* which is mapped */
13153 		start_delta = submap_entry->vme_start > VME_OFFSET(entry) ?
13154 		    submap_entry->vme_start - VME_OFFSET(entry) : 0;
13155 
13156 		end_delta =
13157 		    (VME_OFFSET(entry) + start_delta + (old_end - old_start)) <=
13158 		    submap_entry->vme_end ?
13159 		    0 : (VME_OFFSET(entry) +
13160 		    (old_end - old_start))
13161 		    - submap_entry->vme_end;
13162 
13163 		old_start += start_delta;
13164 		old_end -= end_delta;
13165 
13166 		if (submap_entry->is_sub_map) {
13167 			entry = submap_entry;
13168 			vaddr = local_vaddr;
13169 			goto submap_recurse;
13170 		}
13171 
13172 		if (((fault_type & VM_PROT_WRITE) ||
13173 		    force_copy)
13174 		    && cow_sub_map_parent) {
13175 			vm_object_t     sub_object, copy_object;
13176 			vm_object_offset_t copy_offset;
13177 			vm_map_offset_t local_start;
13178 			vm_map_offset_t local_end;
13179 			boolean_t       object_copied = FALSE;
13180 			vm_object_offset_t object_copied_offset = 0;
13181 			boolean_t       object_copied_needs_copy = FALSE;
13182 			kern_return_t   kr = KERN_SUCCESS;
13183 
13184 			if (vm_map_lock_read_to_write(map)) {
13185 				vm_map_lock_read(map);
13186 				old_start -= start_delta;
13187 				old_end += end_delta;
13188 				goto RetrySubMap;
13189 			}
13190 
13191 
13192 			sub_object = VME_OBJECT(submap_entry);
13193 			if (sub_object == VM_OBJECT_NULL) {
13194 				sub_object =
13195 				    vm_object_allocate(
13196 					(vm_map_size_t)
13197 					(submap_entry->vme_end -
13198 					submap_entry->vme_start));
13199 				VME_OBJECT_SET(submap_entry, sub_object);
13200 				VME_OFFSET_SET(submap_entry, 0);
13201 				assert(!submap_entry->is_sub_map);
13202 				assert(submap_entry->use_pmap);
13203 			}
13204 			local_start =  local_vaddr -
13205 			    (cow_parent_vaddr - old_start);
13206 			local_end = local_vaddr +
13207 			    (old_end - cow_parent_vaddr);
13208 			vm_map_clip_start(map, submap_entry, local_start);
13209 			vm_map_clip_end(map, submap_entry, local_end);
13210 			if (submap_entry->is_sub_map) {
13211 				/* unnesting was done when clipping */
13212 				assert(!submap_entry->use_pmap);
13213 			}
13214 
13215 			/* This is the COW case, lets connect */
13216 			/* an entry in our space to the underlying */
13217 			/* object in the submap, bypassing the  */
13218 			/* submap. */
13219 			submap_entry_offset = VME_OFFSET(submap_entry);
13220 			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
13221 
13222 			if ((submap_entry->wired_count != 0 ||
13223 			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
13224 			    (submap_entry->protection & VM_PROT_EXECUTE) &&
13225 			    no_force_copy_if_executable) {
13226 //				printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
13227 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13228 					vm_map_unlock(cow_sub_map_parent);
13229 				}
13230 				if ((*real_map != map)
13231 				    && (*real_map != cow_sub_map_parent)) {
13232 					vm_map_unlock(*real_map);
13233 				}
13234 				*real_map = map;
13235 				kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
13236 				vm_map_lock_write_to_read(map);
13237 				kr = KERN_PROTECTION_FAILURE;
13238 				DTRACE_VM4(submap_no_copy_executable,
13239 				    vm_map_t, map,
13240 				    vm_object_offset_t, submap_entry_offset,
13241 				    vm_object_size_t, submap_entry_size,
13242 				    int, kr);
13243 				return kr;
13244 			}
13245 
13246 			if (submap_entry->wired_count != 0) {
13247 				vm_object_reference(sub_object);
13248 
13249 				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
13250 				    "submap_entry %p offset 0x%llx\n",
13251 				    submap_entry, VME_OFFSET(submap_entry));
13252 
13253 				DTRACE_VM6(submap_copy_slowly,
13254 				    vm_map_t, cow_sub_map_parent,
13255 				    vm_map_offset_t, vaddr,
13256 				    vm_map_t, map,
13257 				    vm_object_size_t, submap_entry_size,
13258 				    int, submap_entry->wired_count,
13259 				    int, sub_object->copy_strategy);
13260 
13261 				saved_submap_entry = submap_entry;
13262 				version.main_timestamp = map->timestamp;
13263 				vm_map_unlock(map); /* Increments timestamp by 1 */
13264 				submap_entry = VM_MAP_ENTRY_NULL;
13265 
13266 				vm_object_lock(sub_object);
13267 				kr = vm_object_copy_slowly(sub_object,
13268 				    submap_entry_offset,
13269 				    submap_entry_size,
13270 				    FALSE,
13271 				    &copy_object);
13272 				object_copied = TRUE;
13273 				object_copied_offset = 0;
13274 				/* 4k: account for extra offset in physical page */
13275 				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
13276 				object_copied_needs_copy = FALSE;
13277 				vm_object_deallocate(sub_object);
13278 
13279 				vm_map_lock(map);
13280 
13281 				if (kr != KERN_SUCCESS &&
13282 				    kr != KERN_MEMORY_RESTART_COPY) {
13283 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13284 						vm_map_unlock(cow_sub_map_parent);
13285 					}
13286 					if ((*real_map != map)
13287 					    && (*real_map != cow_sub_map_parent)) {
13288 						vm_map_unlock(*real_map);
13289 					}
13290 					*real_map = map;
13291 					vm_object_deallocate(copy_object);
13292 					copy_object = VM_OBJECT_NULL;
13293 					kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
13294 					vm_map_lock_write_to_read(map);
13295 					DTRACE_VM4(submap_copy_error_slowly,
13296 					    vm_object_t, sub_object,
13297 					    vm_object_offset_t, submap_entry_offset,
13298 					    vm_object_size_t, submap_entry_size,
13299 					    int, kr);
13300 					vm_map_lookup_locked_copy_slowly_error++;
13301 					return kr;
13302 				}
13303 
13304 				if ((kr == KERN_SUCCESS) &&
13305 				    (version.main_timestamp + 1) == map->timestamp) {
13306 					submap_entry = saved_submap_entry;
13307 				} else {
13308 					saved_submap_entry = NULL;
13309 					old_start -= start_delta;
13310 					old_end += end_delta;
13311 					vm_object_deallocate(copy_object);
13312 					copy_object = VM_OBJECT_NULL;
13313 					vm_map_lock_write_to_read(map);
13314 					vm_map_lookup_locked_copy_slowly_restart++;
13315 					goto RetrySubMap;
13316 				}
13317 				vm_map_lookup_locked_copy_slowly_count++;
13318 				vm_map_lookup_locked_copy_slowly_size += submap_entry_size;
13319 				if (submap_entry_size > vm_map_lookup_locked_copy_slowly_max) {
13320 					vm_map_lookup_locked_copy_slowly_max = submap_entry_size;
13321 				}
13322 			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
13323 				submap_entry_offset = VME_OFFSET(submap_entry);
13324 				copy_object = VM_OBJECT_NULL;
13325 				object_copied_offset = submap_entry_offset;
13326 				object_copied_needs_copy = FALSE;
13327 				DTRACE_VM6(submap_copy_strategically,
13328 				    vm_map_t, cow_sub_map_parent,
13329 				    vm_map_offset_t, vaddr,
13330 				    vm_map_t, map,
13331 				    vm_object_size_t, submap_entry_size,
13332 				    int, submap_entry->wired_count,
13333 				    int, sub_object->copy_strategy);
13334 				kr = vm_object_copy_strategically(
13335 					sub_object,
13336 					submap_entry_offset,
13337 					submap_entry->vme_end - submap_entry->vme_start,
13338 					&copy_object,
13339 					&object_copied_offset,
13340 					&object_copied_needs_copy);
13341 				if (kr == KERN_MEMORY_RESTART_COPY) {
13342 					old_start -= start_delta;
13343 					old_end += end_delta;
13344 					vm_object_deallocate(copy_object);
13345 					copy_object = VM_OBJECT_NULL;
13346 					vm_map_lock_write_to_read(map);
13347 					vm_map_lookup_locked_copy_strategically_restart++;
13348 					goto RetrySubMap;
13349 				}
13350 				if (kr != KERN_SUCCESS) {
13351 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13352 						vm_map_unlock(cow_sub_map_parent);
13353 					}
13354 					if ((*real_map != map)
13355 					    && (*real_map != cow_sub_map_parent)) {
13356 						vm_map_unlock(*real_map);
13357 					}
13358 					*real_map = map;
13359 					vm_object_deallocate(copy_object);
13360 					copy_object = VM_OBJECT_NULL;
13361 					kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
13362 					vm_map_lock_write_to_read(map);
13363 					DTRACE_VM4(submap_copy_error_strategically,
13364 					    vm_object_t, sub_object,
13365 					    vm_object_offset_t, submap_entry_offset,
13366 					    vm_object_size_t, submap_entry_size,
13367 					    int, kr);
13368 					vm_map_lookup_locked_copy_strategically_error++;
13369 					return kr;
13370 				}
13371 				assert(copy_object != VM_OBJECT_NULL);
13372 				assert(copy_object != sub_object);
13373 				object_copied = TRUE;
13374 				vm_map_lookup_locked_copy_strategically_count++;
13375 				vm_map_lookup_locked_copy_strategically_size += submap_entry_size;
13376 				if (submap_entry_size > vm_map_lookup_locked_copy_strategically_max) {
13377 					vm_map_lookup_locked_copy_strategically_max = submap_entry_size;
13378 				}
13379 			} else {
13380 				/* set up shadow object */
13381 				object_copied = FALSE;
13382 				copy_object = sub_object;
13383 				vm_object_lock(sub_object);
13384 				vm_object_reference_locked(sub_object);
13385 				sub_object->shadowed = TRUE;
13386 				vm_object_unlock(sub_object);
13387 
13388 				assert(submap_entry->wired_count == 0);
13389 				submap_entry->needs_copy = TRUE;
13390 
13391 				prot = submap_entry->protection;
13392 				assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
13393 				prot = prot & ~VM_PROT_WRITE;
13394 				assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
13395 
13396 				if (override_nx(old_map,
13397 				    VME_ALIAS(submap_entry))
13398 				    && prot) {
13399 					prot |= VM_PROT_EXECUTE;
13400 				}
13401 
13402 				vm_object_pmap_protect(
13403 					sub_object,
13404 					VME_OFFSET(submap_entry),
13405 					submap_entry->vme_end -
13406 					submap_entry->vme_start,
13407 					(submap_entry->is_shared
13408 					|| map->mapped_in_other_pmaps) ?
13409 					PMAP_NULL : map->pmap,
13410 					VM_MAP_PAGE_SIZE(map),
13411 					submap_entry->vme_start,
13412 					prot);
13413 				vm_map_lookup_locked_copy_shadow_count++;
13414 				vm_map_lookup_locked_copy_shadow_size += submap_entry_size;
13415 				if (submap_entry_size > vm_map_lookup_locked_copy_shadow_max) {
13416 					vm_map_lookup_locked_copy_shadow_max = submap_entry_size;
13417 				}
13418 			}
13419 
13420 			/*
13421 			 * Adjust the fault offset to the submap entry.
13422 			 */
13423 			copy_offset = (local_vaddr -
13424 			    submap_entry->vme_start +
13425 			    VME_OFFSET(submap_entry));
13426 
13427 			/* This works diffently than the   */
13428 			/* normal submap case. We go back  */
13429 			/* to the parent of the cow map and*/
13430 			/* clip out the target portion of  */
13431 			/* the sub_map, substituting the   */
13432 			/* new copy object,                */
13433 
13434 			subentry_protection = submap_entry->protection;
13435 			subentry_max_protection = submap_entry->max_protection;
13436 			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
13437 			vm_map_unlock(map);
13438 			submap_entry = NULL; /* not valid after map unlock */
13439 
13440 			local_start = old_start;
13441 			local_end = old_end;
13442 			map = cow_sub_map_parent;
13443 			*var_map = cow_sub_map_parent;
13444 			vaddr = cow_parent_vaddr;
13445 			cow_sub_map_parent = NULL;
13446 
13447 			if (!vm_map_lookup_entry(map,
13448 			    vaddr, &entry)) {
13449 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13450 					vm_map_unlock(cow_sub_map_parent);
13451 				}
13452 				if ((*real_map != map)
13453 				    && (*real_map != cow_sub_map_parent)) {
13454 					vm_map_unlock(*real_map);
13455 				}
13456 				*real_map = map;
13457 				vm_object_deallocate(
13458 					copy_object);
13459 				copy_object = VM_OBJECT_NULL;
13460 				vm_map_lock_write_to_read(map);
13461 				DTRACE_VM4(submap_lookup_post_unlock,
13462 				    uint64_t, (uint64_t)entry->vme_start,
13463 				    uint64_t, (uint64_t)entry->vme_end,
13464 				    vm_map_offset_t, vaddr,
13465 				    int, object_copied);
13466 				return KERN_INVALID_ADDRESS;
13467 			}
13468 
13469 			/* clip out the portion of space */
13470 			/* mapped by the sub map which   */
13471 			/* corresponds to the underlying */
13472 			/* object */
13473 
13474 			/*
13475 			 * Clip (and unnest) the smallest nested chunk
13476 			 * possible around the faulting address...
13477 			 */
13478 			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
13479 			local_end = local_start + pmap_shared_region_size_min(map->pmap);
13480 			/*
13481 			 * ... but don't go beyond the "old_start" to "old_end"
13482 			 * range, to avoid spanning over another VM region
13483 			 * with a possibly different VM object and/or offset.
13484 			 */
13485 			if (local_start < old_start) {
13486 				local_start = old_start;
13487 			}
13488 			if (local_end > old_end) {
13489 				local_end = old_end;
13490 			}
13491 			/*
13492 			 * Adjust copy_offset to the start of the range.
13493 			 */
13494 			copy_offset -= (vaddr - local_start);
13495 
13496 			vm_map_clip_start(map, entry, local_start);
13497 			vm_map_clip_end(map, entry, local_end);
13498 			if (entry->is_sub_map) {
13499 				/* unnesting was done when clipping */
13500 				assert(!entry->use_pmap);
13501 			}
13502 
13503 			/* substitute copy object for */
13504 			/* shared map entry           */
13505 			vm_map_deallocate(VME_SUBMAP(entry));
13506 			assert(!entry->iokit_acct);
13507 			entry->is_sub_map = FALSE;
13508 			entry->use_pmap = TRUE;
13509 			VME_OBJECT_SET(entry, copy_object);
13510 
13511 			/* propagate the submap entry's protections */
13512 			if (entry->protection != VM_PROT_READ) {
13513 				/*
13514 				 * Someone has already altered the top entry's
13515 				 * protections via vm_protect(VM_PROT_COPY).
13516 				 * Respect these new values and ignore the
13517 				 * submap entry's protections.
13518 				 */
13519 			} else {
13520 				/*
13521 				 * Regular copy-on-write: propagate the submap
13522 				 * entry's protections to the top map entry.
13523 				 */
13524 				entry->protection |= subentry_protection;
13525 			}
13526 			entry->max_protection |= subentry_max_protection;
13527 			/* propagate no_copy_on_read */
13528 			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
13529 
13530 			if ((entry->protection & VM_PROT_WRITE) &&
13531 			    (entry->protection & VM_PROT_EXECUTE) &&
13532 #if XNU_TARGET_OS_OSX
13533 			    map->pmap != kernel_pmap &&
13534 			    (vm_map_cs_enforcement(map)
13535 #if __arm64__
13536 			    || !VM_MAP_IS_EXOTIC(map)
13537 #endif /* __arm64__ */
13538 			    ) &&
13539 #endif /* XNU_TARGET_OS_OSX */
13540 			    !(entry->used_for_jit) &&
13541 			    VM_MAP_POLICY_WX_STRIP_X(map)) {
13542 				DTRACE_VM3(cs_wx,
13543 				    uint64_t, (uint64_t)entry->vme_start,
13544 				    uint64_t, (uint64_t)entry->vme_end,
13545 				    vm_prot_t, entry->protection);
13546 				printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
13547 				    proc_selfpid(),
13548 				    (current_task()->bsd_info
13549 				    ? proc_name_address(current_task()->bsd_info)
13550 				    : "?"),
13551 				    __FUNCTION__);
13552 				entry->protection &= ~VM_PROT_EXECUTE;
13553 			}
13554 
13555 			if (object_copied) {
13556 				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
13557 				entry->needs_copy = object_copied_needs_copy;
13558 				entry->is_shared = FALSE;
13559 			} else {
13560 				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
13561 				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
13562 				assert(entry->wired_count == 0);
13563 				VME_OFFSET_SET(entry, copy_offset);
13564 				entry->needs_copy = TRUE;
13565 				if (map != old_map) {
13566 					entry->is_shared = TRUE;
13567 				}
13568 			}
13569 			if (entry->inheritance == VM_INHERIT_SHARE) {
13570 				entry->inheritance = VM_INHERIT_COPY;
13571 			}
13572 
13573 			vm_map_lock_write_to_read(map);
13574 		} else {
13575 			if ((cow_sub_map_parent)
13576 			    && (cow_sub_map_parent != *real_map)
13577 			    && (cow_sub_map_parent != map)) {
13578 				vm_map_unlock(cow_sub_map_parent);
13579 			}
13580 			entry = submap_entry;
13581 			vaddr = local_vaddr;
13582 		}
13583 	}
13584 
13585 	/*
13586 	 *	Check whether this task is allowed to have
13587 	 *	this page.
13588 	 */
13589 
13590 	prot = entry->protection;
13591 
13592 	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
13593 		/*
13594 		 * HACK -- if not a stack, then allow execution
13595 		 */
13596 		prot |= VM_PROT_EXECUTE;
13597 	}
13598 
13599 	if (mask_protections) {
13600 		fault_type &= prot;
13601 		if (fault_type == VM_PROT_NONE) {
13602 			goto protection_failure;
13603 		}
13604 	}
13605 	if (((fault_type & prot) != fault_type)
13606 #if __arm64__
13607 	    /* prefetch abort in execute-only page */
13608 	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
13609 #elif defined(__x86_64__)
13610 	    /* Consider the UEXEC bit when handling an EXECUTE fault */
13611 	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
13612 #endif
13613 	    ) {
13614 protection_failure:
13615 		if (*real_map != map) {
13616 			vm_map_unlock(*real_map);
13617 		}
13618 		*real_map = map;
13619 
13620 		if ((fault_type & VM_PROT_EXECUTE) && prot) {
13621 			log_stack_execution_failure((addr64_t)vaddr, prot);
13622 		}
13623 
13624 		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
13625 		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
13626 		/*
13627 		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
13628 		 *
13629 		 * kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
13630 		 */
13631 		return KERN_PROTECTION_FAILURE;
13632 	}
13633 
13634 	/*
13635 	 *	If this page is not pageable, we have to get
13636 	 *	it for all possible accesses.
13637 	 */
13638 
13639 	*wired = (entry->wired_count != 0);
13640 	if (*wired) {
13641 		fault_type = prot;
13642 	}
13643 
13644 	/*
13645 	 *	If the entry was copy-on-write, we either ...
13646 	 */
13647 
13648 	if (entry->needs_copy) {
13649 		/*
13650 		 *	If we want to write the page, we may as well
13651 		 *	handle that now since we've got the map locked.
13652 		 *
13653 		 *	If we don't need to write the page, we just
13654 		 *	demote the permissions allowed.
13655 		 */
13656 
13657 		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
13658 			/*
13659 			 *	Make a new object, and place it in the
13660 			 *	object chain.  Note that no new references
13661 			 *	have appeared -- one just moved from the
13662 			 *	map to the new object.
13663 			 */
13664 
13665 			if (vm_map_lock_read_to_write(map)) {
13666 				vm_map_lock_read(map);
13667 				goto RetryLookup;
13668 			}
13669 
13670 			if (VME_OBJECT(entry)->shadowed == FALSE) {
13671 				vm_object_lock(VME_OBJECT(entry));
13672 				VME_OBJECT(entry)->shadowed = TRUE;
13673 				vm_object_unlock(VME_OBJECT(entry));
13674 			}
13675 			VME_OBJECT_SHADOW(entry,
13676 			    (vm_map_size_t) (entry->vme_end -
13677 			    entry->vme_start));
13678 			entry->needs_copy = FALSE;
13679 
13680 			vm_map_lock_write_to_read(map);
13681 		}
13682 		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
13683 			/*
13684 			 *	We're attempting to read a copy-on-write
13685 			 *	page -- don't allow writes.
13686 			 */
13687 
13688 			prot &= (~VM_PROT_WRITE);
13689 		}
13690 	}
13691 
13692 	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
13693 		/*
13694 		 * We went through a "needs_copy" submap without triggering
13695 		 * a copy, so granting write access to the page would bypass
13696 		 * that submap's "needs_copy".
13697 		 */
13698 		assert(!(fault_type & VM_PROT_WRITE));
13699 		assert(!*wired);
13700 		assert(!force_copy);
13701 		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
13702 		prot &= ~VM_PROT_WRITE;
13703 	}
13704 
13705 	/*
13706 	 *	Create an object if necessary.
13707 	 */
13708 	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
13709 		if (vm_map_lock_read_to_write(map)) {
13710 			vm_map_lock_read(map);
13711 			goto RetryLookup;
13712 		}
13713 
13714 		VME_OBJECT_SET(entry,
13715 		    vm_object_allocate(
13716 			    (vm_map_size_t)(entry->vme_end -
13717 			    entry->vme_start)));
13718 		VME_OFFSET_SET(entry, 0);
13719 		assert(entry->use_pmap);
13720 		vm_map_lock_write_to_read(map);
13721 	}
13722 
13723 	/*
13724 	 *	Return the object/offset from this entry.  If the entry
13725 	 *	was copy-on-write or empty, it has been fixed up.  Also
13726 	 *	return the protection.
13727 	 */
13728 
13729 	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
13730 	*object = VME_OBJECT(entry);
13731 	*out_prot = prot;
13732 	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
13733 
13734 	if (fault_info) {
13735 		fault_info->interruptible = THREAD_UNINT; /* for now... */
13736 		/* ... the caller will change "interruptible" if needed */
13737 		fault_info->cluster_size = 0;
13738 		fault_info->user_tag = VME_ALIAS(entry);
13739 		fault_info->pmap_options = 0;
13740 		if (entry->iokit_acct ||
13741 		    (!entry->is_sub_map && !entry->use_pmap)) {
13742 			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
13743 		}
13744 		fault_info->behavior = entry->behavior;
13745 		fault_info->lo_offset = VME_OFFSET(entry);
13746 		fault_info->hi_offset =
13747 		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
13748 		fault_info->no_cache  = entry->no_cache;
13749 		fault_info->stealth = FALSE;
13750 		fault_info->io_sync = FALSE;
13751 		if (entry->used_for_jit ||
13752 		    entry->vme_resilient_codesign) {
13753 			fault_info->cs_bypass = TRUE;
13754 		} else {
13755 			fault_info->cs_bypass = FALSE;
13756 		}
13757 		fault_info->pmap_cs_associated = FALSE;
13758 #if CONFIG_PMAP_CS
13759 		if (entry->pmap_cs_associated) {
13760 			/*
13761 			 * The pmap layer will validate this page
13762 			 * before allowing it to be executed from.
13763 			 */
13764 			fault_info->pmap_cs_associated = TRUE;
13765 		}
13766 #endif /* CONFIG_PMAP_CS */
13767 		fault_info->mark_zf_absent = FALSE;
13768 		fault_info->batch_pmap_op = FALSE;
13769 		fault_info->resilient_media = entry->vme_resilient_media;
13770 		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
13771 		if (entry->translated_allow_execute) {
13772 			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
13773 		}
13774 	}
13775 
13776 	/*
13777 	 *	Lock the object to prevent it from disappearing
13778 	 */
13779 	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
13780 		if (contended == NULL) {
13781 			vm_object_lock(*object);
13782 		} else {
13783 			*contended = vm_object_lock_check_contended(*object);
13784 		}
13785 	} else {
13786 		vm_object_lock_shared(*object);
13787 	}
13788 
13789 	/*
13790 	 *	Save the version number
13791 	 */
13792 
13793 	out_version->main_timestamp = map->timestamp;
13794 
13795 	return KERN_SUCCESS;
13796 }
13797 
13798 
13799 /*
13800  *	vm_map_verify:
13801  *
13802  *	Verifies that the map in question has not changed
13803  *	since the given version. The map has to be locked
13804  *	("shared" mode is fine) before calling this function
13805  *	and it will be returned locked too.
13806  */
13807 boolean_t
vm_map_verify(vm_map_t map,vm_map_version_t * version)13808 vm_map_verify(
13809 	vm_map_t                map,
13810 	vm_map_version_t        *version)       /* REF */
13811 {
13812 	boolean_t       result;
13813 
13814 	vm_map_lock_assert_held(map);
13815 	result = (map->timestamp == version->main_timestamp);
13816 
13817 	return result;
13818 }
13819 
13820 /*
13821  *	TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
13822  *	Goes away after regular vm_region_recurse function migrates to
13823  *	64 bits
13824  *	vm_region_recurse: A form of vm_region which follows the
13825  *	submaps in a target map
13826  *
13827  */
13828 
13829 kern_return_t
vm_map_region_recurse_64(vm_map_t map,vm_map_offset_t * address,vm_map_size_t * size,natural_t * nesting_depth,vm_region_submap_info_64_t submap_info,mach_msg_type_number_t * count)13830 vm_map_region_recurse_64(
13831 	vm_map_t                 map,
13832 	vm_map_offset_t *address,               /* IN/OUT */
13833 	vm_map_size_t           *size,                  /* OUT */
13834 	natural_t               *nesting_depth, /* IN/OUT */
13835 	vm_region_submap_info_64_t      submap_info,    /* IN/OUT */
13836 	mach_msg_type_number_t  *count) /* IN/OUT */
13837 {
13838 	mach_msg_type_number_t  original_count;
13839 	vm_region_extended_info_data_t  extended;
13840 	vm_map_entry_t                  tmp_entry;
13841 	vm_map_offset_t                 user_address;
13842 	unsigned int                    user_max_depth;
13843 
13844 	/*
13845 	 * "curr_entry" is the VM map entry preceding or including the
13846 	 * address we're looking for.
13847 	 * "curr_map" is the map or sub-map containing "curr_entry".
13848 	 * "curr_address" is the equivalent of the top map's "user_address"
13849 	 * in the current map.
13850 	 * "curr_offset" is the cumulated offset of "curr_map" in the
13851 	 * target task's address space.
13852 	 * "curr_depth" is the depth of "curr_map" in the chain of
13853 	 * sub-maps.
13854 	 *
13855 	 * "curr_max_below" and "curr_max_above" limit the range (around
13856 	 * "curr_address") we should take into account in the current (sub)map.
13857 	 * They limit the range to what's visible through the map entries
13858 	 * we've traversed from the top map to the current map.
13859 	 *
13860 	 */
13861 	vm_map_entry_t                  curr_entry;
13862 	vm_map_address_t                curr_address;
13863 	vm_map_offset_t                 curr_offset;
13864 	vm_map_t                        curr_map;
13865 	unsigned int                    curr_depth;
13866 	vm_map_offset_t                 curr_max_below, curr_max_above;
13867 	vm_map_offset_t                 curr_skip;
13868 
13869 	/*
13870 	 * "next_" is the same as "curr_" but for the VM region immediately
13871 	 * after the address we're looking for.  We need to keep track of this
13872 	 * too because we want to return info about that region if the
13873 	 * address we're looking for is not mapped.
13874 	 */
13875 	vm_map_entry_t                  next_entry;
13876 	vm_map_offset_t                 next_offset;
13877 	vm_map_offset_t                 next_address;
13878 	vm_map_t                        next_map;
13879 	unsigned int                    next_depth;
13880 	vm_map_offset_t                 next_max_below, next_max_above;
13881 	vm_map_offset_t                 next_skip;
13882 
13883 	boolean_t                       look_for_pages;
13884 	vm_region_submap_short_info_64_t short_info;
13885 	boolean_t                       do_region_footprint;
13886 	int                             effective_page_size, effective_page_shift;
13887 	boolean_t                       submap_needed_copy;
13888 
13889 	if (map == VM_MAP_NULL) {
13890 		/* no address space to work on */
13891 		return KERN_INVALID_ARGUMENT;
13892 	}
13893 
13894 	effective_page_shift = vm_self_region_page_shift(map);
13895 	effective_page_size = (1 << effective_page_shift);
13896 
13897 	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
13898 		/*
13899 		 * "info" structure is not big enough and
13900 		 * would overflow
13901 		 */
13902 		return KERN_INVALID_ARGUMENT;
13903 	}
13904 
13905 	do_region_footprint = task_self_region_footprint();
13906 	original_count = *count;
13907 
13908 	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
13909 		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
13910 		look_for_pages = FALSE;
13911 		short_info = (vm_region_submap_short_info_64_t) submap_info;
13912 		submap_info = NULL;
13913 	} else {
13914 		look_for_pages = TRUE;
13915 		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
13916 		short_info = NULL;
13917 
13918 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
13919 			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
13920 		}
13921 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
13922 			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
13923 		}
13924 	}
13925 
13926 	user_address = *address;
13927 	user_max_depth = *nesting_depth;
13928 	submap_needed_copy = FALSE;
13929 
13930 	if (not_in_kdp) {
13931 		vm_map_lock_read(map);
13932 	}
13933 
13934 recurse_again:
13935 	curr_entry = NULL;
13936 	curr_map = map;
13937 	curr_address = user_address;
13938 	curr_offset = 0;
13939 	curr_skip = 0;
13940 	curr_depth = 0;
13941 	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
13942 	curr_max_below = curr_address;
13943 
13944 	next_entry = NULL;
13945 	next_map = NULL;
13946 	next_address = 0;
13947 	next_offset = 0;
13948 	next_skip = 0;
13949 	next_depth = 0;
13950 	next_max_above = (vm_map_offset_t) -1;
13951 	next_max_below = (vm_map_offset_t) -1;
13952 
13953 	for (;;) {
13954 		if (vm_map_lookup_entry(curr_map,
13955 		    curr_address,
13956 		    &tmp_entry)) {
13957 			/* tmp_entry contains the address we're looking for */
13958 			curr_entry = tmp_entry;
13959 		} else {
13960 			vm_map_offset_t skip;
13961 			/*
13962 			 * The address is not mapped.  "tmp_entry" is the
13963 			 * map entry preceding the address.  We want the next
13964 			 * one, if it exists.
13965 			 */
13966 			curr_entry = tmp_entry->vme_next;
13967 
13968 			if (curr_entry == vm_map_to_entry(curr_map) ||
13969 			    (curr_entry->vme_start >=
13970 			    curr_address + curr_max_above)) {
13971 				/* no next entry at this level: stop looking */
13972 				if (not_in_kdp) {
13973 					vm_map_unlock_read(curr_map);
13974 				}
13975 				curr_entry = NULL;
13976 				curr_map = NULL;
13977 				curr_skip = 0;
13978 				curr_offset = 0;
13979 				curr_depth = 0;
13980 				curr_max_above = 0;
13981 				curr_max_below = 0;
13982 				break;
13983 			}
13984 
13985 			/* adjust current address and offset */
13986 			skip = curr_entry->vme_start - curr_address;
13987 			curr_address = curr_entry->vme_start;
13988 			curr_skip += skip;
13989 			curr_offset += skip;
13990 			curr_max_above -= skip;
13991 			curr_max_below = 0;
13992 		}
13993 
13994 		/*
13995 		 * Is the next entry at this level closer to the address (or
13996 		 * deeper in the submap chain) than the one we had
13997 		 * so far ?
13998 		 */
13999 		tmp_entry = curr_entry->vme_next;
14000 		if (tmp_entry == vm_map_to_entry(curr_map)) {
14001 			/* no next entry at this level */
14002 		} else if (tmp_entry->vme_start >=
14003 		    curr_address + curr_max_above) {
14004 			/*
14005 			 * tmp_entry is beyond the scope of what we mapped of
14006 			 * this submap in the upper level: ignore it.
14007 			 */
14008 		} else if ((next_entry == NULL) ||
14009 		    (tmp_entry->vme_start + curr_offset <=
14010 		    next_entry->vme_start + next_offset)) {
14011 			/*
14012 			 * We didn't have a "next_entry" or this one is
14013 			 * closer to the address we're looking for:
14014 			 * use this "tmp_entry" as the new "next_entry".
14015 			 */
14016 			if (next_entry != NULL) {
14017 				/* unlock the last "next_map" */
14018 				if (next_map != curr_map && not_in_kdp) {
14019 					vm_map_unlock_read(next_map);
14020 				}
14021 			}
14022 			next_entry = tmp_entry;
14023 			next_map = curr_map;
14024 			next_depth = curr_depth;
14025 			next_address = next_entry->vme_start;
14026 			next_skip = curr_skip;
14027 			next_skip += (next_address - curr_address);
14028 			next_offset = curr_offset;
14029 			next_offset += (next_address - curr_address);
14030 			next_max_above = MIN(next_max_above, curr_max_above);
14031 			next_max_above = MIN(next_max_above,
14032 			    next_entry->vme_end - next_address);
14033 			next_max_below = MIN(next_max_below, curr_max_below);
14034 			next_max_below = MIN(next_max_below,
14035 			    next_address - next_entry->vme_start);
14036 		}
14037 
14038 		/*
14039 		 * "curr_max_{above,below}" allow us to keep track of the
14040 		 * portion of the submap that is actually mapped at this level:
14041 		 * the rest of that submap is irrelevant to us, since it's not
14042 		 * mapped here.
14043 		 * The relevant portion of the map starts at
14044 		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
14045 		 */
14046 		curr_max_above = MIN(curr_max_above,
14047 		    curr_entry->vme_end - curr_address);
14048 		curr_max_below = MIN(curr_max_below,
14049 		    curr_address - curr_entry->vme_start);
14050 
14051 		if (!curr_entry->is_sub_map ||
14052 		    curr_depth >= user_max_depth) {
14053 			/*
14054 			 * We hit a leaf map or we reached the maximum depth
14055 			 * we could, so stop looking.  Keep the current map
14056 			 * locked.
14057 			 */
14058 			break;
14059 		}
14060 
14061 		/*
14062 		 * Get down to the next submap level.
14063 		 */
14064 
14065 		if (curr_entry->needs_copy) {
14066 			/* everything below this is effectively copy-on-write */
14067 			submap_needed_copy = TRUE;
14068 		}
14069 
14070 		/*
14071 		 * Lock the next level and unlock the current level,
14072 		 * unless we need to keep it locked to access the "next_entry"
14073 		 * later.
14074 		 */
14075 		if (not_in_kdp) {
14076 			vm_map_lock_read(VME_SUBMAP(curr_entry));
14077 		}
14078 		if (curr_map == next_map) {
14079 			/* keep "next_map" locked in case we need it */
14080 		} else {
14081 			/* release this map */
14082 			if (not_in_kdp) {
14083 				vm_map_unlock_read(curr_map);
14084 			}
14085 		}
14086 
14087 		/*
14088 		 * Adjust the offset.  "curr_entry" maps the submap
14089 		 * at relative address "curr_entry->vme_start" in the
14090 		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
14091 		 * bytes of the submap.
14092 		 * "curr_offset" always represents the offset of a virtual
14093 		 * address in the curr_map relative to the absolute address
14094 		 * space (i.e. the top-level VM map).
14095 		 */
14096 		curr_offset +=
14097 		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
14098 		curr_address = user_address + curr_offset;
14099 		/* switch to the submap */
14100 		curr_map = VME_SUBMAP(curr_entry);
14101 		curr_depth++;
14102 		curr_entry = NULL;
14103 	}
14104 
14105 // LP64todo: all the current tools are 32bit, obviously never worked for 64b
14106 // so probably should be a real 32b ID vs. ptr.
14107 // Current users just check for equality
14108 
14109 	if (curr_entry == NULL) {
14110 		/* no VM region contains the address... */
14111 
14112 		if (do_region_footprint && /* we want footprint numbers */
14113 		    next_entry == NULL && /* & there are no more regions */
14114 		    /* & we haven't already provided our fake region: */
14115 		    user_address <= vm_map_last_entry(map)->vme_end) {
14116 			ledger_amount_t ledger_resident, ledger_compressed;
14117 
14118 			/*
14119 			 * Add a fake memory region to account for
14120 			 * purgeable and/or ledger-tagged memory that
14121 			 * counts towards this task's memory footprint,
14122 			 * i.e. the resident/compressed pages of non-volatile
14123 			 * objects owned by that task.
14124 			 */
14125 			task_ledgers_footprint(map->pmap->ledger,
14126 			    &ledger_resident,
14127 			    &ledger_compressed);
14128 			if (ledger_resident + ledger_compressed == 0) {
14129 				/* no purgeable memory usage to report */
14130 				return KERN_INVALID_ADDRESS;
14131 			}
14132 			/* fake region to show nonvolatile footprint */
14133 			if (look_for_pages) {
14134 				submap_info->protection = VM_PROT_DEFAULT;
14135 				submap_info->max_protection = VM_PROT_DEFAULT;
14136 				submap_info->inheritance = VM_INHERIT_DEFAULT;
14137 				submap_info->offset = 0;
14138 				submap_info->user_tag = -1;
14139 				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
14140 				submap_info->pages_shared_now_private = 0;
14141 				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
14142 				submap_info->pages_dirtied = submap_info->pages_resident;
14143 				submap_info->ref_count = 1;
14144 				submap_info->shadow_depth = 0;
14145 				submap_info->external_pager = 0;
14146 				submap_info->share_mode = SM_PRIVATE;
14147 				if (submap_needed_copy) {
14148 					submap_info->share_mode = SM_COW;
14149 				}
14150 				submap_info->is_submap = 0;
14151 				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
14152 				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
14153 				submap_info->user_wired_count = 0;
14154 				submap_info->pages_reusable = 0;
14155 			} else {
14156 				short_info->user_tag = -1;
14157 				short_info->offset = 0;
14158 				short_info->protection = VM_PROT_DEFAULT;
14159 				short_info->inheritance = VM_INHERIT_DEFAULT;
14160 				short_info->max_protection = VM_PROT_DEFAULT;
14161 				short_info->behavior = VM_BEHAVIOR_DEFAULT;
14162 				short_info->user_wired_count = 0;
14163 				short_info->is_submap = 0;
14164 				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
14165 				short_info->external_pager = 0;
14166 				short_info->shadow_depth = 0;
14167 				short_info->share_mode = SM_PRIVATE;
14168 				if (submap_needed_copy) {
14169 					short_info->share_mode = SM_COW;
14170 				}
14171 				short_info->ref_count = 1;
14172 			}
14173 			*nesting_depth = 0;
14174 			*size = (vm_map_size_t) (ledger_resident + ledger_compressed);
14175 //			*address = user_address;
14176 			*address = vm_map_last_entry(map)->vme_end;
14177 			return KERN_SUCCESS;
14178 		}
14179 
14180 		if (next_entry == NULL) {
14181 			/* ... and no VM region follows it either */
14182 			return KERN_INVALID_ADDRESS;
14183 		}
14184 		/* ... gather info about the next VM region */
14185 		curr_entry = next_entry;
14186 		curr_map = next_map;    /* still locked ... */
14187 		curr_address = next_address;
14188 		curr_skip = next_skip;
14189 		curr_offset = next_offset;
14190 		curr_depth = next_depth;
14191 		curr_max_above = next_max_above;
14192 		curr_max_below = next_max_below;
14193 	} else {
14194 		/* we won't need "next_entry" after all */
14195 		if (next_entry != NULL) {
14196 			/* release "next_map" */
14197 			if (next_map != curr_map && not_in_kdp) {
14198 				vm_map_unlock_read(next_map);
14199 			}
14200 		}
14201 	}
14202 	next_entry = NULL;
14203 	next_map = NULL;
14204 	next_offset = 0;
14205 	next_skip = 0;
14206 	next_depth = 0;
14207 	next_max_below = -1;
14208 	next_max_above = -1;
14209 
14210 	if (curr_entry->is_sub_map &&
14211 	    curr_depth < user_max_depth) {
14212 		/*
14213 		 * We're not as deep as we could be:  we must have
14214 		 * gone back up after not finding anything mapped
14215 		 * below the original top-level map entry's.
14216 		 * Let's move "curr_address" forward and recurse again.
14217 		 */
14218 		user_address = curr_address;
14219 		goto recurse_again;
14220 	}
14221 
14222 	*nesting_depth = curr_depth;
14223 	*size = curr_max_above + curr_max_below;
14224 	*address = user_address + curr_skip - curr_max_below;
14225 
14226 	if (look_for_pages) {
14227 		submap_info->user_tag = VME_ALIAS(curr_entry);
14228 		submap_info->offset = VME_OFFSET(curr_entry);
14229 		submap_info->protection = curr_entry->protection;
14230 		submap_info->inheritance = curr_entry->inheritance;
14231 		submap_info->max_protection = curr_entry->max_protection;
14232 		submap_info->behavior = curr_entry->behavior;
14233 		submap_info->user_wired_count = curr_entry->user_wired_count;
14234 		submap_info->is_submap = curr_entry->is_sub_map;
14235 		submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
14236 	} else {
14237 		short_info->user_tag = VME_ALIAS(curr_entry);
14238 		short_info->offset = VME_OFFSET(curr_entry);
14239 		short_info->protection = curr_entry->protection;
14240 		short_info->inheritance = curr_entry->inheritance;
14241 		short_info->max_protection = curr_entry->max_protection;
14242 		short_info->behavior = curr_entry->behavior;
14243 		short_info->user_wired_count = curr_entry->user_wired_count;
14244 		short_info->is_submap = curr_entry->is_sub_map;
14245 		short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
14246 	}
14247 
14248 	extended.pages_resident = 0;
14249 	extended.pages_swapped_out = 0;
14250 	extended.pages_shared_now_private = 0;
14251 	extended.pages_dirtied = 0;
14252 	extended.pages_reusable = 0;
14253 	extended.external_pager = 0;
14254 	extended.shadow_depth = 0;
14255 	extended.share_mode = SM_EMPTY;
14256 	extended.ref_count = 0;
14257 
14258 	if (not_in_kdp) {
14259 		if (!curr_entry->is_sub_map) {
14260 			vm_map_offset_t range_start, range_end;
14261 			range_start = MAX((curr_address - curr_max_below),
14262 			    curr_entry->vme_start);
14263 			range_end = MIN((curr_address + curr_max_above),
14264 			    curr_entry->vme_end);
14265 			vm_map_region_walk(curr_map,
14266 			    range_start,
14267 			    curr_entry,
14268 			    (VME_OFFSET(curr_entry) +
14269 			    (range_start -
14270 			    curr_entry->vme_start)),
14271 			    range_end - range_start,
14272 			    &extended,
14273 			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
14274 			if (extended.external_pager &&
14275 			    extended.ref_count == 2 &&
14276 			    extended.share_mode == SM_SHARED) {
14277 				extended.share_mode = SM_PRIVATE;
14278 			}
14279 			if (submap_needed_copy) {
14280 				extended.share_mode = SM_COW;
14281 			}
14282 		} else {
14283 			if (curr_entry->use_pmap) {
14284 				extended.share_mode = SM_TRUESHARED;
14285 			} else {
14286 				extended.share_mode = SM_PRIVATE;
14287 			}
14288 			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
14289 		}
14290 	}
14291 
14292 	if (look_for_pages) {
14293 		submap_info->pages_resident = extended.pages_resident;
14294 		submap_info->pages_swapped_out = extended.pages_swapped_out;
14295 		submap_info->pages_shared_now_private =
14296 		    extended.pages_shared_now_private;
14297 		submap_info->pages_dirtied = extended.pages_dirtied;
14298 		submap_info->external_pager = extended.external_pager;
14299 		submap_info->shadow_depth = extended.shadow_depth;
14300 		submap_info->share_mode = extended.share_mode;
14301 		submap_info->ref_count = extended.ref_count;
14302 
14303 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
14304 			submap_info->pages_reusable = extended.pages_reusable;
14305 		}
14306 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
14307 			submap_info->object_id_full = (vm_object_id_t) (VME_OBJECT(curr_entry) != NULL) ? VM_KERNEL_ADDRPERM(VME_OBJECT(curr_entry)) : 0ULL;
14308 		}
14309 	} else {
14310 		short_info->external_pager = extended.external_pager;
14311 		short_info->shadow_depth = extended.shadow_depth;
14312 		short_info->share_mode = extended.share_mode;
14313 		short_info->ref_count = extended.ref_count;
14314 	}
14315 
14316 	if (not_in_kdp) {
14317 		vm_map_unlock_read(curr_map);
14318 	}
14319 
14320 	return KERN_SUCCESS;
14321 }
14322 
14323 /*
14324  *	vm_region:
14325  *
14326  *	User call to obtain information about a region in
14327  *	a task's address map. Currently, only one flavor is
14328  *	supported.
14329  *
14330  *	XXX The reserved and behavior fields cannot be filled
14331  *	    in until the vm merge from the IK is completed, and
14332  *	    vm_reserve is implemented.
14333  */
14334 
14335 kern_return_t
vm_map_region(vm_map_t map,vm_map_offset_t * address,vm_map_size_t * size,vm_region_flavor_t flavor,vm_region_info_t info,mach_msg_type_number_t * count,mach_port_t * object_name)14336 vm_map_region(
14337 	vm_map_t                 map,
14338 	vm_map_offset_t *address,               /* IN/OUT */
14339 	vm_map_size_t           *size,                  /* OUT */
14340 	vm_region_flavor_t       flavor,                /* IN */
14341 	vm_region_info_t         info,                  /* OUT */
14342 	mach_msg_type_number_t  *count, /* IN/OUT */
14343 	mach_port_t             *object_name)           /* OUT */
14344 {
14345 	vm_map_entry_t          tmp_entry;
14346 	vm_map_entry_t          entry;
14347 	vm_map_offset_t         start;
14348 
14349 	if (map == VM_MAP_NULL) {
14350 		return KERN_INVALID_ARGUMENT;
14351 	}
14352 
14353 	switch (flavor) {
14354 	case VM_REGION_BASIC_INFO:
14355 		/* legacy for old 32-bit objects info */
14356 	{
14357 		vm_region_basic_info_t  basic;
14358 
14359 		if (*count < VM_REGION_BASIC_INFO_COUNT) {
14360 			return KERN_INVALID_ARGUMENT;
14361 		}
14362 
14363 		basic = (vm_region_basic_info_t) info;
14364 		*count = VM_REGION_BASIC_INFO_COUNT;
14365 
14366 		vm_map_lock_read(map);
14367 
14368 		start = *address;
14369 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
14370 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
14371 				vm_map_unlock_read(map);
14372 				return KERN_INVALID_ADDRESS;
14373 			}
14374 		} else {
14375 			entry = tmp_entry;
14376 		}
14377 
14378 		start = entry->vme_start;
14379 
14380 		basic->offset = (uint32_t)VME_OFFSET(entry);
14381 		basic->protection = entry->protection;
14382 		basic->inheritance = entry->inheritance;
14383 		basic->max_protection = entry->max_protection;
14384 		basic->behavior = entry->behavior;
14385 		basic->user_wired_count = entry->user_wired_count;
14386 		basic->reserved = entry->is_sub_map;
14387 		*address = start;
14388 		*size = (entry->vme_end - start);
14389 
14390 		if (object_name) {
14391 			*object_name = IP_NULL;
14392 		}
14393 		if (entry->is_sub_map) {
14394 			basic->shared = FALSE;
14395 		} else {
14396 			basic->shared = entry->is_shared;
14397 		}
14398 
14399 		vm_map_unlock_read(map);
14400 		return KERN_SUCCESS;
14401 	}
14402 
14403 	case VM_REGION_BASIC_INFO_64:
14404 	{
14405 		vm_region_basic_info_64_t       basic;
14406 
14407 		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
14408 			return KERN_INVALID_ARGUMENT;
14409 		}
14410 
14411 		basic = (vm_region_basic_info_64_t) info;
14412 		*count = VM_REGION_BASIC_INFO_COUNT_64;
14413 
14414 		vm_map_lock_read(map);
14415 
14416 		start = *address;
14417 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
14418 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
14419 				vm_map_unlock_read(map);
14420 				return KERN_INVALID_ADDRESS;
14421 			}
14422 		} else {
14423 			entry = tmp_entry;
14424 		}
14425 
14426 		start = entry->vme_start;
14427 
14428 		basic->offset = VME_OFFSET(entry);
14429 		basic->protection = entry->protection;
14430 		basic->inheritance = entry->inheritance;
14431 		basic->max_protection = entry->max_protection;
14432 		basic->behavior = entry->behavior;
14433 		basic->user_wired_count = entry->user_wired_count;
14434 		basic->reserved = entry->is_sub_map;
14435 		*address = start;
14436 		*size = (entry->vme_end - start);
14437 
14438 		if (object_name) {
14439 			*object_name = IP_NULL;
14440 		}
14441 		if (entry->is_sub_map) {
14442 			basic->shared = FALSE;
14443 		} else {
14444 			basic->shared = entry->is_shared;
14445 		}
14446 
14447 		vm_map_unlock_read(map);
14448 		return KERN_SUCCESS;
14449 	}
14450 	case VM_REGION_EXTENDED_INFO:
14451 		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
14452 			return KERN_INVALID_ARGUMENT;
14453 		}
14454 		OS_FALLTHROUGH;
14455 	case VM_REGION_EXTENDED_INFO__legacy:
14456 		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
14457 			return KERN_INVALID_ARGUMENT;
14458 		}
14459 
14460 		{
14461 			vm_region_extended_info_t       extended;
14462 			mach_msg_type_number_t original_count;
14463 			int effective_page_size, effective_page_shift;
14464 
14465 			extended = (vm_region_extended_info_t) info;
14466 
14467 			effective_page_shift = vm_self_region_page_shift(map);
14468 			effective_page_size = (1 << effective_page_shift);
14469 
14470 			vm_map_lock_read(map);
14471 
14472 			start = *address;
14473 			if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
14474 				if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
14475 					vm_map_unlock_read(map);
14476 					return KERN_INVALID_ADDRESS;
14477 				}
14478 			} else {
14479 				entry = tmp_entry;
14480 			}
14481 			start = entry->vme_start;
14482 
14483 			extended->protection = entry->protection;
14484 			extended->user_tag = VME_ALIAS(entry);
14485 			extended->pages_resident = 0;
14486 			extended->pages_swapped_out = 0;
14487 			extended->pages_shared_now_private = 0;
14488 			extended->pages_dirtied = 0;
14489 			extended->external_pager = 0;
14490 			extended->shadow_depth = 0;
14491 
14492 			original_count = *count;
14493 			if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
14494 				*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
14495 			} else {
14496 				extended->pages_reusable = 0;
14497 				*count = VM_REGION_EXTENDED_INFO_COUNT;
14498 			}
14499 
14500 			vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
14501 
14502 			if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
14503 				extended->share_mode = SM_PRIVATE;
14504 			}
14505 
14506 			if (object_name) {
14507 				*object_name = IP_NULL;
14508 			}
14509 			*address = start;
14510 			*size = (entry->vme_end - start);
14511 
14512 			vm_map_unlock_read(map);
14513 			return KERN_SUCCESS;
14514 		}
14515 	case VM_REGION_TOP_INFO:
14516 	{
14517 		vm_region_top_info_t    top;
14518 
14519 		if (*count < VM_REGION_TOP_INFO_COUNT) {
14520 			return KERN_INVALID_ARGUMENT;
14521 		}
14522 
14523 		top = (vm_region_top_info_t) info;
14524 		*count = VM_REGION_TOP_INFO_COUNT;
14525 
14526 		vm_map_lock_read(map);
14527 
14528 		start = *address;
14529 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
14530 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
14531 				vm_map_unlock_read(map);
14532 				return KERN_INVALID_ADDRESS;
14533 			}
14534 		} else {
14535 			entry = tmp_entry;
14536 		}
14537 		start = entry->vme_start;
14538 
14539 		top->private_pages_resident = 0;
14540 		top->shared_pages_resident = 0;
14541 
14542 		vm_map_region_top_walk(entry, top);
14543 
14544 		if (object_name) {
14545 			*object_name = IP_NULL;
14546 		}
14547 		*address = start;
14548 		*size = (entry->vme_end - start);
14549 
14550 		vm_map_unlock_read(map);
14551 		return KERN_SUCCESS;
14552 	}
14553 	default:
14554 		return KERN_INVALID_ARGUMENT;
14555 	}
14556 }
14557 
/*
 * OBJ_RESIDENT_COUNT:
 *
 * Number of resident pages of "obj" attributable to a mapping of
 * "entry_size" pages.  When the whole object is marked all_reusable,
 * only wired pages count; otherwise reusable pages are subtracted from
 * the resident count.  Clamped to "entry_size".
 */
#define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
	MIN((entry_size),                                               \
	    ((obj)->all_reusable ?                                      \
	     (obj)->wired_page_count :                                  \
	     (obj)->resident_page_count - (obj)->reusable_page_count))
14563 
/*
 * vm_map_region_top_walk:
 *
 * Fill in a VM_REGION_TOP_INFO record for "entry": resident page
 * counts split into private vs. shared, a share mode classification,
 * a reference count, and a (permuted, truncated) object id.
 * Empty entries and submaps report SM_EMPTY.
 */
void
vm_map_region_top_walk(
	vm_map_entry_t             entry,
	vm_region_top_info_t       top)
{
	/* no backing object, or a submap: report an empty region */
	if (VME_OBJECT(entry) == 0 || entry->is_sub_map) {
		top->share_mode = SM_EMPTY;
		top->ref_count = 0;
		top->obj_id = 0;
		return;
	}

	{
		struct  vm_object *obj, *tmp_obj;
		int             ref_count;
		uint32_t        entry_size;

		/* size of the mapping, in pages */
		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);

		obj = VME_OBJECT(entry);

		vm_object_lock(obj);

		/*
		 * Don't count the reference held by an in-progress paging
		 * operation.
		 */
		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}

		assert(obj->reusable_page_count <= obj->resident_page_count);
		if (obj->shadow) {
			/*
			 * Shadowed object: classify as copy-on-write.  The
			 * top object's pages are private if we hold the only
			 * reference, shared otherwise.
			 */
			if (ref_count == 1) {
				top->private_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			}
			top->ref_count  = ref_count;
			top->share_mode = SM_COW;

			/*
			 * Walk down the shadow chain accumulating shared
			 * resident pages and references.  Lock the next
			 * object before unlocking the current one so the
			 * chain can't change underneath us.
			 */
			while ((tmp_obj = obj->shadow)) {
				vm_object_lock(tmp_obj);
				vm_object_unlock(obj);
				obj = tmp_obj;

				if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
					ref_count--;
				}

				assert(obj->reusable_page_count <= obj->resident_page_count);
				top->shared_pages_resident +=
				    OBJ_RESIDENT_COUNT(obj, entry_size);
				/* -1: exclude the shadowing object's reference */
				top->ref_count += ref_count - 1;
			}
		} else {
			if (entry->superpage_size) {
				/* superpage mappings count as fully private */
				top->share_mode = SM_LARGE_PAGE;
				top->shared_pages_resident = 0;
				top->private_pages_resident = entry_size;
			} else if (entry->needs_copy) {
				top->share_mode = SM_COW;
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				/*
				 * ref_count == 2 with a named object still
				 * counts as private (the name holds one ref).
				 */
				if (ref_count == 1 ||
				    (ref_count == 2 && obj->named)) {
					top->share_mode = SM_PRIVATE;
					top->private_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				} else {
					top->share_mode = SM_SHARED;
					top->shared_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				}
			}
			top->ref_count = ref_count;
		}
		/* XXX K64: obj_id will be truncated */
		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRPERM(obj);

		vm_object_unlock(obj);
	}
}
14648 
/*
 * vm_map_region_walk:
 *
 * Gather extended-info statistics for the portion of "entry" covering
 * ["va", "va" + "range"): page-level counts (when "look_for_pages"),
 * shadow chain depth, external pager presence, share mode and a
 * reference count.  "offset" is the offset into the backing object
 * corresponding to "va".  "count" tells us how large the caller's
 * info structure is (the reusable-pages field only exists in the
 * non-legacy structure).
 */
void
vm_map_region_walk(
	vm_map_t                        map,
	vm_map_offset_t                 va,
	vm_map_entry_t                  entry,
	vm_object_offset_t              offset,
	vm_object_size_t                range,
	vm_region_extended_info_t       extended,
	boolean_t                       look_for_pages,
	mach_msg_type_number_t count)
{
	struct vm_object *obj, *tmp_obj;
	vm_map_offset_t       last_offset;
	int               i;
	int               ref_count;
	struct vm_object        *shadow_object;
	unsigned short          shadow_depth;
	boolean_t         do_region_footprint;
	int                     effective_page_size, effective_page_shift;
	vm_map_offset_t         effective_page_mask;

	do_region_footprint = task_self_region_footprint();

	/*
	 * Nothing to report for empty entries, submaps, or
	 * physically-contiguous objects (unless superpage-mapped).
	 */
	if ((VME_OBJECT(entry) == 0) ||
	    (entry->is_sub_map) ||
	    (VME_OBJECT(entry)->phys_contiguous &&
	    !entry->superpage_size)) {
		extended->share_mode = SM_EMPTY;
		extended->ref_count = 0;
		return;
	}

	if (entry->superpage_size) {
		/* superpage mappings: report everything resident */
		extended->shadow_depth = 0;
		extended->share_mode = SM_LARGE_PAGE;
		extended->ref_count = 1;
		extended->external_pager = 0;

		/* TODO4K: Superpage in 4k mode? */
		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
		extended->shadow_depth = 0;
		return;
	}

	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);
	effective_page_mask = effective_page_size - 1;

	offset = vm_map_trunc_page(offset, effective_page_mask);

	obj = VME_OBJECT(entry);

	vm_object_lock(obj);

	/* don't count a paging operation's reference */
	if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
		ref_count--;
	}

	if (look_for_pages) {
		/* examine every page of the requested range */
		for (last_offset = offset + range;
		    offset < last_offset;
		    offset += effective_page_size, va += effective_page_size) {
			if (do_region_footprint) {
				int disp;

				disp = 0;
				if (map->has_corpse_footprint) {
					/*
					 * Query the page info data we saved
					 * while forking the corpse.
					 */
					vm_map_corpse_footprint_query_page_info(
						map,
						va,
						&disp);
				} else {
					/*
					 * Query the pmap.
					 */
					vm_map_footprint_query_page_info(
						map,
						entry,
						va,
						&disp);
				}
				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
					extended->pages_resident++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
					extended->pages_reusable++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
					extended->pages_dirtied++;
				}
				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
					extended->pages_swapped_out++;
				}
				continue;
			}

			vm_map_region_look_for_page(map, va, obj,
			    vm_object_trunc_page(offset), ref_count,
			    0, extended, count);
		}

		if (do_region_footprint) {
			/*
			 * Footprint mode skipped the per-page shadow walk,
			 * so jump into the object-info path below to still
			 * compute shadow depth / pager info.
			 */
			goto collect_object_info;
		}
	} else {
collect_object_info:
		shadow_object = obj->shadow;
		shadow_depth = 0;

		if (!(obj->internal)) {
			extended->external_pager = 1;
		}

		if (shadow_object != VM_OBJECT_NULL) {
			/*
			 * Measure the shadow chain depth, hand-over-hand:
			 * lock the next object before dropping the current.
			 */
			vm_object_lock(shadow_object);
			for (;
			    shadow_object != VM_OBJECT_NULL;
			    shadow_depth++) {
				vm_object_t     next_shadow;

				if (!(shadow_object->internal)) {
					extended->external_pager = 1;
				}

				next_shadow = shadow_object->shadow;
				if (next_shadow) {
					vm_object_lock(next_shadow);
				}
				vm_object_unlock(shadow_object);
				shadow_object = next_shadow;
			}
		}
		extended->shadow_depth = shadow_depth;
	}

	/* classify the share mode from shadowing / refs / true_share */
	if (extended->shadow_depth || entry->needs_copy) {
		extended->share_mode = SM_COW;
	} else {
		if (ref_count == 1) {
			extended->share_mode = SM_PRIVATE;
		} else {
			if (obj->true_share) {
				extended->share_mode = SM_TRUESHARED;
			} else {
				extended->share_mode = SM_SHARED;
			}
		}
	}
	extended->ref_count = ref_count - extended->shadow_depth;

	/* accumulate references held along the shadow chain */
	for (i = 0; i < extended->shadow_depth; i++) {
		if ((tmp_obj = obj->shadow) == 0) {
			break;
		}
		vm_object_lock(tmp_obj);
		vm_object_unlock(obj);

		if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
			ref_count--;
		}

		extended->ref_count += ref_count;
		obj = tmp_obj;
	}
	vm_object_unlock(obj);

	if (extended->share_mode == SM_SHARED) {
		/*
		 * Check whether all of the object's references come from
		 * this map: if so it's only aliased within this address
		 * space, not truly shared with another task.
		 */
		vm_map_entry_t       cur;
		vm_map_entry_t       last;
		int      my_refs;

		obj = VME_OBJECT(entry);
		last = vm_map_to_entry(map);
		my_refs = 0;

		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}
		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
			my_refs += vm_map_region_count_obj_refs(cur, obj);
		}

		if (my_refs == ref_count) {
			extended->share_mode = SM_PRIVATE_ALIASED;
		} else if (my_refs > 1) {
			extended->share_mode = SM_SHARED_ALIASED;
		}
	}
}
14842 
14843 
14844 /* object is locked on entry and locked on return */
14845 
14846 
/*
 * vm_map_region_look_for_page:
 *
 * Account for the page at "offset" in "object" (or in its shadow
 * chain) in the extended-info counters: resident, dirtied, reusable,
 * swapped out, shared-now-private, and the maximum shadow depth seen.
 * "object" (the caller's object) is locked on entry and remains
 * locked on return; intermediate shadow objects are locked and
 * unlocked hand-over-hand as the chain is descended.
 */
static void
vm_map_region_look_for_page(
	__unused vm_map_t               map,
	__unused vm_map_offset_t        va,
	vm_object_t                     object,
	vm_object_offset_t              offset,
	int                             max_refcnt,
	unsigned short                  depth,
	vm_region_extended_info_t       extended,
	mach_msg_type_number_t count)
{
	vm_page_t       p;
	vm_object_t     shadow;
	int             ref_count;
	vm_object_t     caller_object;

	shadow = object->shadow;
	caller_object = object;


	while (TRUE) {
		if (!(object->internal)) {
			extended->external_pager = 1;
		}

		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
			/*
			 * Page found at this level of the chain.
			 * A shadowed page with only one reference would
			 * become private on a copy-on-write fault.
			 */
			if (shadow && (max_refcnt == 1)) {
				extended->pages_shared_now_private++;
			}

			if (!p->vmp_fictitious &&
			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
				extended->pages_dirtied++;
			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
				/* only the non-legacy struct has pages_reusable */
				if (p->vmp_reusable || object->all_reusable) {
					extended->pages_reusable++;
				}
			}

			extended->pages_resident++;

			/* keep the caller's object locked */
			if (object != caller_object) {
				vm_object_unlock(object);
			}

			return;
		}
		if (object->internal &&
		    object->alive &&
		    !object->terminating &&
		    object->pager_ready) {
			if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
			    == VM_EXTERNAL_STATE_EXISTS) {
				/* the pager has that page */
				extended->pages_swapped_out++;
				if (object != caller_object) {
					vm_object_unlock(object);
				}
				return;
			}
		}

		if (shadow) {
			/* descend one level: lock child before releasing parent */
			vm_object_lock(shadow);

			if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
				ref_count--;
			}

			if (++depth > extended->shadow_depth) {
				extended->shadow_depth = depth;
			}

			if (ref_count > max_refcnt) {
				max_refcnt = ref_count;
			}

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			offset = offset + object->vo_shadow_offset;
			object = shadow;
			shadow = object->shadow;
			continue;
		}
		/* bottom of the chain, page not found anywhere */
		if (object != caller_object) {
			vm_object_unlock(object);
		}
		break;
	}
}
14939 
14940 static int
vm_map_region_count_obj_refs(vm_map_entry_t entry,vm_object_t object)14941 vm_map_region_count_obj_refs(
14942 	vm_map_entry_t    entry,
14943 	vm_object_t       object)
14944 {
14945 	int ref_count;
14946 	vm_object_t chk_obj;
14947 	vm_object_t tmp_obj;
14948 
14949 	if (VME_OBJECT(entry) == 0) {
14950 		return 0;
14951 	}
14952 
14953 	if (entry->is_sub_map) {
14954 		return 0;
14955 	} else {
14956 		ref_count = 0;
14957 
14958 		chk_obj = VME_OBJECT(entry);
14959 		vm_object_lock(chk_obj);
14960 
14961 		while (chk_obj) {
14962 			if (chk_obj == object) {
14963 				ref_count++;
14964 			}
14965 			tmp_obj = chk_obj->shadow;
14966 			if (tmp_obj) {
14967 				vm_object_lock(tmp_obj);
14968 			}
14969 			vm_object_unlock(chk_obj);
14970 
14971 			chk_obj = tmp_obj;
14972 		}
14973 	}
14974 	return ref_count;
14975 }
14976 
14977 
14978 /*
14979  *	Routine:	vm_map_simplify
14980  *
14981  *	Description:
14982  *		Attempt to simplify the map representation in
14983  *		the vicinity of the given starting address.
14984  *	Note:
14985  *		This routine is intended primarily to keep the
14986  *		kernel maps more compact -- they generally don't
14987  *		benefit from the "expand a map entry" technology
14988  *		at allocation time because the adjacent entry
14989  *		is often wired down.
14990  */
/*
 * vm_map_simplify_entry:
 *
 * Try to coalesce "this_entry" with its immediately preceding entry.
 * The two entries are merged only if they are virtually contiguous,
 * map adjacent ranges of the same object, and agree on every
 * attribute that affects behavior (protections, wiring, inheritance,
 * flags, ...).  On success, the previous entry is unlinked and
 * disposed of and "this_entry" is extended downward to cover both.
 * The map must be locked (for write) by the caller.
 */
void
vm_map_simplify_entry(
	vm_map_t        map,
	vm_map_entry_t  this_entry)
{
	vm_map_entry_t  prev_entry;

	prev_entry = this_entry->vme_prev;

	if ((this_entry != vm_map_to_entry(map)) &&
	    (prev_entry != vm_map_to_entry(map)) &&

	    /* virtually contiguous */
	    (prev_entry->vme_end == this_entry->vme_start) &&

	    /* same backing object, at adjacent offsets */
	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
	    (VME_OBJECT(prev_entry) == VME_OBJECT(this_entry)) &&
	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
	    prev_entry->vme_start))
	    == VME_OFFSET(this_entry)) &&

	    /* identical behavior-affecting attributes */
	    (prev_entry->behavior == this_entry->behavior) &&
	    (prev_entry->needs_copy == this_entry->needs_copy) &&
	    (prev_entry->protection == this_entry->protection) &&
	    (prev_entry->max_protection == this_entry->max_protection) &&
	    (prev_entry->inheritance == this_entry->inheritance) &&
	    (prev_entry->use_pmap == this_entry->use_pmap) &&
	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
	    (prev_entry->no_cache == this_entry->no_cache) &&
	    (prev_entry->permanent == this_entry->permanent) &&
	    (prev_entry->map_aligned == this_entry->map_aligned) &&
	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
	    (prev_entry->pmap_cs_associated == this_entry->pmap_cs_associated) &&
	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
	    (prev_entry->vme_resilient_codesign ==
	    this_entry->vme_resilient_codesign) &&
	    (prev_entry->vme_resilient_media ==
	    this_entry->vme_resilient_media) &&
	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&

	    /* same wiring */
	    (prev_entry->wired_count == this_entry->wired_count) &&
	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&

	    /* neither entry in a state that forbids merging */
	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
	    (prev_entry->in_transition == FALSE) &&
	    (this_entry->in_transition == FALSE) &&
	    (prev_entry->needs_wakeup == FALSE) &&
	    (this_entry->needs_wakeup == FALSE) &&
	    (prev_entry->is_shared == this_entry->is_shared) &&
	    (prev_entry->superpage_size == FALSE) &&
	    (this_entry->superpage_size == FALSE)
	    ) {
		vm_map_store_entry_unlink(map, prev_entry);
		assert(prev_entry->vme_start < this_entry->vme_end);
		if (prev_entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
			    VM_MAP_PAGE_MASK(map)));
		}
		/* grow "this_entry" downward over the merged range */
		this_entry->vme_start = prev_entry->vme_start;
		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));

		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, this_entry, TRUE);
		}

		/* drop the merged entry's reference on its submap/object */
		if (prev_entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(prev_entry));
		} else {
			vm_object_deallocate(VME_OBJECT(prev_entry));
		}
		vm_map_entry_dispose(prev_entry);
		SAVE_HINT_MAP_WRITE(map, this_entry);
	}
}
15065 
15066 void
vm_map_simplify(vm_map_t map,vm_map_offset_t start)15067 vm_map_simplify(
15068 	vm_map_t        map,
15069 	vm_map_offset_t start)
15070 {
15071 	vm_map_entry_t  this_entry;
15072 
15073 	vm_map_lock(map);
15074 	if (vm_map_lookup_entry(map, start, &this_entry)) {
15075 		vm_map_simplify_entry(map, this_entry);
15076 		vm_map_simplify_entry(map, this_entry->vme_next);
15077 	}
15078 	vm_map_unlock(map);
15079 }
15080 
15081 static void
vm_map_simplify_range(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15082 vm_map_simplify_range(
15083 	vm_map_t        map,
15084 	vm_map_offset_t start,
15085 	vm_map_offset_t end)
15086 {
15087 	vm_map_entry_t  entry;
15088 
15089 	/*
15090 	 * The map should be locked (for "write") by the caller.
15091 	 */
15092 
15093 	if (start >= end) {
15094 		/* invalid address range */
15095 		return;
15096 	}
15097 
15098 	start = vm_map_trunc_page(start,
15099 	    VM_MAP_PAGE_MASK(map));
15100 	end = vm_map_round_page(end,
15101 	    VM_MAP_PAGE_MASK(map));
15102 
15103 	if (!vm_map_lookup_entry(map, start, &entry)) {
15104 		/* "start" is not mapped and "entry" ends before "start" */
15105 		if (entry == vm_map_to_entry(map)) {
15106 			/* start with first entry in the map */
15107 			entry = vm_map_first_entry(map);
15108 		} else {
15109 			/* start with next entry */
15110 			entry = entry->vme_next;
15111 		}
15112 	}
15113 
15114 	while (entry != vm_map_to_entry(map) &&
15115 	    entry->vme_start <= end) {
15116 		/* try and coalesce "entry" with its previous entry */
15117 		vm_map_simplify_entry(map, entry);
15118 		entry = entry->vme_next;
15119 	}
15120 }
15121 
15122 
/*
 *	Routine:	vm_map_machine_attribute
 *	Purpose:
 *		Provide machine-specific attributes to mappings,
 *		such as cacheability etc. for machines that provide
 *		them.  NUMA architectures and machines with big/strange
 *		caches will use this.
 *	Note:
 *		Responsibilities for locking and checking are handled here,
 *		everything else in the pmap module. If any non-volatile
 *		information must be kept, the pmap module should handle
 *		it itself. [This assumes that attributes do not
 *		need to be inherited, which seems ok to me]
 */
kern_return_t
vm_map_machine_attribute(
	vm_map_t                        map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_machine_attribute_t  attribute,
	vm_machine_attribute_val_t* value)              /* IN/OUT */
{
	kern_return_t   ret;
	vm_map_size_t sync_size;
	vm_map_entry_t entry;

	if (start < vm_map_min(map) || end > vm_map_max(map)) {
		return KERN_INVALID_ADDRESS;
	}

	/* Figure how much memory we need to flush (in page increments) */
	sync_size = end - start;

	vm_map_lock(map);

	if (attribute != MATTR_CACHE) {
		/* If we don't have to find physical addresses, we */
		/* don't have to do an explicit traversal here.    */
		ret = pmap_attribute(map->pmap, start, end - start,
		    attribute, value);
		vm_map_unlock(map);
		return ret;
	}

	/*
	 * MATTR_CACHE: we need the physical pages backing the range, so
	 * walk the map entries and their VM objects page by page.
	 */
	ret = KERN_SUCCESS;                                                                             /* Assume it all worked */

	while (sync_size) {
		if (vm_map_lookup_entry(map, start, &entry)) {
			vm_map_size_t   sub_size;
			/* Clamp this pass to the end of the current entry. */
			if ((entry->vme_end - start) > sync_size) {
				sub_size = sync_size;
				sync_size = 0;
			} else {
				sub_size = entry->vme_end - start;
				sync_size -= sub_size;
			}
			if (entry->is_sub_map) {
				vm_map_offset_t sub_start;
				vm_map_offset_t sub_end;

				/* Translate into the submap's address space and recurse. */
				sub_start = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				sub_end = sub_start + sub_size;
				/*
				 * NOTE(review): the recursive call's return value is
				 * discarded, so a failure inside a submap does not
				 * affect "ret" — confirm this is intentional.
				 */
				vm_map_machine_attribute(
					VME_SUBMAP(entry),
					sub_start,
					sub_end,
					attribute, value);
			} else {
				if (VME_OBJECT(entry)) {
					vm_page_t               m;
					vm_object_t             object;
					vm_object_t             base_object;
					vm_object_t             last_object;
					vm_object_offset_t      offset;
					vm_object_offset_t      base_offset;
					vm_map_size_t           range;
					range = sub_size;
					offset = (start - entry->vme_start)
					    + VME_OFFSET(entry);
					offset = vm_object_trunc_page(offset);
					base_offset = offset;
					object = VME_OBJECT(entry);
					base_object = object;
					last_object = NULL;

					vm_object_lock(object);

					while (range) {
						m = vm_page_lookup(
							object, offset);

						if (m && !m->vmp_fictitious) {
							/* Resident page: sync its cache lines. */
							ret =
							    pmap_attribute_cache_sync(
								VM_PAGE_GET_PHYS_PAGE(m),
								PAGE_SIZE,
								attribute, value);
						} else if (object->shadow) {
							/*
							 * Page not resident here: descend the
							 * shadow chain.  Lock the shadow before
							 * dropping the current object's lock so
							 * the chain can't be torn down under us.
							 */
							offset = offset + object->vo_shadow_offset;
							last_object = object;
							object = object->shadow;
							vm_object_lock(last_object->shadow);
							vm_object_unlock(last_object);
							continue;
						}
						if (range < PAGE_SIZE) {
							range = 0;
						} else {
							range -= PAGE_SIZE;
						}

						/* Pop back to the top of the shadow chain. */
						if (base_object != object) {
							vm_object_unlock(object);
							vm_object_lock(base_object);
							object = base_object;
						}
						/* Bump to the next page */
						base_offset += PAGE_SIZE;
						offset = base_offset;
					}
					vm_object_unlock(object);
				}
			}
			start += sub_size;
		} else {
			/* Hole in the requested address range: give up. */
			vm_map_unlock(map);
			return KERN_FAILURE;
		}
	}

	vm_map_unlock(map);

	return ret;
}
15258 
15259 /*
15260  *	vm_map_behavior_set:
15261  *
15262  *	Sets the paging reference behavior of the specified address
15263  *	range in the target map.  Paging reference behavior affects
15264  *	how pagein operations resulting from faults on the map will be
15265  *	clustered.
15266  */
15267 kern_return_t
vm_map_behavior_set(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_behavior_t new_behavior)15268 vm_map_behavior_set(
15269 	vm_map_t        map,
15270 	vm_map_offset_t start,
15271 	vm_map_offset_t end,
15272 	vm_behavior_t   new_behavior)
15273 {
15274 	vm_map_entry_t  entry;
15275 	vm_map_entry_t  temp_entry;
15276 
15277 	if (start > end ||
15278 	    start < vm_map_min(map) ||
15279 	    end > vm_map_max(map)) {
15280 		return KERN_NO_SPACE;
15281 	}
15282 
15283 	switch (new_behavior) {
15284 	/*
15285 	 * This first block of behaviors all set a persistent state on the specified
15286 	 * memory range.  All we have to do here is to record the desired behavior
15287 	 * in the vm_map_entry_t's.
15288 	 */
15289 
15290 	case VM_BEHAVIOR_DEFAULT:
15291 	case VM_BEHAVIOR_RANDOM:
15292 	case VM_BEHAVIOR_SEQUENTIAL:
15293 	case VM_BEHAVIOR_RSEQNTL:
15294 	case VM_BEHAVIOR_ZERO_WIRED_PAGES:
15295 		vm_map_lock(map);
15296 
15297 		/*
15298 		 *	The entire address range must be valid for the map.
15299 		 *      Note that vm_map_range_check() does a
15300 		 *	vm_map_lookup_entry() internally and returns the
15301 		 *	entry containing the start of the address range if
15302 		 *	the entire range is valid.
15303 		 */
15304 		if (vm_map_range_check(map, start, end, &temp_entry)) {
15305 			entry = temp_entry;
15306 			vm_map_clip_start(map, entry, start);
15307 		} else {
15308 			vm_map_unlock(map);
15309 			return KERN_INVALID_ADDRESS;
15310 		}
15311 
15312 		while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
15313 			vm_map_clip_end(map, entry, end);
15314 			if (entry->is_sub_map) {
15315 				assert(!entry->use_pmap);
15316 			}
15317 
15318 			if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
15319 				entry->zero_wired_pages = TRUE;
15320 			} else {
15321 				entry->behavior = new_behavior;
15322 			}
15323 			entry = entry->vme_next;
15324 		}
15325 
15326 		vm_map_unlock(map);
15327 		break;
15328 
15329 	/*
15330 	 * The rest of these are different from the above in that they cause
15331 	 * an immediate action to take place as opposed to setting a behavior that
15332 	 * affects future actions.
15333 	 */
15334 
15335 	case VM_BEHAVIOR_WILLNEED:
15336 		return vm_map_willneed(map, start, end);
15337 
15338 	case VM_BEHAVIOR_DONTNEED:
15339 		return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
15340 
15341 	case VM_BEHAVIOR_FREE:
15342 		return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
15343 
15344 	case VM_BEHAVIOR_REUSABLE:
15345 		return vm_map_reusable_pages(map, start, end);
15346 
15347 	case VM_BEHAVIOR_REUSE:
15348 		return vm_map_reuse_pages(map, start, end);
15349 
15350 	case VM_BEHAVIOR_CAN_REUSE:
15351 		return vm_map_can_reuse(map, start, end);
15352 
15353 #if MACH_ASSERT
15354 	case VM_BEHAVIOR_PAGEOUT:
15355 		return vm_map_pageout(map, start, end);
15356 #endif /* MACH_ASSERT */
15357 
15358 	default:
15359 		return KERN_INVALID_ARGUMENT;
15360 	}
15361 
15362 	return KERN_SUCCESS;
15363 }
15364 
15365 
15366 /*
15367  * Internals for madvise(MADV_WILLNEED) system call.
15368  *
 * The implementation does one of the following:
 * a) read-ahead if the mapping corresponds to a mapped regular file
 * b) fault in the pages (zero-fill, decompress, etc.) if it's an anonymous mapping
15372  */
15373 
15374 
static kern_return_t
vm_map_willneed(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end
	)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	memory_object_t                 pager;
	struct vm_object_fault_info     fault_info = {};
	kern_return_t                   kr;
	vm_object_size_t                len;
	vm_object_offset_t              offset;

	fault_info.interruptible = THREAD_UNINT;        /* ignored value */
	fault_info.behavior      = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.stealth       = TRUE;

	/*
	 * The MADV_WILLNEED operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && start < end;) {
		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.  After that, the offset will always be zero to
		 * correspond to the beginning of the current vm_map_entry.
		 */
		offset = (start - entry->vme_start) + VME_OFFSET(entry);

		/*
		 * Set the length so we don't go beyond the end of the
		 * map_entry or beyond the end of the range we were given.
		 * This range could span also multiple map entries all of which
		 * map different files, so make sure we only do the right amount
		 * of I/O for each object.  Note that it's possible for there
		 * to be multiple map entries all referring to the same object
		 * but with different page permissions, but it's not worth
		 * trying to optimize that case.
		 */
		len = MIN(entry->vme_end - start, end - start);

		if ((vm_size_t) len != len) {
			/* 32-bit overflow */
			len = (vm_size_t) (0 - PAGE_SIZE);
		}
		fault_info.cluster_size = (vm_size_t) len;
		fault_info.lo_offset    = offset;
		fault_info.hi_offset    = offset + len;
		fault_info.user_tag     = VME_ALIAS(entry);
		fault_info.pmap_options = 0;
		if (entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap)) {
			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
		}

		/*
		 * If the entry is a submap OR there's no read permission
		 * to this mapping, then just skip it.
		 */
		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
			entry = entry->vme_next;
			start = entry->vme_start;
			continue;
		}

		object = VME_OBJECT(entry);

		if (object == NULL ||
		    (object && object->internal)) {
			/*
			 * Memory range backed by anonymous memory: pre-fault
			 * the pages in (zero-fill / decompress) one effective
			 * page at a time.
			 */
			vm_size_t region_size = 0, effective_page_size = 0;
			vm_map_offset_t addr = 0, effective_page_mask = 0;

			region_size = len;
			addr = start;

			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
			effective_page_size = effective_page_mask + 1;

			/* Drop the map lock: vm_pre_fault() takes it itself. */
			vm_map_unlock_read(map);

			while (region_size) {
				vm_pre_fault(
					vm_map_trunc_page(addr, effective_page_mask),
					VM_PROT_READ | VM_PROT_WRITE);

				region_size -= effective_page_size;
				addr += effective_page_size;
			}
		} else {
			/*
			 * Find the file object backing this map entry.  If there is
			 * none, then we simply ignore the "will need" advice for this
			 * entry and go on to the next one.
			 */
			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
				entry = entry->vme_next;
				start = entry->vme_start;
				continue;
			}

			/*
			 * find_vnode_object() returned the object locked; take a
			 * paging reference so the pager stays alive after we
			 * drop the object and map locks.
			 * NOTE(review): assumes the object comes back locked
			 * from find_vnode_object() — confirm against its
			 * definition.
			 */
			vm_object_paging_begin(object);
			pager = object->pager;
			vm_object_unlock(object);

			/*
			 * The data_request() could take a long time, so let's
			 * release the map lock to avoid blocking other threads.
			 */
			vm_map_unlock_read(map);

			/*
			 * Get the data from the object asynchronously.
			 *
			 * Note that memory_object_data_request() places limits on the
			 * amount of I/O it will do.  Regardless of the len we
			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
			 * silently truncates the len to that size.  This isn't
			 * necessarily bad since madvise shouldn't really be used to
			 * page in unlimited amounts of data.  Other Unix variants
			 * limit the willneed case as well.  If this turns out to be an
			 * issue for developers, then we can always adjust the policy
			 * here and still be backwards compatible since this is all
			 * just "advice".
			 */
			kr = memory_object_data_request(
				pager,
				vm_object_trunc_page(offset) + object->paging_offset,
				0,      /* ignored */
				VM_PROT_READ,
				(memory_object_fault_info_t)&fault_info);

			vm_object_lock(object);
			vm_object_paging_end(object);
			vm_object_unlock(object);

			/*
			 * If we couldn't do the I/O for some reason, just give up on
			 * the madvise.  We still return success to the user since
			 * madvise isn't supposed to fail when the advice can't be
			 * taken.
			 */

			if (kr != KERN_SUCCESS) {
				return KERN_SUCCESS;
			}
		}

		start += len;
		if (start >= end) {
			/* done */
			return KERN_SUCCESS;
		}

		/* look up next entry; the map lock was dropped above, so re-take it */
		vm_map_lock_read(map);
		if (!vm_map_lookup_entry(map, start, &entry)) {
			/*
			 * There's a new hole in the address range.
			 */
			vm_map_unlock_read(map);
			return KERN_INVALID_ADDRESS;
		}
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
15565 
15566 static boolean_t
vm_map_entry_is_reusable(vm_map_entry_t entry)15567 vm_map_entry_is_reusable(
15568 	vm_map_entry_t entry)
15569 {
15570 	/* Only user map entries */
15571 
15572 	vm_object_t object;
15573 
15574 	if (entry->is_sub_map) {
15575 		return FALSE;
15576 	}
15577 
15578 	switch (VME_ALIAS(entry)) {
15579 	case VM_MEMORY_MALLOC:
15580 	case VM_MEMORY_MALLOC_SMALL:
15581 	case VM_MEMORY_MALLOC_LARGE:
15582 	case VM_MEMORY_REALLOC:
15583 	case VM_MEMORY_MALLOC_TINY:
15584 	case VM_MEMORY_MALLOC_LARGE_REUSABLE:
15585 	case VM_MEMORY_MALLOC_LARGE_REUSED:
15586 		/*
15587 		 * This is a malloc() memory region: check if it's still
15588 		 * in its original state and can be re-used for more
15589 		 * malloc() allocations.
15590 		 */
15591 		break;
15592 	default:
15593 		/*
15594 		 * Not a malloc() memory region: let the caller decide if
15595 		 * it's re-usable.
15596 		 */
15597 		return TRUE;
15598 	}
15599 
15600 	if (/*entry->is_shared ||*/
15601 		entry->is_sub_map ||
15602 		entry->in_transition ||
15603 		entry->protection != VM_PROT_DEFAULT ||
15604 		entry->max_protection != VM_PROT_ALL ||
15605 		entry->inheritance != VM_INHERIT_DEFAULT ||
15606 		entry->no_cache ||
15607 		entry->permanent ||
15608 		entry->superpage_size != FALSE ||
15609 		entry->zero_wired_pages ||
15610 		entry->wired_count != 0 ||
15611 		entry->user_wired_count != 0) {
15612 		return FALSE;
15613 	}
15614 
15615 	object = VME_OBJECT(entry);
15616 	if (object == VM_OBJECT_NULL) {
15617 		return TRUE;
15618 	}
15619 	if (
15620 #if 0
15621 		/*
15622 		 * Let's proceed even if the VM object is potentially
15623 		 * shared.
15624 		 * We check for this later when processing the actual
15625 		 * VM pages, so the contents will be safe if shared.
15626 		 *
15627 		 * But we can still mark this memory region as "reusable" to
15628 		 * acknowledge that the caller did let us know that the memory
15629 		 * could be re-used and should not be penalized for holding
15630 		 * on to it.  This allows its "resident size" to not include
15631 		 * the reusable range.
15632 		 */
15633 		object->ref_count == 1 &&
15634 #endif
15635 		object->wired_page_count == 0 &&
15636 		object->copy == VM_OBJECT_NULL &&
15637 		object->shadow == VM_OBJECT_NULL &&
15638 		object->internal &&
15639 		object->purgable == VM_PURGABLE_DENY &&
15640 		object->wimg_bits == VM_WIMG_USE_DEFAULT &&
15641 		!object->code_signed) {
15642 		return TRUE;
15643 	}
15644 	return FALSE;
15645 }
15646 
15647 static kern_return_t
vm_map_reuse_pages(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15648 vm_map_reuse_pages(
15649 	vm_map_t        map,
15650 	vm_map_offset_t start,
15651 	vm_map_offset_t end)
15652 {
15653 	vm_map_entry_t                  entry;
15654 	vm_object_t                     object;
15655 	vm_object_offset_t              start_offset, end_offset;
15656 
15657 	/*
15658 	 * The MADV_REUSE operation doesn't require any changes to the
15659 	 * vm_map_entry_t's, so the read lock is sufficient.
15660 	 */
15661 
15662 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
15663 		/*
15664 		 * XXX TODO4K
15665 		 * need to figure out what reusable means for a
15666 		 * portion of a native page.
15667 		 */
15668 		return KERN_SUCCESS;
15669 	}
15670 
15671 	vm_map_lock_read(map);
15672 	assert(map->pmap != kernel_pmap);       /* protect alias access */
15673 
15674 	/*
15675 	 * The madvise semantics require that the address range be fully
15676 	 * allocated with no holes.  Otherwise, we're required to return
15677 	 * an error.
15678 	 */
15679 
15680 	if (!vm_map_range_check(map, start, end, &entry)) {
15681 		vm_map_unlock_read(map);
15682 		vm_page_stats_reusable.reuse_pages_failure++;
15683 		return KERN_INVALID_ADDRESS;
15684 	}
15685 
15686 	/*
15687 	 * Examine each vm_map_entry_t in the range.
15688 	 */
15689 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
15690 	    entry = entry->vme_next) {
15691 		/*
15692 		 * Sanity check on the VM map entry.
15693 		 */
15694 		if (!vm_map_entry_is_reusable(entry)) {
15695 			vm_map_unlock_read(map);
15696 			vm_page_stats_reusable.reuse_pages_failure++;
15697 			return KERN_INVALID_ADDRESS;
15698 		}
15699 
15700 		/*
15701 		 * The first time through, the start address could be anywhere
15702 		 * within the vm_map_entry we found.  So adjust the offset to
15703 		 * correspond.
15704 		 */
15705 		if (entry->vme_start < start) {
15706 			start_offset = start - entry->vme_start;
15707 		} else {
15708 			start_offset = 0;
15709 		}
15710 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
15711 		start_offset += VME_OFFSET(entry);
15712 		end_offset += VME_OFFSET(entry);
15713 
15714 		assert(!entry->is_sub_map);
15715 		object = VME_OBJECT(entry);
15716 		if (object != VM_OBJECT_NULL) {
15717 			vm_object_lock(object);
15718 			vm_object_reuse_pages(object, start_offset, end_offset,
15719 			    TRUE);
15720 			vm_object_unlock(object);
15721 		}
15722 
15723 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
15724 			/*
15725 			 * XXX
15726 			 * We do not hold the VM map exclusively here.
15727 			 * The "alias" field is not that critical, so it's
15728 			 * safe to update it here, as long as it is the only
15729 			 * one that can be modified while holding the VM map
15730 			 * "shared".
15731 			 */
15732 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
15733 		}
15734 	}
15735 
15736 	vm_map_unlock_read(map);
15737 	vm_page_stats_reusable.reuse_pages_success++;
15738 	return KERN_SUCCESS;
15739 }
15740 
15741 
/*
 * Internals for madvise(MADV_FREE_REUSABLE): mark the pages in
 * [start, end) as reusable so the VM may reclaim them cheaply.
 */
static kern_return_t
vm_map_reusable_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	vm_object_offset_t              start_offset, end_offset;
	vm_map_offset_t                 pmap_offset;

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a portion
		 * of a native page.
		 */
		return KERN_SUCCESS;
	}

	/*
	 * The MADV_REUSABLE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap);       /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reusable_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/*
		 * kill_pages: 1 = contents may be discarded,
		 * -1 = object is shared, only account stats.
		 */
		int kill_pages = 0;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit) {
			/* not writable: can't discard contents */
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_nonwritable++;
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
			pmap_offset = start;
		} else {
			start_offset = 0;
			pmap_offset = entry->vme_start;
		}
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		assert(!entry->is_sub_map);
		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}


		/*
		 * Only discard page contents if we are the sole user of the
		 * object (or the object is safely copy-delayed with no copy).
		 */
		vm_object_lock(object);
		if (((object->ref_count == 1) ||
		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
		    object->copy == VM_OBJECT_NULL)) &&
		    object->shadow == VM_OBJECT_NULL &&
		    /*
		     * "iokit_acct" entries are billed for their virtual size
		     * (rather than for their resident pages only), so they
		     * wouldn't benefit from making pages reusable, and it
		     * would be hard to keep track of pages that are both
		     * "iokit_acct" and "reusable" in the pmap stats and
		     * ledgers.
		     */
		    !(entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap))) {
			if (object->ref_count != 1) {
				vm_page_stats_reusable.reusable_shared++;
			}
			kill_pages = 1;
		} else {
			kill_pages = -1;
		}
		if (kill_pages != -1) {
			vm_object_deactivate_pages(object,
			    start_offset,
			    end_offset - start_offset,
			    kill_pages,
			    TRUE /*reusable_pages*/,
			    map->pmap,
			    pmap_offset);
		} else {
			/* Shared object: just record the event, discard nothing. */
			vm_page_stats_reusable.reusable_pages_shared++;
			DTRACE_VM4(vm_map_reusable_pages_shared,
			    unsigned int, VME_ALIAS(entry),
			    vm_map_t, map,
			    vm_map_entry_t, entry,
			    vm_object_t, object);
		}
		vm_object_unlock(object);

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reusable_pages_success++;
	return KERN_SUCCESS;
}
15887 
15888 
15889 static kern_return_t
vm_map_can_reuse(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15890 vm_map_can_reuse(
15891 	vm_map_t        map,
15892 	vm_map_offset_t start,
15893 	vm_map_offset_t end)
15894 {
15895 	vm_map_entry_t                  entry;
15896 
15897 	/*
15898 	 * The MADV_REUSABLE operation doesn't require any changes to the
15899 	 * vm_map_entry_t's, so the read lock is sufficient.
15900 	 */
15901 
15902 	vm_map_lock_read(map);
15903 	assert(map->pmap != kernel_pmap);       /* protect alias access */
15904 
15905 	/*
15906 	 * The madvise semantics require that the address range be fully
15907 	 * allocated with no holes.  Otherwise, we're required to return
15908 	 * an error.
15909 	 */
15910 
15911 	if (!vm_map_range_check(map, start, end, &entry)) {
15912 		vm_map_unlock_read(map);
15913 		vm_page_stats_reusable.can_reuse_failure++;
15914 		return KERN_INVALID_ADDRESS;
15915 	}
15916 
15917 	/*
15918 	 * Examine each vm_map_entry_t in the range.
15919 	 */
15920 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
15921 	    entry = entry->vme_next) {
15922 		/*
15923 		 * Sanity check on the VM map entry.
15924 		 */
15925 		if (!vm_map_entry_is_reusable(entry)) {
15926 			vm_map_unlock_read(map);
15927 			vm_page_stats_reusable.can_reuse_failure++;
15928 			return KERN_INVALID_ADDRESS;
15929 		}
15930 	}
15931 
15932 	vm_map_unlock_read(map);
15933 	vm_page_stats_reusable.can_reuse_success++;
15934 	return KERN_SUCCESS;
15935 }
15936 
15937 
15938 #if MACH_ASSERT
15939 static kern_return_t
vm_map_pageout(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15940 vm_map_pageout(
15941 	vm_map_t        map,
15942 	vm_map_offset_t start,
15943 	vm_map_offset_t end)
15944 {
15945 	vm_map_entry_t                  entry;
15946 
15947 	/*
15948 	 * The MADV_PAGEOUT operation doesn't require any changes to the
15949 	 * vm_map_entry_t's, so the read lock is sufficient.
15950 	 */
15951 
15952 	vm_map_lock_read(map);
15953 
15954 	/*
15955 	 * The madvise semantics require that the address range be fully
15956 	 * allocated with no holes.  Otherwise, we're required to return
15957 	 * an error.
15958 	 */
15959 
15960 	if (!vm_map_range_check(map, start, end, &entry)) {
15961 		vm_map_unlock_read(map);
15962 		return KERN_INVALID_ADDRESS;
15963 	}
15964 
15965 	/*
15966 	 * Examine each vm_map_entry_t in the range.
15967 	 */
15968 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
15969 	    entry = entry->vme_next) {
15970 		vm_object_t     object;
15971 
15972 		/*
15973 		 * Sanity check on the VM map entry.
15974 		 */
15975 		if (entry->is_sub_map) {
15976 			vm_map_t submap;
15977 			vm_map_offset_t submap_start;
15978 			vm_map_offset_t submap_end;
15979 			vm_map_entry_t submap_entry;
15980 
15981 			submap = VME_SUBMAP(entry);
15982 			submap_start = VME_OFFSET(entry);
15983 			submap_end = submap_start + (entry->vme_end -
15984 			    entry->vme_start);
15985 
15986 			vm_map_lock_read(submap);
15987 
15988 			if (!vm_map_range_check(submap,
15989 			    submap_start,
15990 			    submap_end,
15991 			    &submap_entry)) {
15992 				vm_map_unlock_read(submap);
15993 				vm_map_unlock_read(map);
15994 				return KERN_INVALID_ADDRESS;
15995 			}
15996 
15997 			object = VME_OBJECT(submap_entry);
15998 			if (submap_entry->is_sub_map ||
15999 			    object == VM_OBJECT_NULL ||
16000 			    !object->internal) {
16001 				vm_map_unlock_read(submap);
16002 				continue;
16003 			}
16004 
16005 			vm_object_pageout(object);
16006 
16007 			vm_map_unlock_read(submap);
16008 			submap = VM_MAP_NULL;
16009 			submap_entry = VM_MAP_ENTRY_NULL;
16010 			continue;
16011 		}
16012 
16013 		object = VME_OBJECT(entry);
16014 		if (entry->is_sub_map ||
16015 		    object == VM_OBJECT_NULL ||
16016 		    !object->internal) {
16017 			continue;
16018 		}
16019 
16020 		vm_object_pageout(object);
16021 	}
16022 
16023 	vm_map_unlock_read(map);
16024 	return KERN_SUCCESS;
16025 }
16026 #endif /* MACH_ASSERT */
16027 
16028 
16029 /*
16030  *	Routine:	vm_map_entry_insert
16031  *
16032  *	Description:	This routine inserts a new vm_entry in a locked map.
16033  */
16034 static vm_map_entry_t
vm_map_entry_insert(vm_map_t map,vm_map_entry_t insp_entry,vm_map_offset_t start,vm_map_offset_t end,vm_object_t object,vm_object_offset_t offset,vm_map_kernel_flags_t vmk_flags,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,boolean_t no_cache,boolean_t permanent,boolean_t no_copy_on_read,unsigned int superpage_size,boolean_t clear_map_aligned,boolean_t is_submap,boolean_t used_for_jit,int alias,boolean_t translated_allow_execute)16035 vm_map_entry_insert(
16036 	vm_map_t                map,
16037 	vm_map_entry_t          insp_entry,
16038 	vm_map_offset_t         start,
16039 	vm_map_offset_t         end,
16040 	vm_object_t             object,
16041 	vm_object_offset_t      offset,
16042 	vm_map_kernel_flags_t   vmk_flags,
16043 	boolean_t               needs_copy,
16044 	vm_prot_t               cur_protection,
16045 	vm_prot_t               max_protection,
16046 	vm_inherit_t            inheritance,
16047 	boolean_t               no_cache,
16048 	boolean_t               permanent,
16049 	boolean_t               no_copy_on_read,
16050 	unsigned int            superpage_size,
16051 	boolean_t               clear_map_aligned,
16052 	boolean_t               is_submap,
16053 	boolean_t               used_for_jit,
16054 	int                     alias,
16055 	boolean_t               translated_allow_execute)
16056 {
16057 	vm_map_entry_t  new_entry;
16058 	boolean_t map_aligned = FALSE;
16059 
16060 	assert(insp_entry != (vm_map_entry_t)0);
16061 	vm_map_lock_assert_exclusive(map);
16062 
16063 #if DEVELOPMENT || DEBUG
16064 	vm_object_offset_t      end_offset = 0;
16065 	assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
16066 #endif /* DEVELOPMENT || DEBUG */
16067 
16068 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
16069 		map_aligned = TRUE;
16070 	}
16071 	if (clear_map_aligned &&
16072 	    (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
16073 	    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
16074 		map_aligned = FALSE;
16075 	}
16076 	if (map_aligned) {
16077 		assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
16078 		assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
16079 	} else {
16080 		assert(page_aligned(start));
16081 		assert(page_aligned(end));
16082 	}
16083 	assert(start < end);
16084 
16085 	new_entry = vm_map_entry_create(map);
16086 
16087 	new_entry->vme_start = start;
16088 	new_entry->vme_end = end;
16089 
16090 	VME_OBJECT_SET(new_entry, object);
16091 	VME_OFFSET_SET(new_entry, offset);
16092 	VME_ALIAS_SET(new_entry, alias);
16093 
16094 	new_entry->map_aligned = map_aligned;
16095 	new_entry->is_sub_map = is_submap;
16096 	new_entry->needs_copy = needs_copy;
16097 	new_entry->inheritance = inheritance;
16098 	new_entry->protection = cur_protection;
16099 	new_entry->max_protection = max_protection;
16100 	/*
16101 	 * submap: "use_pmap" means "nested".
16102 	 * default: false.
16103 	 *
16104 	 * object: "use_pmap" means "use pmap accounting" for footprint.
16105 	 * default: true.
16106 	 */
16107 	new_entry->use_pmap = !is_submap;
16108 	new_entry->no_cache = no_cache;
16109 	new_entry->permanent = permanent;
16110 	new_entry->translated_allow_execute = translated_allow_execute;
16111 	new_entry->vme_no_copy_on_read = no_copy_on_read;
16112 	new_entry->superpage_size = (superpage_size != 0);
16113 
16114 	if (used_for_jit) {
16115 		if (!(map->jit_entry_exists) ||
16116 		    VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
16117 			new_entry->used_for_jit = TRUE;
16118 			map->jit_entry_exists = TRUE;
16119 		}
16120 	}
16121 
16122 	/*
16123 	 *	Insert the new entry into the list.
16124 	 */
16125 
16126 	vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
16127 	map->size += end - start;
16128 
16129 	/*
16130 	 *	Update the free space hint and the lookup hint.
16131 	 */
16132 
16133 	SAVE_HINT_MAP_WRITE(map, new_entry);
16134 	return new_entry;
16135 }
16136 
16137 /*
16138  *	Routine:	vm_map_remap_extract
16139  *
16140  *	Description:	This routine returns a vm_entry list from a map.
16141  */
16142 static kern_return_t
vm_map_remap_extract(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size,boolean_t copy,struct vm_map_header * map_header,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)16143 vm_map_remap_extract(
16144 	vm_map_t                map,
16145 	vm_map_offset_t         addr,
16146 	vm_map_size_t           size,
16147 	boolean_t               copy,
16148 	struct vm_map_header    *map_header,
16149 	vm_prot_t               *cur_protection,   /* IN/OUT */
16150 	vm_prot_t               *max_protection,   /* IN/OUT */
16151 	/* What, no behavior? */
16152 	vm_inherit_t            inheritance,
16153 	vm_map_kernel_flags_t   vmk_flags)
16154 {
16155 	kern_return_t           result;
16156 	vm_map_size_t           mapped_size;
16157 	vm_map_size_t           tmp_size;
16158 	vm_map_entry_t          src_entry;     /* result of last map lookup */
16159 	vm_map_entry_t          new_entry;
16160 	vm_object_offset_t      offset;
16161 	vm_map_offset_t         map_address;
16162 	vm_map_offset_t         src_start;     /* start of entry to map */
16163 	vm_map_offset_t         src_end;       /* end of region to be mapped */
16164 	vm_object_t             object;
16165 	vm_map_version_t        version;
16166 	boolean_t               src_needs_copy;
16167 	boolean_t               new_entry_needs_copy;
16168 	vm_map_entry_t          saved_src_entry;
16169 	boolean_t               src_entry_was_wired;
16170 	vm_prot_t               max_prot_for_prot_copy;
16171 	vm_map_offset_t         effective_page_mask;
16172 	boolean_t               pageable, same_map;
16173 	boolean_t               vm_remap_legacy;
16174 	vm_prot_t               required_cur_prot, required_max_prot;
16175 	vm_object_t             new_copy_object;     /* vm_object_copy_* result */
16176 	boolean_t               saved_used_for_jit;     /* Saved used_for_jit. */
16177 
16178 	pageable = vmk_flags.vmkf_copy_pageable;
16179 	same_map = vmk_flags.vmkf_copy_same_map;
16180 
16181 	effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
16182 
16183 	assert(map != VM_MAP_NULL);
16184 	assert(size != 0);
16185 	assert(size == vm_map_round_page(size, effective_page_mask));
16186 	assert(inheritance == VM_INHERIT_NONE ||
16187 	    inheritance == VM_INHERIT_COPY ||
16188 	    inheritance == VM_INHERIT_SHARE);
16189 	assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
16190 	assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
16191 	assert((*cur_protection & *max_protection) == *cur_protection);
16192 
16193 	/*
16194 	 *	Compute start and end of region.
16195 	 */
16196 	src_start = vm_map_trunc_page(addr, effective_page_mask);
16197 	src_end = vm_map_round_page(src_start + size, effective_page_mask);
16198 
16199 	/*
16200 	 *	Initialize map_header.
16201 	 */
16202 	map_header->links.next = CAST_TO_VM_MAP_ENTRY(&map_header->links);
16203 	map_header->links.prev = CAST_TO_VM_MAP_ENTRY(&map_header->links);
16204 	map_header->nentries = 0;
16205 	map_header->entries_pageable = pageable;
16206 //	map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
16207 	map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
16208 	map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
16209 
16210 	vm_map_store_init( map_header );
16211 
16212 	if (copy && vmk_flags.vmkf_remap_prot_copy) {
16213 		/*
16214 		 * Special case for vm_map_protect(VM_PROT_COPY):
16215 		 * we want to set the new mappings' max protection to the
16216 		 * specified *max_protection...
16217 		 */
16218 		max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
16219 		/* ... but we want to use the vm_remap() legacy mode */
16220 		*max_protection = VM_PROT_NONE;
16221 		*cur_protection = VM_PROT_NONE;
16222 	} else {
16223 		max_prot_for_prot_copy = VM_PROT_NONE;
16224 	}
16225 
16226 	if (*cur_protection == VM_PROT_NONE &&
16227 	    *max_protection == VM_PROT_NONE) {
16228 		/*
16229 		 * vm_remap() legacy mode:
16230 		 * Extract all memory regions in the specified range and
16231 		 * collect the strictest set of protections allowed on the
16232 		 * entire range, so the caller knows what they can do with
16233 		 * the remapped range.
16234 		 * We start with VM_PROT_ALL and we'll remove the protections
16235 		 * missing from each memory region.
16236 		 */
16237 		vm_remap_legacy = TRUE;
16238 		*cur_protection = VM_PROT_ALL;
16239 		*max_protection = VM_PROT_ALL;
16240 		required_cur_prot = VM_PROT_NONE;
16241 		required_max_prot = VM_PROT_NONE;
16242 	} else {
16243 		/*
16244 		 * vm_remap_new() mode:
16245 		 * Extract all memory regions in the specified range and
16246 		 * ensure that they have at least the protections specified
16247 		 * by the caller via *cur_protection and *max_protection.
16248 		 * The resulting mapping should have these protections.
16249 		 */
16250 		vm_remap_legacy = FALSE;
16251 		if (copy) {
16252 			required_cur_prot = VM_PROT_NONE;
16253 			required_max_prot = VM_PROT_READ;
16254 		} else {
16255 			required_cur_prot = *cur_protection;
16256 			required_max_prot = *max_protection;
16257 		}
16258 	}
16259 
16260 	map_address = 0;
16261 	mapped_size = 0;
16262 	result = KERN_SUCCESS;
16263 
16264 	/*
16265 	 *	The specified source virtual space might correspond to
16266 	 *	multiple map entries, need to loop on them.
16267 	 */
16268 	vm_map_lock(map);
16269 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16270 		/*
16271 		 * This address space uses sub-pages so the range might
16272 		 * not be re-mappable in an address space with larger
16273 		 * pages. Re-assemble any broken-up VM map entries to
16274 		 * improve our chances of making it work.
16275 		 */
16276 		vm_map_simplify_range(map, src_start, src_end);
16277 	}
16278 	while (mapped_size != size) {
16279 		vm_map_size_t   entry_size;
16280 
16281 		/*
16282 		 *	Find the beginning of the region.
16283 		 */
16284 		if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
16285 			result = KERN_INVALID_ADDRESS;
16286 			break;
16287 		}
16288 
16289 		if (src_start < src_entry->vme_start ||
16290 		    (mapped_size && src_start != src_entry->vme_start)) {
16291 			result = KERN_INVALID_ADDRESS;
16292 			break;
16293 		}
16294 
16295 		tmp_size = size - mapped_size;
16296 		if (src_end > src_entry->vme_end) {
16297 			tmp_size -= (src_end - src_entry->vme_end);
16298 		}
16299 
16300 		entry_size = (vm_map_size_t)(src_entry->vme_end -
16301 		    src_entry->vme_start);
16302 
16303 		if (src_entry->is_sub_map &&
16304 		    vmk_flags.vmkf_copy_single_object) {
16305 			vm_map_t submap;
16306 			vm_map_offset_t submap_start;
16307 			vm_map_size_t submap_size;
16308 			boolean_t submap_needs_copy;
16309 
16310 			/*
16311 			 * No check for "required protection" on "src_entry"
16312 			 * because the protections that matter are the ones
16313 			 * on the submap's VM map entry, which will be checked
16314 			 * during the call to vm_map_remap_extract() below.
16315 			 */
16316 			submap_size = src_entry->vme_end - src_start;
16317 			if (submap_size > size) {
16318 				submap_size = size;
16319 			}
16320 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
16321 			submap = VME_SUBMAP(src_entry);
16322 			if (copy) {
16323 				/*
16324 				 * The caller wants a copy-on-write re-mapping,
16325 				 * so let's extract from the submap accordingly.
16326 				 */
16327 				submap_needs_copy = TRUE;
16328 			} else if (src_entry->needs_copy) {
16329 				/*
16330 				 * The caller wants a shared re-mapping but the
16331 				 * submap is mapped with "needs_copy", so its
16332 				 * contents can't be shared as is. Extract the
16333 				 * contents of the submap as "copy-on-write".
16334 				 * The re-mapping won't be shared with the
16335 				 * original mapping but this is equivalent to
16336 				 * what happened with the original "remap from
16337 				 * submap" code.
16338 				 * The shared region is mapped "needs_copy", for
16339 				 * example.
16340 				 */
16341 				submap_needs_copy = TRUE;
16342 			} else {
16343 				/*
16344 				 * The caller wants a shared re-mapping and
16345 				 * this mapping can be shared (no "needs_copy"),
16346 				 * so let's extract from the submap accordingly.
16347 				 * Kernel submaps are mapped without
16348 				 * "needs_copy", for example.
16349 				 */
16350 				submap_needs_copy = FALSE;
16351 			}
16352 			vm_map_reference(submap);
16353 			vm_map_unlock(map);
16354 			src_entry = NULL;
16355 			if (vm_remap_legacy) {
16356 				*cur_protection = VM_PROT_NONE;
16357 				*max_protection = VM_PROT_NONE;
16358 			}
16359 
16360 			DTRACE_VM7(remap_submap_recurse,
16361 			    vm_map_t, map,
16362 			    vm_map_offset_t, addr,
16363 			    vm_map_size_t, size,
16364 			    boolean_t, copy,
16365 			    vm_map_offset_t, submap_start,
16366 			    vm_map_size_t, submap_size,
16367 			    boolean_t, submap_needs_copy);
16368 
16369 			result = vm_map_remap_extract(submap,
16370 			    submap_start,
16371 			    submap_size,
16372 			    submap_needs_copy,
16373 			    map_header,
16374 			    cur_protection,
16375 			    max_protection,
16376 			    inheritance,
16377 			    vmk_flags);
16378 			vm_map_deallocate(submap);
16379 			return result;
16380 		}
16381 
16382 		if (src_entry->is_sub_map) {
16383 			/* protections for submap mapping are irrelevant here */
16384 		} else if (((src_entry->protection & required_cur_prot) !=
16385 		    required_cur_prot) ||
16386 		    ((src_entry->max_protection & required_max_prot) !=
16387 		    required_max_prot)) {
16388 			if (vmk_flags.vmkf_copy_single_object &&
16389 			    mapped_size != 0) {
16390 				/*
16391 				 * Single object extraction.
16392 				 * We can't extract more with the required
16393 				 * protection but we've extracted some, so
16394 				 * stop there and declare success.
16395 				 * The caller should check the size of
16396 				 * the copy entry we've extracted.
16397 				 */
16398 				result = KERN_SUCCESS;
16399 			} else {
16400 				/*
16401 				 * VM range extraction.
16402 				 * Required proctection is not available
16403 				 * for this part of the range: fail.
16404 				 */
16405 				result = KERN_PROTECTION_FAILURE;
16406 			}
16407 			break;
16408 		}
16409 
16410 		if (src_entry->is_sub_map) {
16411 			vm_map_t submap;
16412 			vm_map_offset_t submap_start;
16413 			vm_map_size_t submap_size;
16414 			vm_map_copy_t submap_copy;
16415 			vm_prot_t submap_curprot, submap_maxprot;
16416 			boolean_t submap_needs_copy;
16417 
16418 			/*
16419 			 * No check for "required protection" on "src_entry"
16420 			 * because the protections that matter are the ones
16421 			 * on the submap's VM map entry, which will be checked
16422 			 * during the call to vm_map_copy_extract() below.
16423 			 */
16424 			object = VM_OBJECT_NULL;
16425 			submap_copy = VM_MAP_COPY_NULL;
16426 
16427 			/* find equivalent range in the submap */
16428 			submap = VME_SUBMAP(src_entry);
16429 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
16430 			submap_size = tmp_size;
16431 			if (copy) {
16432 				/*
16433 				 * The caller wants a copy-on-write re-mapping,
16434 				 * so let's extract from the submap accordingly.
16435 				 */
16436 				submap_needs_copy = TRUE;
16437 			} else if (src_entry->needs_copy) {
16438 				/*
16439 				 * The caller wants a shared re-mapping but the
16440 				 * submap is mapped with "needs_copy", so its
16441 				 * contents can't be shared as is. Extract the
16442 				 * contents of the submap as "copy-on-write".
16443 				 * The re-mapping won't be shared with the
16444 				 * original mapping but this is equivalent to
16445 				 * what happened with the original "remap from
16446 				 * submap" code.
16447 				 * The shared region is mapped "needs_copy", for
16448 				 * example.
16449 				 */
16450 				submap_needs_copy = TRUE;
16451 			} else {
16452 				/*
16453 				 * The caller wants a shared re-mapping and
16454 				 * this mapping can be shared (no "needs_copy"),
16455 				 * so let's extract from the submap accordingly.
16456 				 * Kernel submaps are mapped without
16457 				 * "needs_copy", for example.
16458 				 */
16459 				submap_needs_copy = FALSE;
16460 			}
16461 			/* extra ref to keep submap alive */
16462 			vm_map_reference(submap);
16463 
16464 			DTRACE_VM7(remap_submap_recurse,
16465 			    vm_map_t, map,
16466 			    vm_map_offset_t, addr,
16467 			    vm_map_size_t, size,
16468 			    boolean_t, copy,
16469 			    vm_map_offset_t, submap_start,
16470 			    vm_map_size_t, submap_size,
16471 			    boolean_t, submap_needs_copy);
16472 
16473 			/*
16474 			 * The map can be safely unlocked since we
16475 			 * already hold a reference on the submap.
16476 			 *
16477 			 * No timestamp since we don't care if the map
16478 			 * gets modified while we're down in the submap.
16479 			 * We'll resume the extraction at src_start + tmp_size
16480 			 * anyway.
16481 			 */
16482 			vm_map_unlock(map);
16483 			src_entry = NULL; /* not valid once map is unlocked */
16484 
16485 			if (vm_remap_legacy) {
16486 				submap_curprot = VM_PROT_NONE;
16487 				submap_maxprot = VM_PROT_NONE;
16488 				if (max_prot_for_prot_copy) {
16489 					submap_maxprot = max_prot_for_prot_copy;
16490 				}
16491 			} else {
16492 				assert(!max_prot_for_prot_copy);
16493 				submap_curprot = *cur_protection;
16494 				submap_maxprot = *max_protection;
16495 			}
16496 			result = vm_map_copy_extract(submap,
16497 			    submap_start,
16498 			    submap_size,
16499 			    submap_needs_copy,
16500 			    &submap_copy,
16501 			    &submap_curprot,
16502 			    &submap_maxprot,
16503 			    inheritance,
16504 			    vmk_flags);
16505 
16506 			/* release extra ref on submap */
16507 			vm_map_deallocate(submap);
16508 			submap = VM_MAP_NULL;
16509 
16510 			if (result != KERN_SUCCESS) {
16511 				vm_map_lock(map);
16512 				break;
16513 			}
16514 
16515 			/* transfer submap_copy entries to map_header */
16516 			while (vm_map_copy_first_entry(submap_copy) !=
16517 			    vm_map_copy_to_entry(submap_copy)) {
16518 				vm_map_entry_t copy_entry;
16519 				vm_map_size_t copy_entry_size;
16520 
16521 				copy_entry = vm_map_copy_first_entry(submap_copy);
16522 				assert(!copy_entry->is_sub_map);
16523 				object = VME_OBJECT(copy_entry);
16524 
16525 				/*
16526 				 * Prevent kernel_object from being exposed to
16527 				 * user space.
16528 				 */
16529 				if (__improbable(object == kernel_object)) {
16530 					printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
16531 					    proc_selfpid(),
16532 					    (current_task()->bsd_info
16533 					    ? proc_name_address(current_task()->bsd_info)
16534 					    : "?"));
16535 					DTRACE_VM(extract_kernel_only);
16536 					result = KERN_INVALID_RIGHT;
16537 					vm_map_copy_discard(submap_copy);
16538 					submap_copy = VM_MAP_COPY_NULL;
16539 					vm_map_lock(map);
16540 					break;
16541 				}
16542 
16543 				vm_map_copy_entry_unlink(submap_copy, copy_entry);
16544 				copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
16545 				copy_entry->vme_start = map_address;
16546 				copy_entry->vme_end = map_address + copy_entry_size;
16547 				map_address += copy_entry_size;
16548 				mapped_size += copy_entry_size;
16549 				src_start += copy_entry_size;
16550 				assert(src_start <= src_end);
16551 				_vm_map_store_entry_link(map_header,
16552 				    map_header->links.prev,
16553 				    copy_entry);
16554 			}
16555 			/* done with submap_copy */
16556 			vm_map_copy_discard(submap_copy);
16557 
16558 			if (vm_remap_legacy) {
16559 				*cur_protection &= submap_curprot;
16560 				*max_protection &= submap_maxprot;
16561 			}
16562 
16563 			/* re-acquire the map lock and continue to next entry */
16564 			vm_map_lock(map);
16565 			continue;
16566 		} else {
16567 			object = VME_OBJECT(src_entry);
16568 
16569 			/*
16570 			 * Prevent kernel_object from being exposed to
16571 			 * user space.
16572 			 */
16573 			if (__improbable(object == kernel_object)) {
16574 				printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
16575 				    proc_selfpid(),
16576 				    (current_task()->bsd_info
16577 				    ? proc_name_address(current_task()->bsd_info)
16578 				    : "?"));
16579 				DTRACE_VM(extract_kernel_only);
16580 				result = KERN_INVALID_RIGHT;
16581 				break;
16582 			}
16583 
16584 			if (src_entry->iokit_acct) {
16585 				/*
16586 				 * This entry uses "IOKit accounting".
16587 				 */
16588 			} else if (object != VM_OBJECT_NULL &&
16589 			    (object->purgable != VM_PURGABLE_DENY ||
16590 			    object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
16591 				/*
16592 				 * Purgeable objects have their own accounting:
16593 				 * no pmap accounting for them.
16594 				 */
16595 				assertf(!src_entry->use_pmap,
16596 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
16597 				    map,
16598 				    src_entry,
16599 				    (uint64_t)src_entry->vme_start,
16600 				    (uint64_t)src_entry->vme_end,
16601 				    src_entry->protection,
16602 				    src_entry->max_protection,
16603 				    VME_ALIAS(src_entry));
16604 			} else {
16605 				/*
16606 				 * Not IOKit or purgeable:
16607 				 * must be accounted by pmap stats.
16608 				 */
16609 				assertf(src_entry->use_pmap,
16610 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
16611 				    map,
16612 				    src_entry,
16613 				    (uint64_t)src_entry->vme_start,
16614 				    (uint64_t)src_entry->vme_end,
16615 				    src_entry->protection,
16616 				    src_entry->max_protection,
16617 				    VME_ALIAS(src_entry));
16618 			}
16619 
16620 			if (object == VM_OBJECT_NULL) {
16621 				assert(!src_entry->needs_copy);
16622 				object = vm_object_allocate(entry_size);
16623 				VME_OFFSET_SET(src_entry, 0);
16624 				VME_OBJECT_SET(src_entry, object);
16625 				assert(src_entry->use_pmap);
16626 				assert(!map->mapped_in_other_pmaps);
16627 			} else if (src_entry->wired_count ||
16628 			    object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
16629 				/*
16630 				 * A wired memory region should not have
16631 				 * any pending copy-on-write and needs to
16632 				 * keep pointing at the VM object that
16633 				 * contains the wired pages.
16634 				 * If we're sharing this memory (copy=false),
16635 				 * we'll share this VM object.
16636 				 * If we're copying this memory (copy=true),
16637 				 * we'll call vm_object_copy_slowly() below
16638 				 * and use the new VM object for the remapping.
16639 				 *
16640 				 * Or, we are already using an asymmetric
16641 				 * copy, and therefore we already have
16642 				 * the right object.
16643 				 */
16644 				assert(!src_entry->needs_copy);
16645 			} else if (src_entry->needs_copy || object->shadowed ||
16646 			    (object->internal && !object->true_share &&
16647 			    !src_entry->is_shared &&
16648 			    object->vo_size > entry_size)) {
16649 				VME_OBJECT_SHADOW(src_entry, entry_size);
16650 				assert(src_entry->use_pmap);
16651 
16652 				if (!src_entry->needs_copy &&
16653 				    (src_entry->protection & VM_PROT_WRITE)) {
16654 					vm_prot_t prot;
16655 
16656 					assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
16657 
16658 					prot = src_entry->protection & ~VM_PROT_WRITE;
16659 
16660 					if (override_nx(map,
16661 					    VME_ALIAS(src_entry))
16662 					    && prot) {
16663 						prot |= VM_PROT_EXECUTE;
16664 					}
16665 
16666 					assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
16667 
16668 					if (map->mapped_in_other_pmaps) {
16669 						vm_object_pmap_protect(
16670 							VME_OBJECT(src_entry),
16671 							VME_OFFSET(src_entry),
16672 							entry_size,
16673 							PMAP_NULL,
16674 							PAGE_SIZE,
16675 							src_entry->vme_start,
16676 							prot);
16677 #if MACH_ASSERT
16678 					} else if (__improbable(map->pmap == PMAP_NULL)) {
16679 						extern boolean_t vm_tests_in_progress;
16680 						assert(vm_tests_in_progress);
16681 						/*
16682 						 * Some VM tests (in vm_tests.c)
16683 						 * sometimes want to use a VM
16684 						 * map without a pmap.
16685 						 * Otherwise, this should never
16686 						 * happen.
16687 						 */
16688 #endif /* MACH_ASSERT */
16689 					} else {
16690 						pmap_protect(vm_map_pmap(map),
16691 						    src_entry->vme_start,
16692 						    src_entry->vme_end,
16693 						    prot);
16694 					}
16695 				}
16696 
16697 				object = VME_OBJECT(src_entry);
16698 				src_entry->needs_copy = FALSE;
16699 			}
16700 
16701 
16702 			vm_object_lock(object);
16703 			vm_object_reference_locked(object); /* object ref. for new entry */
16704 			assert(!src_entry->needs_copy);
16705 			if (object->copy_strategy ==
16706 			    MEMORY_OBJECT_COPY_SYMMETRIC) {
16707 				/*
16708 				 * If we want to share this object (copy==0),
16709 				 * it needs to be COPY_DELAY.
16710 				 * If we want to copy this object (copy==1),
16711 				 * we can't just set "needs_copy" on our side
16712 				 * and expect the other side to do the same
16713 				 * (symmetrically), so we can't let the object
16714 				 * stay COPY_SYMMETRIC.
16715 				 * So we always switch from COPY_SYMMETRIC to
16716 				 * COPY_DELAY.
16717 				 */
16718 				object->copy_strategy =
16719 				    MEMORY_OBJECT_COPY_DELAY;
16720 				object->true_share = TRUE;
16721 			}
16722 			vm_object_unlock(object);
16723 		}
16724 
16725 		offset = (VME_OFFSET(src_entry) +
16726 		    (src_start - src_entry->vme_start));
16727 
16728 		new_entry = _vm_map_entry_create(map_header);
16729 		vm_map_entry_copy(map, new_entry, src_entry);
16730 		if (new_entry->is_sub_map) {
16731 			/* clr address space specifics */
16732 			new_entry->use_pmap = FALSE;
16733 		} else if (copy) {
16734 			/*
16735 			 * We're dealing with a copy-on-write operation,
16736 			 * so the resulting mapping should not inherit the
16737 			 * original mapping's accounting settings.
16738 			 * "use_pmap" should be reset to its default (TRUE)
16739 			 * so that the new mapping gets accounted for in
16740 			 * the task's memory footprint.
16741 			 */
16742 			new_entry->use_pmap = TRUE;
16743 		}
16744 		/* "iokit_acct" was cleared in vm_map_entry_copy() */
16745 		assert(!new_entry->iokit_acct);
16746 
16747 		new_entry->map_aligned = FALSE;
16748 
16749 		new_entry->vme_start = map_address;
16750 		new_entry->vme_end = map_address + tmp_size;
16751 		assert(new_entry->vme_start < new_entry->vme_end);
16752 		if (copy && vmk_flags.vmkf_remap_prot_copy) {
16753 			/*
16754 			 * Remapping for vm_map_protect(VM_PROT_COPY)
16755 			 * to convert a read-only mapping into a
16756 			 * copy-on-write version of itself but
16757 			 * with write access:
16758 			 * keep the original inheritance and add
16759 			 * VM_PROT_WRITE to the max protection.
16760 			 */
16761 			new_entry->inheritance = src_entry->inheritance;
16762 			new_entry->protection &= max_prot_for_prot_copy;
16763 			new_entry->max_protection |= VM_PROT_WRITE;
16764 		} else {
16765 			new_entry->inheritance = inheritance;
16766 			if (!vm_remap_legacy) {
16767 				new_entry->protection = *cur_protection;
16768 				new_entry->max_protection = *max_protection;
16769 			}
16770 		}
16771 		VME_OFFSET_SET(new_entry, offset);
16772 
16773 		/*
16774 		 * The new region has to be copied now if required.
16775 		 */
16776 RestartCopy:
16777 		if (!copy) {
16778 			if (src_entry->used_for_jit == TRUE) {
16779 				if (same_map) {
16780 				} else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
16781 					/*
16782 					 * Cannot allow an entry describing a JIT
16783 					 * region to be shared across address spaces.
16784 					 */
16785 					result = KERN_INVALID_ARGUMENT;
16786 					vm_object_deallocate(object);
16787 					vm_map_entry_dispose(new_entry);
16788 					new_entry = VM_MAP_ENTRY_NULL;
16789 					break;
16790 				}
16791 			}
16792 
16793 			src_entry->is_shared = TRUE;
16794 			new_entry->is_shared = TRUE;
16795 			if (!(new_entry->is_sub_map)) {
16796 				new_entry->needs_copy = FALSE;
16797 			}
16798 		} else if (src_entry->is_sub_map) {
16799 			/* make this a COW sub_map if not already */
16800 			assert(new_entry->wired_count == 0);
16801 			new_entry->needs_copy = TRUE;
16802 			object = VM_OBJECT_NULL;
16803 		} else if (src_entry->wired_count == 0 &&
16804 		    !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
16805 		    vm_object_copy_quickly(VME_OBJECT(new_entry),
16806 		    VME_OFFSET(new_entry),
16807 		    (new_entry->vme_end -
16808 		    new_entry->vme_start),
16809 		    &src_needs_copy,
16810 		    &new_entry_needs_copy)) {
16811 			new_entry->needs_copy = new_entry_needs_copy;
16812 			new_entry->is_shared = FALSE;
16813 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
16814 
16815 			/*
16816 			 * Handle copy_on_write semantics.
16817 			 */
16818 			if (src_needs_copy && !src_entry->needs_copy) {
16819 				vm_prot_t prot;
16820 
16821 				assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
16822 
16823 				prot = src_entry->protection & ~VM_PROT_WRITE;
16824 
16825 				if (override_nx(map,
16826 				    VME_ALIAS(src_entry))
16827 				    && prot) {
16828 					prot |= VM_PROT_EXECUTE;
16829 				}
16830 
16831 				assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
16832 
16833 				vm_object_pmap_protect(object,
16834 				    offset,
16835 				    entry_size,
16836 				    ((src_entry->is_shared
16837 				    || map->mapped_in_other_pmaps) ?
16838 				    PMAP_NULL : map->pmap),
16839 				    VM_MAP_PAGE_SIZE(map),
16840 				    src_entry->vme_start,
16841 				    prot);
16842 
16843 				assert(src_entry->wired_count == 0);
16844 				src_entry->needs_copy = TRUE;
16845 			}
16846 			/*
16847 			 * Throw away the old object reference of the new entry.
16848 			 */
16849 			vm_object_deallocate(object);
16850 		} else {
16851 			new_entry->is_shared = FALSE;
16852 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
16853 
16854 			src_entry_was_wired = (src_entry->wired_count > 0);
16855 			saved_src_entry = src_entry;
16856 			src_entry = VM_MAP_ENTRY_NULL;
16857 
16858 			/*
16859 			 * The map can be safely unlocked since we
16860 			 * already hold a reference on the object.
16861 			 *
16862 			 * Record the timestamp of the map for later
16863 			 * verification, and unlock the map.
16864 			 */
16865 			version.main_timestamp = map->timestamp;
16866 			vm_map_unlock(map);     /* Increments timestamp once! */
16867 
16868 			/*
16869 			 * Perform the copy.
16870 			 */
16871 			if (src_entry_was_wired > 0 ||
16872 			    (debug4k_no_cow_copyin &&
16873 			    VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
16874 				vm_object_lock(object);
16875 				result = vm_object_copy_slowly(
16876 					object,
16877 					offset,
16878 					(new_entry->vme_end -
16879 					new_entry->vme_start),
16880 					THREAD_UNINT,
16881 					&new_copy_object);
16882 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
16883 				saved_used_for_jit = new_entry->used_for_jit;
16884 				VME_OBJECT_SET(new_entry, new_copy_object);
16885 				new_entry->used_for_jit = saved_used_for_jit;
16886 				VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
16887 				new_entry->needs_copy = FALSE;
16888 			} else {
16889 				vm_object_offset_t new_offset;
16890 
16891 				new_offset = VME_OFFSET(new_entry);
16892 				result = vm_object_copy_strategically(
16893 					object,
16894 					offset,
16895 					(new_entry->vme_end -
16896 					new_entry->vme_start),
16897 					&new_copy_object,
16898 					&new_offset,
16899 					&new_entry_needs_copy);
16900 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
16901 				saved_used_for_jit = new_entry->used_for_jit;
16902 				VME_OBJECT_SET(new_entry, new_copy_object);
16903 				new_entry->used_for_jit = saved_used_for_jit;
16904 				if (new_offset != VME_OFFSET(new_entry)) {
16905 					VME_OFFSET_SET(new_entry, new_offset);
16906 				}
16907 
16908 				new_entry->needs_copy = new_entry_needs_copy;
16909 			}
16910 
16911 			/*
16912 			 * Throw away the old object reference of the new entry.
16913 			 */
16914 			vm_object_deallocate(object);
16915 
16916 			if (result != KERN_SUCCESS &&
16917 			    result != KERN_MEMORY_RESTART_COPY) {
16918 				vm_map_entry_dispose(new_entry);
16919 				vm_map_lock(map);
16920 				break;
16921 			}
16922 
16923 			/*
16924 			 * Verify that the map has not substantially
16925 			 * changed while the copy was being made.
16926 			 */
16927 
16928 			vm_map_lock(map);
16929 			if (version.main_timestamp + 1 != map->timestamp) {
16930 				/*
16931 				 * Simple version comparison failed.
16932 				 *
16933 				 * Retry the lookup and verify that the
16934 				 * same object/offset are still present.
16935 				 */
16936 				saved_src_entry = VM_MAP_ENTRY_NULL;
16937 				vm_object_deallocate(VME_OBJECT(new_entry));
16938 				vm_map_entry_dispose(new_entry);
16939 				if (result == KERN_MEMORY_RESTART_COPY) {
16940 					result = KERN_SUCCESS;
16941 				}
16942 				continue;
16943 			}
16944 			/* map hasn't changed: src_entry is still valid */
16945 			src_entry = saved_src_entry;
16946 			saved_src_entry = VM_MAP_ENTRY_NULL;
16947 
16948 			if (result == KERN_MEMORY_RESTART_COPY) {
16949 				vm_object_reference(object);
16950 				goto RestartCopy;
16951 			}
16952 		}
16953 
16954 		_vm_map_store_entry_link(map_header,
16955 		    map_header->links.prev, new_entry);
16956 
16957 		/* protections for submap mapping are irrelevant here */
16958 		if (vm_remap_legacy && !src_entry->is_sub_map) {
16959 			*cur_protection &= src_entry->protection;
16960 			*max_protection &= src_entry->max_protection;
16961 		}
16962 
16963 		map_address += tmp_size;
16964 		mapped_size += tmp_size;
16965 		src_start += tmp_size;
16966 
16967 		if (vmk_flags.vmkf_copy_single_object) {
16968 			if (mapped_size != size) {
16969 				DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n", map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
16970 				if (src_entry->vme_next != vm_map_to_entry(map) &&
16971 				    VME_OBJECT(src_entry->vme_next) == VME_OBJECT(src_entry)) {
16972 					/* XXX TODO4K */
16973 					DEBUG4K_ERROR("could have extended copy to next entry...\n");
16974 				}
16975 			}
16976 			break;
16977 		}
16978 	} /* end while */
16979 
16980 	vm_map_unlock(map);
16981 	if (result != KERN_SUCCESS) {
16982 		/*
16983 		 * Free all allocated elements.
16984 		 */
16985 		for (src_entry = map_header->links.next;
16986 		    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
16987 		    src_entry = new_entry) {
16988 			new_entry = src_entry->vme_next;
16989 			_vm_map_store_entry_unlink(map_header, src_entry);
16990 			if (src_entry->is_sub_map) {
16991 				vm_map_deallocate(VME_SUBMAP(src_entry));
16992 			} else {
16993 				vm_object_deallocate(VME_OBJECT(src_entry));
16994 			}
16995 			vm_map_entry_dispose(src_entry);
16996 		}
16997 	}
16998 	return result;
16999 }
17000 
17001 bool
vm_map_is_exotic(vm_map_t map)17002 vm_map_is_exotic(
17003 	vm_map_t map)
17004 {
17005 	return VM_MAP_IS_EXOTIC(map);
17006 }
17007 
17008 bool
vm_map_is_alien(vm_map_t map)17009 vm_map_is_alien(
17010 	vm_map_t map)
17011 {
17012 	return VM_MAP_IS_ALIEN(map);
17013 }
17014 
17015 #if XNU_TARGET_OS_OSX
/*
 * Mark "map" as "alien", under the exclusive map lock.
 * The flag is only ever set here (never cleared); its consumers read it
 * via VM_MAP_IS_ALIEN() elsewhere.
 */
void
vm_map_mark_alien(
	vm_map_t map)
{
	vm_map_lock(map);
	map->is_alien = true;
	vm_map_unlock(map);
}
17024 
/*
 * Restrict "map" to a single JIT region, under the exclusive map lock.
 * Like vm_map_mark_alien(), this only sets the flag; it is never cleared
 * here.  NOTE(review): presumably consulted by the JIT-entry policy checks
 * (VM_MAP_POLICY_ALLOW_MULTIPLE_JIT) — confirm against the policy macros.
 */
void
vm_map_single_jit(
	vm_map_t map)
{
	vm_map_lock(map);
	map->single_jit = true;
	vm_map_unlock(map);
}
17033 #endif /* XNU_TARGET_OS_OSX */
17034 
/*
 * Callers of this function must call vm_map_copy_require on
 * previously created vm_map_copy_t or pass a newly created
 * one to ensure that it hasn't been forged.
 */
/*
 * vm_map_copy_to_physcopy:
 *	Replace the mappings of "copy_map" (whose page size differs from
 *	"target_map"'s) with a single entry backed by a new VM object
 *	holding a physical copy of the original mappings' contents,
 *	rounded up to the kernel page size.
 *	The copy is done by temporarily mapping both the original
 *	"copy_map" and the new object into a throw-away pageable VM map
 *	and copying page by page through a small kernel buffer.
 *	On success, "copy_map" is modified in place (entries replaced,
 *	page_shift/offset/size updated to match "target_map") and
 *	KERN_SUCCESS is returned.  Returns KERN_RESOURCE_SHORTAGE if the
 *	temporary pmap cannot be created.
 */
static kern_return_t
vm_map_copy_to_physcopy(
	vm_map_copy_t   copy_map,
	vm_map_t        target_map)
{
	vm_map_size_t           size;
	vm_map_entry_t          entry;
	vm_map_entry_t          new_entry;
	vm_object_t             new_object;
	unsigned int            pmap_flags;
	pmap_t                  new_pmap;
	vm_map_t                new_map;
	vm_map_address_t        src_start, src_end, src_cur;
	vm_map_address_t        dst_start, dst_end, dst_cur;
	kern_return_t           kr;
	void                    *kbuf;

	/*
	 * Perform the equivalent of vm_allocate() and memcpy().
	 * Replace the mappings in "copy_map" with the newly allocated mapping.
	 */
	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);

	/*
	 * NOTE(review): this compares a page *shift* against a page *mask*,
	 * so the assertion is trivially true; VM_MAP_PAGE_SHIFT(target_map)
	 * was probably intended — verify against upstream.
	 */
	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));

	/* create a new pmap to map "copy_map" */
	pmap_flags = 0;
	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
#if PMAP_CREATE_FORCE_4K_PAGES
	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
#endif /* PMAP_CREATE_FORCE_4K_PAGES */
	pmap_flags |= PMAP_CREATE_64BIT;
	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
	if (new_pmap == NULL) {
		return KERN_RESOURCE_SHORTAGE;
	}

	/* allocate new VM object */
	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
	new_object = vm_object_allocate(size);
	assert(new_object);

	/* allocate new VM map entry */
	new_entry = vm_map_copy_entry_create(copy_map);
	assert(new_entry);

	/* finish initializing new VM map entry */
	new_entry->protection = VM_PROT_DEFAULT;
	new_entry->max_protection = VM_PROT_DEFAULT;
	new_entry->use_pmap = TRUE;

	/* make new VM map entry point to new VM object */
	new_entry->vme_start = 0;
	new_entry->vme_end = size;
	VME_OBJECT_SET(new_entry, new_object);
	VME_OFFSET_SET(new_entry, 0);

	/* create a new pageable VM map to map "copy_map" */
	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
	    VM_MAP_CREATE_PAGEABLE);
	assert(new_map);
	/* the temporary map uses "copy_map"'s (smaller) page size */
	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);

	/* map "copy_map" in the new VM map */
	src_start = 0;
	kr = vm_map_copyout_internal(
		new_map,
		&src_start,
		copy_map,
		copy_map->size,
		FALSE, /* consume_on_success */
		VM_PROT_DEFAULT,
		VM_PROT_DEFAULT,
		VM_INHERIT_DEFAULT);
	assert(kr == KERN_SUCCESS);
	src_end = src_start + copy_map->size;

	/* map "new_object" in the new VM map */
	vm_object_reference(new_object);
	dst_start = 0;
	kr = vm_map_enter(new_map,
	    &dst_start,
	    size,
	    0,               /* mask */
	    VM_FLAGS_ANYWHERE,
	    VM_MAP_KERNEL_FLAGS_NONE,
	    VM_KERN_MEMORY_OSFMK,
	    new_object,
	    0,               /* offset */
	    FALSE,               /* needs copy */
	    VM_PROT_DEFAULT,
	    VM_PROT_DEFAULT,
	    VM_INHERIT_DEFAULT);
	assert(kr == KERN_SUCCESS);
	dst_end = dst_start + size;

	/* get a kernel buffer (one page, used as the copy bounce buffer) */
	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);

	/* physically copy "copy_map" mappings to new VM object */
	for (src_cur = src_start, dst_cur = dst_start;
	    src_cur < src_end;
	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
		vm_size_t bytes;

		bytes = PAGE_SIZE;
		if (src_cur + PAGE_SIZE > src_end) {
			/* partial copy for last page */
			bytes = src_end - src_cur;
			assert(bytes > 0 && bytes < PAGE_SIZE);
			/* rest of dst page should be zero-filled */
		}
		/* get bytes from src mapping */
		/* NOTE(review): failures here are only logged, not returned */
		kr = copyinmap(new_map, src_cur, kbuf, bytes);
		if (kr != KERN_SUCCESS) {
			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
		}
		/* put bytes in dst mapping */
		assert(dst_cur < dst_end);
		assert(dst_cur + bytes <= dst_end);
		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
		if (kr != KERN_SUCCESS) {
			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
		}
	}

	/* free kernel buffer */
	kfree_data(kbuf, PAGE_SIZE);

	/* destroy new map (also drops the temporary mappings made above) */
	vm_map_destroy(new_map);
	new_map = VM_MAP_NULL;

	/* dispose of the old map entries in "copy_map" */
	while (vm_map_copy_first_entry(copy_map) !=
	    vm_map_copy_to_entry(copy_map)) {
		entry = vm_map_copy_first_entry(copy_map);
		vm_map_copy_entry_unlink(copy_map, entry);
		if (entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(entry));
		} else {
			vm_object_deallocate(VME_OBJECT(entry));
		}
		vm_map_copy_entry_dispose(entry);
	}

	/* change "copy_map"'s page_size to match "target_map" */
	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
	copy_map->offset = 0;
	copy_map->size = size;

	/* insert new map entry in "copy_map" */
	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);

	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
	return KERN_SUCCESS;
}
17198 
17199 void
17200 vm_map_copy_adjust_get_target_copy_map(
17201 	vm_map_copy_t   copy_map,
17202 	vm_map_copy_t   *target_copy_map_p);
17203 void
vm_map_copy_adjust_get_target_copy_map(vm_map_copy_t copy_map,vm_map_copy_t * target_copy_map_p)17204 vm_map_copy_adjust_get_target_copy_map(
17205 	vm_map_copy_t   copy_map,
17206 	vm_map_copy_t   *target_copy_map_p)
17207 {
17208 	vm_map_copy_t   target_copy_map;
17209 	vm_map_entry_t  entry, target_entry;
17210 
17211 	if (*target_copy_map_p != VM_MAP_COPY_NULL) {
17212 		/* the caller already has a "target_copy_map": use it */
17213 		return;
17214 	}
17215 
17216 	/* the caller wants us to create a new copy of "copy_map" */
17217 	target_copy_map = vm_map_copy_allocate();
17218 	target_copy_map->type = copy_map->type;
17219 	assert(target_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
17220 	target_copy_map->offset = copy_map->offset;
17221 	target_copy_map->size = copy_map->size;
17222 	target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
17223 	vm_map_store_init(&target_copy_map->cpy_hdr);
17224 	for (entry = vm_map_copy_first_entry(copy_map);
17225 	    entry != vm_map_copy_to_entry(copy_map);
17226 	    entry = entry->vme_next) {
17227 		target_entry = vm_map_copy_entry_create(target_copy_map);
17228 		vm_map_entry_copy_full(target_entry, entry);
17229 		if (target_entry->is_sub_map) {
17230 			vm_map_reference(VME_SUBMAP(target_entry));
17231 		} else {
17232 			vm_object_reference(VME_OBJECT(target_entry));
17233 		}
17234 		vm_map_copy_entry_link(
17235 			target_copy_map,
17236 			vm_map_copy_last_entry(target_copy_map),
17237 			target_entry);
17238 	}
17239 	entry = VM_MAP_ENTRY_NULL;
17240 	*target_copy_map_p = target_copy_map;
17241 }
17242 
/*
 * Callers of this function must call vm_map_copy_require on
 * previously created vm_map_copy_t or pass a newly created
 * one to ensure that it hasn't been forged.
 */
/*
 * vm_map_copy_trim:
 *	Remove the range [trim_start, trim_end) from "copy_map".  The
 *	bounds are given as offsets relative to the start of the first
 *	entry.  Clipping is performed at "new_page_shift" granularity;
 *	"copy_map"'s original page shift is restored before returning.
 *	"copy_map"'s size is reduced by the amount actually unlinked.
 */
static void
vm_map_copy_trim(
	vm_map_copy_t   copy_map,
	uint16_t        new_page_shift,
	vm_map_offset_t trim_start,
	vm_map_offset_t trim_end)
{
	uint16_t        copy_page_shift;
	vm_map_entry_t  entry, next_entry;

	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
	assert(copy_map->cpy_hdr.nentries > 0);

	/* convert relative offsets to absolute addresses in the copy map */
	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;

	/* use the new page_shift to do the clipping */
	/* (temporarily overridden so the clip macros align to the new size) */
	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_map->cpy_hdr.page_shift = new_page_shift;

	for (entry = vm_map_copy_first_entry(copy_map);
	    entry != vm_map_copy_to_entry(copy_map);
	    entry = next_entry) {
		next_entry = entry->vme_next;
		if (entry->vme_end <= trim_start) {
			/* entry fully before trim range: skip */
			continue;
		}
		if (entry->vme_start >= trim_end) {
			/* entry fully after trim range: done */
			break;
		}
		/* clip entry if needed */
		vm_map_copy_clip_start(copy_map, entry, trim_start);
		vm_map_copy_clip_end(copy_map, entry, trim_end);
		/* dispose of entry */
		copy_map->size -= entry->vme_end - entry->vme_start;
		vm_map_copy_entry_unlink(copy_map, entry);
		if (entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(entry));
		} else {
			vm_object_deallocate(VME_OBJECT(entry));
		}
		vm_map_copy_entry_dispose(entry);
		entry = VM_MAP_ENTRY_NULL;
	}

	/* restore copy_map's original page_shift */
	copy_map->cpy_hdr.page_shift = copy_page_shift;
}
17298 
/*
 * Make any necessary adjustments to "copy_map" to allow it to be
 * mapped into "target_map".
 * If no changes were necessary, "target_copy_map" points to the
 * untouched "copy_map".
 * If changes are necessary, changes will be made to "target_copy_map".
 * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
 * copy the original "copy_map" to it before applying the changes.
 * The caller should discard "target_copy_map" if it's not the same as
 * the original "copy_map".
 *
 * "offset"/"size" select the sub-range of "copy_map" the caller wants;
 * whatever lies outside (rounded to "target_map"'s page size) is
 * trimmed.  On success:
 *	*trimmed_start_p: amount trimmed from the start,
 *	*overmap_start_p / *overmap_end_p: amount over-mapped at each end
 *	    to re-align to "target_map"'s page size.
 */
/* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
kern_return_t
vm_map_copy_adjust_to_target(
	vm_map_copy_t           src_copy_map,
	vm_map_offset_t         offset,
	vm_map_size_t           size,
	vm_map_t                target_map,
	boolean_t               copy,
	vm_map_copy_t           *target_copy_map_p,
	vm_map_offset_t         *overmap_start_p,
	vm_map_offset_t         *overmap_end_p,
	vm_map_offset_t         *trimmed_start_p)
{
	vm_map_copy_t           copy_map, target_copy_map;
	vm_map_size_t           target_size;
	vm_map_size_t           src_copy_map_size;
	vm_map_size_t           overmap_start, overmap_end;
	int                     misalignments;
	vm_map_entry_t          entry, target_entry;
	vm_map_offset_t         addr_adjustment;
	vm_map_offset_t         new_start, new_end;
	int                     copy_page_mask, target_page_mask;
	uint16_t                copy_page_shift, target_page_shift;
	vm_map_offset_t         trimmed_end;

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(src_copy_map);
	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);

	/*
	 * Start working with "src_copy_map" but we'll switch
	 * to "target_copy_map" as soon as we start making adjustments.
	 */
	copy_map = src_copy_map;
	src_copy_map_size = src_copy_map->size;

	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
	target_page_mask = VM_MAP_PAGE_MASK(target_map);

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p);

	target_copy_map = *target_copy_map_p;
	if (target_copy_map != VM_MAP_COPY_NULL) {
		vm_map_copy_require(target_copy_map);
	}

	/* requested sub-range must lie within "copy_map" */
	if (offset + size > copy_map->size) {
		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size);
		return KERN_INVALID_ARGUMENT;
	}

	/* trim the end */
	trimmed_end = 0;
	new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask);
	if (new_end < copy_map->size) {
		trimmed_end = src_copy_map_size - new_end;
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    new_end, copy_map->size);
	}

	/* trim the start */
	new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask);
	if (new_start != 0) {
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    0, new_start);
	}
	*trimmed_start_p = new_start;

	/* target_size starts with what's left after trimming */
	target_size = copy_map->size;
	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);

	/* check for misalignments but don't adjust yet */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	if (copy_page_shift < target_page_shift) {
		/*
		 * Remapping from 4K to 16K: check the VM object alignments
		 * throughout the range.
		 * If the start and end of the range are mis-aligned, we can
		 * over-map to re-align, and adjust the "overmap" start/end
		 * and "target_size" of the range accordingly.
		 * If there is any mis-alignment within the range:
		 *     if "copy":
		 *         we can do immediate-copy instead of copy-on-write,
		 *     else:
		 *         no way to remap and share; fail.
		 */
		for (entry = vm_map_copy_first_entry(copy_map);
		    entry != vm_map_copy_to_entry(copy_map);
		    entry = entry->vme_next) {
			vm_object_offset_t object_offset_start, object_offset_end;

			object_offset_start = VME_OFFSET(entry);
			object_offset_end = object_offset_start;
			object_offset_end += entry->vme_end - entry->vme_start;
			if (object_offset_start & target_page_mask) {
				/* only the first entry may be fixed by over-mapping */
				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
					overmap_start++;
				} else {
					misalignments++;
				}
			}
			if (object_offset_end & target_page_mask) {
				/* only the last entry may be fixed by over-mapping */
				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
					overmap_end++;
				} else {
					misalignments++;
				}
			}
		}
	}
	entry = VM_MAP_ENTRY_NULL;

	/* decide how to deal with misalignments */
	assert(overmap_start <= 1);
	assert(overmap_end <= 1);
	if (!overmap_start && !overmap_end && !misalignments) {
		/* copy_map is properly aligned for target_map ... */
		if (*trimmed_start_p) {
			/* ... but we trimmed it, so still need to adjust */
		} else {
			/* ... and we didn't trim anything: we're done */
			if (target_copy_map == VM_MAP_COPY_NULL) {
				target_copy_map = copy_map;
			}
			*target_copy_map_p = target_copy_map;
			*overmap_start_p = 0;
			*overmap_end_p = 0;
			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
			return KERN_SUCCESS;
		}
	} else if (misalignments && !copy) {
		/* can't "share" if misaligned */
		DEBUG4K_ADJUST("unsupported sharing\n");
#if MACH_ASSERT
		if (debug4k_panic_on_misaligned_sharing) {
			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
		}
#endif /* MACH_ASSERT */
		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
		return KERN_NOT_SUPPORTED;
	} else {
		/* can't virtual-copy if misaligned (but can physical-copy) */
		DEBUG4K_ADJUST("mis-aligned copying\n");
	}

	/* get a "target_copy_map" if needed and switch to it */
	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
	copy_map = target_copy_map;

	if (misalignments && copy) {
		vm_map_size_t target_copy_map_size;

		/*
		 * Can't do copy-on-write with misaligned mappings.
		 * Replace the mappings with a physical copy of the original
		 * mappings' contents.
		 */
		target_copy_map_size = target_copy_map->size;
		kern_return_t kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		*target_copy_map_p = target_copy_map;
		*overmap_start_p = 0;
		/* physcopy rounded the size up: report the growth as overmap_end */
		*overmap_end_p = target_copy_map->size - target_copy_map_size;
		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
		return KERN_SUCCESS;
	}

	/* apply the adjustments */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	/* remove copy_map->offset, so that everything starts at offset 0 */
	addr_adjustment = copy_map->offset;
	/* also remove whatever we trimmed from the start */
	addr_adjustment += *trimmed_start_p;
	for (target_entry = vm_map_copy_first_entry(target_copy_map);
	    target_entry != vm_map_copy_to_entry(target_copy_map);
	    target_entry = target_entry->vme_next) {
		vm_object_offset_t object_offset_start, object_offset_end;

		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
		object_offset_start = VME_OFFSET(target_entry);
		if (object_offset_start & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
				/*
				 * start of 1st entry is mis-aligned:
				 * re-adjust by over-mapping.
				 */
				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				assert(copy);
			}
		}

		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
			/* the first entry grows backwards; later ones shift forward */
			target_size += overmap_start;
		} else {
			target_entry->vme_start += overmap_start;
		}
		target_entry->vme_end += overmap_start;

		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
		if (object_offset_end & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
				/*
				 * end of last entry is mis-aligned: re-adjust by over-mapping.
				 */
				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
				target_entry->vme_end += overmap_end;
				target_size += overmap_end;
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				assert(copy);
			}
		}
		/* rebase the entry so the copy map starts at address 0 */
		target_entry->vme_start -= addr_adjustment;
		target_entry->vme_end -= addr_adjustment;
		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
	}

	target_copy_map->size = target_size;
	target_copy_map->offset += overmap_start;
	target_copy_map->offset -= addr_adjustment;
	target_copy_map->cpy_hdr.page_shift = target_page_shift;

//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));

	*target_copy_map_p = target_copy_map;
	*overmap_start_p = overmap_start;
	*overmap_end_p = overmap_end;

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
	return KERN_SUCCESS;
}
17578 
/*
 * vm_map_range_physical_size:
 *	Compute, in *phys_size, the size that the range
 *	[start, start+size) of "map" would occupy when re-aligned to the
 *	kernel's page size.  For maps whose page size already matches
 *	PAGE_SIZE this is just the map-page-rounded size; otherwise the
 *	range is extracted and run through vm_map_copy_adjust_to_target()
 *	against kernel_map to account for trimming and over-mapping.
 *	On failure, *phys_size is set to 0 and the error is returned.
 */
kern_return_t
vm_map_range_physical_size(
	vm_map_t         map,
	vm_map_address_t start,
	mach_vm_size_t   size,
	mach_vm_size_t * phys_size)
{
	kern_return_t   kr;
	vm_map_copy_t   copy_map, target_copy_map;
	vm_map_offset_t adjusted_start, adjusted_end;
	vm_map_size_t   adjusted_size;
	vm_prot_t       cur_prot, max_prot;
	vm_map_offset_t overmap_start, overmap_end, trimmed_start;
	vm_map_kernel_flags_t vmk_flags;

	adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
	adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
	adjusted_size = adjusted_end - adjusted_start;
	*phys_size = adjusted_size;
	if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
		/* map already uses the kernel page size: nothing to adjust */
		return KERN_SUCCESS;
	}
	if (start == 0) {
		/*
		 * NOTE(review): address 0 is special-cased to a plain
		 * PAGE_MASK rounding without extraction — presumably because
		 * 0 can't be extracted; confirm against callers.
		 */
		adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
		adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
		adjusted_size = adjusted_end - adjusted_start;
		*phys_size = adjusted_size;
		return KERN_SUCCESS;
	}
	if (adjusted_size == 0) {
		DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx adjusted 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_size);
		*phys_size = 0;
		return KERN_SUCCESS;
	}

	/* extract the range (shared, pageable) so it can be measured */
	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	vmk_flags.vmkf_copy_pageable = TRUE;
	vmk_flags.vmkf_copy_same_map = TRUE;
	assert(adjusted_size != 0);
	cur_prot = VM_PROT_NONE; /* legacy mode */
	max_prot = VM_PROT_NONE; /* legacy mode */
	kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
	    FALSE /* copy */,
	    &copy_map,
	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
	    vmk_flags);
	if (kr != KERN_SUCCESS) {
		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
		//assert(0);
		*phys_size = 0;
		return kr;
	}
	assert(copy_map != VM_MAP_COPY_NULL);
	target_copy_map = copy_map;
	DEBUG4K_ADJUST("adjusting...\n");
	kr = vm_map_copy_adjust_to_target(
		copy_map,
		start - adjusted_start, /* offset */
		size, /* size */
		kernel_map,
		FALSE,                          /* copy */
		&target_copy_map,
		&overmap_start,
		&overmap_end,
		&trimmed_start);
	if (kr == KERN_SUCCESS) {
		if (target_copy_map->size != *phys_size) {
			DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
		}
		*phys_size = target_copy_map->size;
	} else {
		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
		//assert(0);
		*phys_size = 0;
	}
	/* the extracted copy was only needed for measurement: discard it */
	vm_map_copy_discard(copy_map);
	copy_map = VM_MAP_COPY_NULL;

	return kr;
}
17659 
17660 
/*
 * memory_entry_check_for_adjustment:
 *	Given a named-entry port "port", check whether its backing copy
 *	map would need page-size adjustment to be mapped from "src_map"
 *	(i.e. when src_map uses sub-kernel-size pages).  Reports the
 *	required over-mapping at each end through *overmap_start and
 *	*overmap_end.  Runs under the named entry's lock.
 */
kern_return_t
memory_entry_check_for_adjustment(
	vm_map_t                        src_map,
	ipc_port_t                      port,
	vm_map_offset_t         *overmap_start,
	vm_map_offset_t         *overmap_end)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL;

	assert(port);
	assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port));

	vm_named_entry_t        named_entry;

	named_entry = mach_memory_entry_from_port(port);
	named_entry_lock(named_entry);
	copy_map = named_entry->backing.copy;
	target_copy_map = copy_map;

	/* only sub-kernel-page-size source maps can need adjustment */
	if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) {
		vm_map_offset_t trimmed_start;

		trimmed_start = 0;
		DEBUG4K_ADJUST("adjusting...\n");
		kr = vm_map_copy_adjust_to_target(
			copy_map,
			0, /* offset */
			copy_map->size, /* size */
			src_map,
			FALSE, /* copy */
			&target_copy_map,
			overmap_start,
			overmap_end,
			&trimmed_start);
		/* whole range requested, so nothing should have been trimmed */
		assert(trimmed_start == 0);
	}
	named_entry_unlock(named_entry);

	return kr;
}
17702 
17703 
17704 /*
17705  *	Routine:	vm_remap
17706  *
17707  *			Map portion of a task's address space.
17708  *			Mapped region must not overlap more than
17709  *			one vm memory object. Protections and
17710  *			inheritance attributes remain the same
17711  *			as in the original task and are	out parameters.
 *			as in the original task and are out parameters.
 *			Source and Target task can be identical
 *			Other attributes are identical to those for vm_map()
kern_return_t
vm_map_remap(
	vm_map_t                target_map,
	vm_map_address_t        *address,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	int                     flags,
	vm_map_kernel_flags_t   vmk_flags,
	vm_tag_t                tag,
	vm_map_t                src_map,
	vm_map_offset_t         memory_address,
	boolean_t               copy,
	vm_prot_t               *cur_protection, /* IN/OUT */
	vm_prot_t               *max_protection, /* IN/OUT */
	vm_inherit_t            inheritance)
{
	kern_return_t           result;
	vm_map_entry_t          entry;
	vm_map_entry_t          insp_entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t          new_entry;
	vm_map_copy_t           copy_map;
	vm_map_offset_t         offset_in_mapping;
	vm_map_size_t           target_size = 0;
	vm_map_size_t           src_page_mask, target_page_mask;
	vm_map_offset_t         overmap_start, overmap_end, trimmed_start;
	vm_map_offset_t         initial_memory_address;
	vm_map_size_t           initial_size;
	VM_MAP_ZAP_DECLARE(zap_list);

	if (target_map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Remember the caller's original (un-rounded) request: it is
	 * needed later to compute "offset_in_mapping" and for the
	 * page-size adjustment of the extracted copy.
	 */
	initial_memory_address = memory_address;
	initial_size = size;
	src_page_mask = VM_MAP_PAGE_MASK(src_map);
	target_page_mask = VM_MAP_PAGE_MASK(target_map);

	/*
	 * Validate the inheritance value; also reject an empty range or
	 * a NULL source map up front.
	 */
	switch (inheritance) {
	case VM_INHERIT_NONE:
	case VM_INHERIT_COPY:
	case VM_INHERIT_SHARE:
		if (size != 0 && src_map != VM_MAP_NULL) {
			break;
		}
		OS_FALLTHROUGH;
	default:
		return KERN_INVALID_ARGUMENT;
	}

	/* debug logging for remaps between maps with different page sizes */
	if (src_page_mask != target_page_mask) {
		if (copy) {
			DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
		} else {
			DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
		}
	}

	/*
	 * If the user is requesting that we return the address of the
	 * first byte of the data (rather than the base of the page),
	 * then we use different rounding semantics: specifically,
	 * we assume that (memory_address, size) describes a region
	 * all of whose pages we must cover, rather than a base to be truncated
	 * down and a size to be added to that base.  So we figure out
	 * the highest page that the requested region includes and make
	 * sure that the size will cover it.
	 *
	 * The key example we're worried about it is of the form:
	 *
	 *              memory_address = 0x1ff0, size = 0x20
	 *
	 * With the old semantics, we round down the memory_address to 0x1000
	 * and round up the size to 0x1000, resulting in our covering *only*
	 * page 0x1000.  With the new semantics, we'd realize that the region covers
	 * 0x1ff0-0x2010, and compute a size of 0x2000.  Thus, we cover both page
	 * 0x1000 and page 0x2000 in the region we remap.
	 */
	if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) {
		vm_map_offset_t range_start, range_end;

		range_start = vm_map_trunc_page(memory_address, src_page_mask);
		range_end = vm_map_round_page(memory_address + size, src_page_mask);
		memory_address = range_start;
		size = range_end - range_start;
		/* distance from the page base back to the requested byte */
		offset_in_mapping = initial_memory_address - memory_address;
	} else {
		/*
		 * IMPORTANT:
		 * This legacy code path is broken: for the range mentioned
		 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
		 * two 4k pages, it yields [ memory_address = 0x1000,
		 * size = 0x1000 ], which covers only the first 4k page.
		 * BUT some code unfortunately depends on this bug, so we
		 * can't fix it without breaking something.
		 * New code should get automatically opted in the new
		 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
		 */
		offset_in_mapping = 0;
		memory_address = vm_map_trunc_page(memory_address, src_page_mask);
		size = vm_map_round_page(size, src_page_mask);
		initial_memory_address = memory_address;
		initial_size = size;
	}


	/* rounding above could not shrink a non-zero size, but be safe */
	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	if (flags & VM_FLAGS_RESILIENT_MEDIA) {
		/* must be copy-on-write to be "media resilient" */
		if (!copy) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
	vmk_flags.vmkf_copy_same_map = (src_map == target_map);

	assert(size != 0);
	/*
	 * Extract the requested range from the source map into a
	 * transient vm_map_copy; the protections are narrowed in place
	 * (IN/OUT) to what the source actually allows.
	 */
	result = vm_map_copy_extract(src_map,
	    memory_address,
	    size,
	    copy, &copy_map,
	    cur_protection, /* IN/OUT */
	    max_protection, /* IN/OUT */
	    inheritance,
	    vmk_flags);
	if (result != KERN_SUCCESS) {
		return result;
	}
	assert(copy_map != VM_MAP_COPY_NULL);

	overmap_start = 0;
	overmap_end = 0;
	trimmed_start = 0;
	target_size = size;
	if (src_page_mask != target_page_mask) {
		/*
		 * Source and target maps use different page sizes:
		 * re-align the extracted copy to the target's page size,
		 * tracking how much was over-mapped at either end and how
		 * much was trimmed from the start.
		 */
		vm_map_copy_t target_copy_map;

		target_copy_map = copy_map; /* can modify "copy_map" itself */
		DEBUG4K_ADJUST("adjusting...\n");
		result = vm_map_copy_adjust_to_target(
			copy_map,
			offset_in_mapping, /* offset */
			initial_size,
			target_map,
			copy,
			&target_copy_map,
			&overmap_start,
			&overmap_end,
			&trimmed_start);
		if (result != KERN_SUCCESS) {
			DEBUG4K_COPY("failed to adjust 0x%x\n", result);
			vm_map_copy_discard(copy_map);
			return result;
		}
		if (trimmed_start == 0) {
			/* nothing trimmed: no adjustment needed */
		} else if (trimmed_start >= offset_in_mapping) {
			/* trimmed more than offset_in_mapping: nothing left */
			assert(overmap_start == 0);
			assert(overmap_end == 0);
			offset_in_mapping = 0;
		} else {
			/* trimmed some of offset_in_mapping: adjust */
			assert(overmap_start == 0);
			assert(overmap_end == 0);
			offset_in_mapping -= trimmed_start;
		}
		offset_in_mapping += overmap_start;
		target_size = target_copy_map->size;
	}

	/*
	 * Allocate/check a range of free virtual address
	 * space for the target
	 */
	*address = vm_map_trunc_page(*address, target_page_mask);
	vm_map_lock(target_map);
	target_size = vm_map_round_page(target_size, target_page_mask);
	result = vm_map_remap_range_allocate(target_map, address,
	    target_size, mask, flags, vmk_flags, tag,
	    &insp_entry, &zap_list);

	/*
	 * Move each entry out of the copy and, on success, rebase it to
	 * the allocated address and link it into the target map.  On
	 * failure, drop each entry's object/submap reference instead.
	 * Either way the copy ends up empty.
	 */
	for (entry = vm_map_copy_first_entry(copy_map);
	    entry != vm_map_copy_to_entry(copy_map);
	    entry = new_entry) {
		new_entry = entry->vme_next;
		vm_map_copy_entry_unlink(copy_map, entry);
		if (result == KERN_SUCCESS) {
			if (flags & VM_FLAGS_RESILIENT_CODESIGN) {
				/* no codesigning -> read-only access */
				entry->max_protection = VM_PROT_READ;
				entry->protection = VM_PROT_READ;
				entry->vme_resilient_codesign = TRUE;
			}
			/* copy entries are 0-based: rebase to the new range */
			entry->vme_start += *address;
			entry->vme_end += *address;
			assert(!entry->map_aligned);
			if ((flags & VM_FLAGS_RESILIENT_MEDIA) &&
			    !entry->is_sub_map &&
			    (VME_OBJECT(entry) == VM_OBJECT_NULL ||
			    VME_OBJECT(entry)->internal)) {
				entry->vme_resilient_media = TRUE;
			}
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
			assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
			vm_map_store_entry_link(target_map, insp_entry, entry,
			    vmk_flags);
			insp_entry = entry;
		} else {
			if (!entry->is_sub_map) {
				vm_object_deallocate(VME_OBJECT(entry));
			} else {
				vm_map_deallocate(VME_SUBMAP(entry));
			}
			vm_map_copy_entry_dispose(entry);
		}
	}

	if (flags & VM_FLAGS_RESILIENT_CODESIGN) {
		/* reflect the read-only downgrade back to the caller */
		*cur_protection = VM_PROT_READ;
		*max_protection = VM_PROT_READ;
	}

	if (result == KERN_SUCCESS) {
		target_map->size += target_size;
		SAVE_HINT_MAP_WRITE(target_map, insp_entry);

	}
	vm_map_unlock(target_map);

	/*
	 * Now that the map is unlocked, dispose of anything that a
	 * VM_FLAGS_OVERWRITE request removed from the target range.
	 */
	vm_map_zap_dispose(&zap_list);

	if (result == KERN_SUCCESS && target_map->wiring_required) {
		result = vm_map_wire_kernel(target_map, *address,
		    *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
		    TRUE);
	}

	/*
	 * If requested, return the address of the data pointed to by the
	 * request, rather than the base of the resulting page.
	 */
	if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) {
		*address += offset_in_mapping;
	}

	if (src_page_mask != target_page_mask) {
		DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx  result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result);
	}
	/* the copy's entries were all unlinked above; discard the shell */
	vm_map_copy_discard(copy_map);
	copy_map = VM_MAP_COPY_NULL;

	return result;
}
17974 
17975 /*
17976  *	Routine:	vm_map_remap_range_allocate
17977  *
17978  *	Description:
17979  *		Allocate a range in the specified virtual address map.
17980  *		returns the address and the map entry just before the allocated
17981  *		range
17982  *
17983  *	Map must be locked.
17984  */
17985 
static kern_return_t
vm_map_remap_range_allocate(
	vm_map_t                map,
	vm_map_address_t        *address,       /* IN/OUT */
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	int                     flags,
	vm_map_kernel_flags_t   vmk_flags,
	__unused vm_tag_t       tag,
	vm_map_entry_t          *map_entry,     /* OUT */
	vm_map_zap_t            zap_list)
{
	vm_map_entry_t  entry;
	vm_map_offset_t start;
	kern_return_t   kr;

	start = *address;

	if (flags & VM_FLAGS_ANYWHERE) {
		/* the kernel picks the address; *address is only a hint */
		if (flags & VM_FLAGS_RANDOM_ADDR) {
			vmk_flags.vmkf_random_address = true;
		}
		if (start) {
			/*
			 * NOTE(review): presumably derives a kmem range id
			 * from the hint address — confirm against
			 * kmem_addr_get_range().
			 */
			vmk_flags.vmkf_range_id = kmem_addr_get_range(start, size);
		}

		kr = vm_map_locate_space(map, size, mask, vmk_flags,
		    &start, &entry);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		*address = start;
	} else {
		/* fixed-address request: validate the caller's range */
		vm_map_entry_t  temp_entry;
		vm_map_offset_t end;

		/*
		 *	Verify that:
		 *		the address doesn't itself violate
		 *		the mask requirement.
		 */

		if ((start & mask) != 0) {
			return KERN_NO_SPACE;
		}


		/*
		 *	...	the address is within bounds
		 */

		end = start + size;

		if ((start < map->min_offset) ||
		    (end > map->max_offset) ||
		    (start >= end)) {
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * If we're asked to overwrite whatever was mapped in that
		 * range, first deallocate that range.
		 */
		if (flags & VM_FLAGS_OVERWRITE) {
			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;

			/*
			 * We use a "zap_list" to avoid having to unlock
			 * the "map" in vm_map_delete(), which would compromise
			 * the atomicity of the "deallocate" and then "remap"
			 * combination.
			 */
			remove_flags |= VM_MAP_REMOVE_NO_YIELD;

			if (vmk_flags.vmkf_overwrite_immutable) {
				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
			}
			(void)vm_map_delete(map, start, end,
			    remove_flags, zap_list);
		}

		/*
		 *	...	the starting address isn't allocated
		 */

		if (vm_map_lookup_entry(map, start, &temp_entry)) {
			return KERN_NO_SPACE;
		}

		entry = temp_entry;

		/*
		 *	...	the next region doesn't overlap the
		 *		end point.
		 */

		if ((entry->vme_next != vm_map_to_entry(map)) &&
		    (entry->vme_next->vme_start < end)) {
			return KERN_NO_SPACE;
		}
	}
	/* return the entry just before the allocated range */
	*map_entry = entry;
	return KERN_SUCCESS;
}
18090 
18091 /*
18092  *	vm_map_switch:
18093  *
18094  *	Set the address map for the current thread to the specified map
18095  */
18096 
18097 vm_map_t
vm_map_switch(vm_map_t map)18098 vm_map_switch(
18099 	vm_map_t        map)
18100 {
18101 	int             mycpu;
18102 	thread_t        thread = current_thread();
18103 	vm_map_t        oldmap = thread->map;
18104 
18105 	mp_disable_preemption();
18106 	mycpu = cpu_number();
18107 
18108 	/*
18109 	 *	Deactivate the current map and activate the requested map
18110 	 */
18111 	PMAP_SWITCH_USER(thread, map, mycpu);
18112 
18113 	mp_enable_preemption();
18114 	return oldmap;
18115 }
18116 
18117 
18118 /*
18119  *	Routine:	vm_map_write_user
18120  *
18121  *	Description:
18122  *		Copy out data from a kernel space into space in the
18123  *		destination map. The space must already exist in the
18124  *		destination map.
18125  *		NOTE:  This routine should only be called by threads
18126  *		which can block on a page fault. i.e. kernel mode user
18127  *		threads.
18128  *
18129  */
18130 kern_return_t
vm_map_write_user(vm_map_t map,void * src_p,vm_map_address_t dst_addr,vm_size_t size)18131 vm_map_write_user(
18132 	vm_map_t                map,
18133 	void                    *src_p,
18134 	vm_map_address_t        dst_addr,
18135 	vm_size_t               size)
18136 {
18137 	kern_return_t   kr = KERN_SUCCESS;
18138 
18139 	if (current_map() == map) {
18140 		if (copyout(src_p, dst_addr, size)) {
18141 			kr = KERN_INVALID_ADDRESS;
18142 		}
18143 	} else {
18144 		vm_map_t        oldmap;
18145 
18146 		/* take on the identity of the target map while doing */
18147 		/* the transfer */
18148 
18149 		vm_map_reference(map);
18150 		oldmap = vm_map_switch(map);
18151 		if (copyout(src_p, dst_addr, size)) {
18152 			kr = KERN_INVALID_ADDRESS;
18153 		}
18154 		vm_map_switch(oldmap);
18155 		vm_map_deallocate(map);
18156 	}
18157 	return kr;
18158 }
18159 
18160 /*
18161  *	Routine:	vm_map_read_user
18162  *
18163  *	Description:
18164  *		Copy in data from a user space source map into the
18165  *		kernel map. The space must already exist in the
18166  *		kernel map.
18167  *		NOTE:  This routine should only be called by threads
18168  *		which can block on a page fault. i.e. kernel mode user
18169  *		threads.
18170  *
18171  */
18172 kern_return_t
vm_map_read_user(vm_map_t map,vm_map_address_t src_addr,void * dst_p,vm_size_t size)18173 vm_map_read_user(
18174 	vm_map_t                map,
18175 	vm_map_address_t        src_addr,
18176 	void                    *dst_p,
18177 	vm_size_t               size)
18178 {
18179 	kern_return_t   kr = KERN_SUCCESS;
18180 
18181 	if (current_map() == map) {
18182 		if (copyin(src_addr, dst_p, size)) {
18183 			kr = KERN_INVALID_ADDRESS;
18184 		}
18185 	} else {
18186 		vm_map_t        oldmap;
18187 
18188 		/* take on the identity of the target map while doing */
18189 		/* the transfer */
18190 
18191 		vm_map_reference(map);
18192 		oldmap = vm_map_switch(map);
18193 		if (copyin(src_addr, dst_p, size)) {
18194 			kr = KERN_INVALID_ADDRESS;
18195 		}
18196 		vm_map_switch(oldmap);
18197 		vm_map_deallocate(map);
18198 	}
18199 	return kr;
18200 }
18201 
18202 
18203 /*
18204  *	vm_map_check_protection:
18205  *
18206  *	Assert that the target map allows the specified
18207  *	privilege on the entire address region given.
18208  *	The entire region must be allocated.
18209  */
18210 boolean_t
vm_map_check_protection(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t protection)18211 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
18212     vm_map_offset_t end, vm_prot_t protection)
18213 {
18214 	vm_map_entry_t entry;
18215 	vm_map_entry_t tmp_entry;
18216 
18217 	vm_map_lock(map);
18218 
18219 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
18220 		vm_map_unlock(map);
18221 		return FALSE;
18222 	}
18223 
18224 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
18225 		vm_map_unlock(map);
18226 		return FALSE;
18227 	}
18228 
18229 	entry = tmp_entry;
18230 
18231 	while (start < end) {
18232 		if (entry == vm_map_to_entry(map)) {
18233 			vm_map_unlock(map);
18234 			return FALSE;
18235 		}
18236 
18237 		/*
18238 		 *	No holes allowed!
18239 		 */
18240 
18241 		if (start < entry->vme_start) {
18242 			vm_map_unlock(map);
18243 			return FALSE;
18244 		}
18245 
18246 		/*
18247 		 * Check protection associated with entry.
18248 		 */
18249 
18250 		if ((entry->protection & protection) != protection) {
18251 			vm_map_unlock(map);
18252 			return FALSE;
18253 		}
18254 
18255 		/* go to next entry */
18256 
18257 		start = entry->vme_end;
18258 		entry = entry->vme_next;
18259 	}
18260 	vm_map_unlock(map);
18261 	return TRUE;
18262 }
18263 
kern_return_t
vm_map_purgable_control(
	vm_map_t                map,
	vm_map_offset_t         address,
	vm_purgable_t           control,
	int                     *state)
{
	vm_map_entry_t          entry;
	vm_object_t             object;
	kern_return_t           kr;
	boolean_t               was_nonvolatile;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlaying object.  Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control != VM_PURGABLE_SET_STATE &&
	    control != VM_PURGABLE_GET_STATE &&
	    control != VM_PURGABLE_PURGE_ALL &&
	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control == VM_PURGABLE_PURGE_ALL) {
		/* global operation: no map lookup needed */
		vm_purgeable_object_purge_all();
		return KERN_SUCCESS;
	}

	/* for SET operations, reject state values with invalid bits */
	if ((control == VM_PURGABLE_SET_STATE ||
	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
		return KERN_INVALID_ARGUMENT;
	}

	/* a read lock suffices: we only inspect the entry, not modify it */
	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->protection & VM_PROT_WRITE) == 0 &&
	    control != VM_PURGABLE_GET_STATE) {
		/*
		 * Can't apply purgable controls to something you can't write.
		 */
		vm_map_unlock_read(map);
		return KERN_PROTECTION_FAILURE;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL ||
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Object must already be present and be purgeable.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	vm_object_lock(object);

#if 00
	if (VME_OFFSET(entry) != 0 ||
	    entry->vme_end - entry->vme_start != object->vo_size) {
		/*
		 * Can only apply purgable controls to the whole (existing)
		 * object at once.
		 */
		vm_map_unlock_read(map);
		vm_object_unlock(object);
		return KERN_INVALID_ARGUMENT;
	}
#endif

	assert(!entry->is_sub_map);
	assert(!entry->use_pmap); /* purgeable has its own accounting */

	/*
	 * The object lock keeps the object alive and stable from here
	 * on, so the map lock can be dropped before the state change.
	 */
	vm_map_unlock_read(map);

	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);

	kr = vm_object_purgable_control(object, control, state);

	if (was_nonvolatile &&
	    object->purgable != VM_PURGABLE_NONVOLATILE &&
	    map->pmap == kernel_pmap) {
		/* record the kernel as the volatilizer for debugging */
#if DEBUG
		object->vo_purgeable_volatilizer = kernel_task;
#endif /* DEBUG */
	}

	vm_object_unlock(object);

	return kr;
}
18368 
18369 void
vm_map_footprint_query_page_info(vm_map_t map,vm_map_entry_t map_entry,vm_map_offset_t curr_s_offset,int * disposition_p)18370 vm_map_footprint_query_page_info(
18371 	vm_map_t        map,
18372 	vm_map_entry_t  map_entry,
18373 	vm_map_offset_t curr_s_offset,
18374 	int             *disposition_p)
18375 {
18376 	int             pmap_disp;
18377 	vm_object_t     object;
18378 	int             disposition;
18379 	int             effective_page_size;
18380 
18381 	vm_map_lock_assert_held(map);
18382 	assert(!map->has_corpse_footprint);
18383 	assert(curr_s_offset >= map_entry->vme_start);
18384 	assert(curr_s_offset < map_entry->vme_end);
18385 
18386 	object = VME_OBJECT(map_entry);
18387 	if (object == VM_OBJECT_NULL) {
18388 		*disposition_p = 0;
18389 		return;
18390 	}
18391 
18392 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
18393 
18394 	pmap_disp = 0;
18395 	if (object == VM_OBJECT_NULL) {
18396 		/* nothing mapped here: no need to ask */
18397 		*disposition_p = 0;
18398 		return;
18399 	} else if (map_entry->is_sub_map &&
18400 	    !map_entry->use_pmap) {
18401 		/* nested pmap: no footprint */
18402 		*disposition_p = 0;
18403 		return;
18404 	}
18405 
18406 	/*
18407 	 * Query the pmap.
18408 	 */
18409 	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
18410 
18411 	/*
18412 	 * Compute this page's disposition.
18413 	 */
18414 	disposition = 0;
18415 
18416 	/* deal with "alternate accounting" first */
18417 	if (!map_entry->is_sub_map &&
18418 	    object->vo_no_footprint) {
18419 		/* does not count in footprint */
18420 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18421 	} else if (!map_entry->is_sub_map &&
18422 	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
18423 	    (object->purgable == VM_PURGABLE_DENY &&
18424 	    object->vo_ledger_tag)) &&
18425 	    VM_OBJECT_OWNER(object) != NULL &&
18426 	    VM_OBJECT_OWNER(object)->map == map) {
18427 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18428 		if ((((curr_s_offset
18429 		    - map_entry->vme_start
18430 		    + VME_OFFSET(map_entry))
18431 		    / effective_page_size) <
18432 		    (object->resident_page_count +
18433 		    vm_compressor_pager_get_count(object->pager)))) {
18434 			/*
18435 			 * Non-volatile purgeable object owned
18436 			 * by this task: report the first
18437 			 * "#resident + #compressed" pages as
18438 			 * "resident" (to show that they
18439 			 * contribute to the footprint) but not
18440 			 * "dirty" (to avoid double-counting
18441 			 * with the fake "non-volatile" region
18442 			 * we'll report at the end of the
18443 			 * address space to account for all
18444 			 * (mapped or not) non-volatile memory
18445 			 * owned by this task.
18446 			 */
18447 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18448 		}
18449 	} else if (!map_entry->is_sub_map &&
18450 	    (object->purgable == VM_PURGABLE_VOLATILE ||
18451 	    object->purgable == VM_PURGABLE_EMPTY) &&
18452 	    VM_OBJECT_OWNER(object) != NULL &&
18453 	    VM_OBJECT_OWNER(object)->map == map) {
18454 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18455 		if ((((curr_s_offset
18456 		    - map_entry->vme_start
18457 		    + VME_OFFSET(map_entry))
18458 		    / effective_page_size) <
18459 		    object->wired_page_count)) {
18460 			/*
18461 			 * Volatile|empty purgeable object owned
18462 			 * by this task: report the first
18463 			 * "#wired" pages as "resident" (to
18464 			 * show that they contribute to the
18465 			 * footprint) but not "dirty" (to avoid
18466 			 * double-counting with the fake
18467 			 * "non-volatile" region we'll report
18468 			 * at the end of the address space to
18469 			 * account for all (mapped or not)
18470 			 * non-volatile memory owned by this
18471 			 * task.
18472 			 */
18473 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18474 		}
18475 	} else if (!map_entry->is_sub_map &&
18476 	    map_entry->iokit_acct &&
18477 	    object->internal &&
18478 	    object->purgable == VM_PURGABLE_DENY) {
18479 		/*
18480 		 * Non-purgeable IOKit memory: phys_footprint
18481 		 * includes the entire virtual mapping.
18482 		 */
18483 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18484 		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18485 		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
18486 	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
18487 	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
18488 		/* alternate accounting */
18489 #if (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG)
18490 		if (map->pmap->footprint_was_suspended) {
18491 			/*
18492 			 * The assertion below can fail if dyld
18493 			 * suspended footprint accounting
18494 			 * while doing some adjustments to
18495 			 * this page;  the mapping would say
18496 			 * "use pmap accounting" but the page
18497 			 * would be marked "alternate
18498 			 * accounting".
18499 			 */
18500 		} else
18501 #endif /* (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG) */
18502 		{
18503 			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18504 		}
18505 		disposition = 0;
18506 	} else {
18507 		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
18508 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18509 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18510 			disposition |= VM_PAGE_QUERY_PAGE_REF;
18511 			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
18512 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
18513 			} else {
18514 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
18515 			}
18516 			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
18517 				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
18518 			}
18519 		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
18520 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18521 			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
18522 		}
18523 	}
18524 
18525 	*disposition_p = disposition;
18526 }
18527 
18528 kern_return_t
vm_map_page_query_internal(vm_map_t target_map,vm_map_offset_t offset,int * disposition,int * ref_count)18529 vm_map_page_query_internal(
18530 	vm_map_t        target_map,
18531 	vm_map_offset_t offset,
18532 	int             *disposition,
18533 	int             *ref_count)
18534 {
18535 	kern_return_t                   kr;
18536 	vm_page_info_basic_data_t       info;
18537 	mach_msg_type_number_t          count;
18538 
18539 	count = VM_PAGE_INFO_BASIC_COUNT;
18540 	kr = vm_map_page_info(target_map,
18541 	    offset,
18542 	    VM_PAGE_INFO_BASIC,
18543 	    (vm_page_info_t) &info,
18544 	    &count);
18545 	if (kr == KERN_SUCCESS) {
18546 		*disposition = info.disposition;
18547 		*ref_count = info.ref_count;
18548 	} else {
18549 		*disposition = 0;
18550 		*ref_count = 0;
18551 	}
18552 
18553 	return kr;
18554 }
18555 
18556 kern_return_t
vm_map_page_info(vm_map_t map,vm_map_offset_t offset,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)18557 vm_map_page_info(
18558 	vm_map_t                map,
18559 	vm_map_offset_t         offset,
18560 	vm_page_info_flavor_t   flavor,
18561 	vm_page_info_t          info,
18562 	mach_msg_type_number_t  *count)
18563 {
18564 	return vm_map_page_range_info_internal(map,
18565 	           offset, /* start of range */
18566 	           (offset + 1), /* this will get rounded in the call to the page boundary */
18567 	           (int)-1, /* effective_page_shift: unspecified */
18568 	           flavor,
18569 	           info,
18570 	           count);
18571 }
18572 
18573 kern_return_t
vm_map_page_range_info_internal(vm_map_t map,vm_map_offset_t start_offset,vm_map_offset_t end_offset,int effective_page_shift,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)18574 vm_map_page_range_info_internal(
18575 	vm_map_t                map,
18576 	vm_map_offset_t         start_offset,
18577 	vm_map_offset_t         end_offset,
18578 	int                     effective_page_shift,
18579 	vm_page_info_flavor_t   flavor,
18580 	vm_page_info_t          info,
18581 	mach_msg_type_number_t  *count)
18582 {
18583 	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
18584 	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
18585 	vm_page_t               m = VM_PAGE_NULL;
18586 	kern_return_t           retval = KERN_SUCCESS;
18587 	int                     disposition = 0;
18588 	int                     ref_count = 0;
18589 	int                     depth = 0, info_idx = 0;
18590 	vm_page_info_basic_t    basic_info = 0;
18591 	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
18592 	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
18593 	boolean_t               do_region_footprint;
18594 	ledger_amount_t         ledger_resident, ledger_compressed;
18595 	int                     effective_page_size;
18596 	vm_map_offset_t         effective_page_mask;
18597 
18598 	switch (flavor) {
18599 	case VM_PAGE_INFO_BASIC:
18600 		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
18601 			/*
18602 			 * The "vm_page_info_basic_data" structure was not
18603 			 * properly padded, so allow the size to be off by
18604 			 * one to maintain backwards binary compatibility...
18605 			 */
18606 			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
18607 				return KERN_INVALID_ARGUMENT;
18608 			}
18609 		}
18610 		break;
18611 	default:
18612 		return KERN_INVALID_ARGUMENT;
18613 	}
18614 
18615 	if (effective_page_shift == -1) {
18616 		effective_page_shift = vm_self_region_page_shift_safely(map);
18617 		if (effective_page_shift == -1) {
18618 			return KERN_INVALID_ARGUMENT;
18619 		}
18620 	}
18621 	effective_page_size = (1 << effective_page_shift);
18622 	effective_page_mask = effective_page_size - 1;
18623 
18624 	do_region_footprint = task_self_region_footprint();
18625 	disposition = 0;
18626 	ref_count = 0;
18627 	depth = 0;
18628 	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
18629 	retval = KERN_SUCCESS;
18630 
18631 	offset_in_page = start_offset & effective_page_mask;
18632 	start = vm_map_trunc_page(start_offset, effective_page_mask);
18633 	end = vm_map_round_page(end_offset, effective_page_mask);
18634 
18635 	if (end < start) {
18636 		return KERN_INVALID_ARGUMENT;
18637 	}
18638 
18639 	assert((end - start) <= MAX_PAGE_RANGE_QUERY);
18640 
18641 	vm_map_lock_read(map);
18642 
18643 	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
18644 
18645 	for (curr_s_offset = start; curr_s_offset < end;) {
18646 		/*
18647 		 * New lookup needs reset of these variables.
18648 		 */
18649 		curr_object = object = VM_OBJECT_NULL;
18650 		offset_in_object = 0;
18651 		ref_count = 0;
18652 		depth = 0;
18653 
18654 		if (do_region_footprint &&
18655 		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
18656 			/*
18657 			 * Request for "footprint" info about a page beyond
18658 			 * the end of address space: this must be for
18659 			 * the fake region vm_map_region_recurse_64()
18660 			 * reported to account for non-volatile purgeable
18661 			 * memory owned by this task.
18662 			 */
18663 			disposition = 0;
18664 
18665 			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
18666 			    (unsigned) ledger_compressed) {
18667 				/*
18668 				 * We haven't reported all the "non-volatile
18669 				 * compressed" pages yet, so report this fake
18670 				 * page as "compressed".
18671 				 */
18672 				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
18673 			} else {
18674 				/*
18675 				 * We've reported all the non-volatile
18676 				 * compressed page but not all the non-volatile
18677 				 * pages , so report this fake page as
18678 				 * "resident dirty".
18679 				 */
18680 				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18681 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
18682 				disposition |= VM_PAGE_QUERY_PAGE_REF;
18683 			}
18684 			switch (flavor) {
18685 			case VM_PAGE_INFO_BASIC:
18686 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
18687 				basic_info->disposition = disposition;
18688 				basic_info->ref_count = 1;
18689 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
18690 				basic_info->offset = 0;
18691 				basic_info->depth = 0;
18692 
18693 				info_idx++;
18694 				break;
18695 			}
18696 			curr_s_offset += effective_page_size;
18697 			continue;
18698 		}
18699 
18700 		/*
18701 		 * First, find the map entry covering "curr_s_offset", going down
18702 		 * submaps if necessary.
18703 		 */
18704 		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
18705 			/* no entry -> no object -> no page */
18706 
18707 			if (curr_s_offset < vm_map_min(map)) {
18708 				/*
18709 				 * Illegal address that falls below map min.
18710 				 */
18711 				curr_e_offset = MIN(end, vm_map_min(map));
18712 			} else if (curr_s_offset >= vm_map_max(map)) {
18713 				/*
18714 				 * Illegal address that falls on/after map max.
18715 				 */
18716 				curr_e_offset = end;
18717 			} else if (map_entry == vm_map_to_entry(map)) {
18718 				/*
18719 				 * Hit a hole.
18720 				 */
18721 				if (map_entry->vme_next == vm_map_to_entry(map)) {
18722 					/*
18723 					 * Empty map.
18724 					 */
18725 					curr_e_offset = MIN(map->max_offset, end);
18726 				} else {
18727 					/*
18728 					 * Hole at start of the map.
18729 					 */
18730 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
18731 				}
18732 			} else {
18733 				if (map_entry->vme_next == vm_map_to_entry(map)) {
18734 					/*
18735 					 * Hole at the end of the map.
18736 					 */
18737 					curr_e_offset = MIN(map->max_offset, end);
18738 				} else {
18739 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
18740 				}
18741 			}
18742 
18743 			assert(curr_e_offset >= curr_s_offset);
18744 
18745 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
18746 
18747 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
18748 
18749 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
18750 
18751 			curr_s_offset = curr_e_offset;
18752 
18753 			info_idx += num_pages;
18754 
18755 			continue;
18756 		}
18757 
18758 		/* compute offset from this map entry's start */
18759 		offset_in_object = curr_s_offset - map_entry->vme_start;
18760 
18761 		/* compute offset into this map entry's object (or submap) */
18762 		offset_in_object += VME_OFFSET(map_entry);
18763 
18764 		if (map_entry->is_sub_map) {
18765 			vm_map_t sub_map = VM_MAP_NULL;
18766 			vm_page_info_t submap_info = 0;
18767 			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
18768 
18769 			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
18770 
18771 			submap_s_offset = offset_in_object;
18772 			submap_e_offset = submap_s_offset + range_len;
18773 
18774 			sub_map = VME_SUBMAP(map_entry);
18775 
18776 			vm_map_reference(sub_map);
18777 			vm_map_unlock_read(map);
18778 
18779 			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
18780 
18781 			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
18782 			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
18783 
18784 			retval = vm_map_page_range_info_internal(sub_map,
18785 			    submap_s_offset,
18786 			    submap_e_offset,
18787 			    effective_page_shift,
18788 			    VM_PAGE_INFO_BASIC,
18789 			    (vm_page_info_t) submap_info,
18790 			    count);
18791 
18792 			assert(retval == KERN_SUCCESS);
18793 
18794 			vm_map_lock_read(map);
18795 			vm_map_deallocate(sub_map);
18796 
18797 			/* Move the "info" index by the number of pages we inspected.*/
18798 			info_idx += range_len >> effective_page_shift;
18799 
18800 			/* Move our current offset by the size of the range we inspected.*/
18801 			curr_s_offset += range_len;
18802 
18803 			continue;
18804 		}
18805 
18806 		object = VME_OBJECT(map_entry);
18807 
18808 		if (object == VM_OBJECT_NULL) {
18809 			/*
18810 			 * We don't have an object here and, hence,
18811 			 * no pages to inspect. We'll fill up the
18812 			 * info structure appropriately.
18813 			 */
18814 
18815 			curr_e_offset = MIN(map_entry->vme_end, end);
18816 
18817 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
18818 
18819 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
18820 
18821 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
18822 
18823 			curr_s_offset = curr_e_offset;
18824 
18825 			info_idx += num_pages;
18826 
18827 			continue;
18828 		}
18829 
18830 		if (do_region_footprint) {
18831 			disposition = 0;
18832 			if (map->has_corpse_footprint) {
18833 				/*
18834 				 * Query the page info data we saved
18835 				 * while forking the corpse.
18836 				 */
18837 				vm_map_corpse_footprint_query_page_info(
18838 					map,
18839 					curr_s_offset,
18840 					&disposition);
18841 			} else {
18842 				/*
18843 				 * Query the live pmap for footprint info
18844 				 * about this page.
18845 				 */
18846 				vm_map_footprint_query_page_info(
18847 					map,
18848 					map_entry,
18849 					curr_s_offset,
18850 					&disposition);
18851 			}
18852 			switch (flavor) {
18853 			case VM_PAGE_INFO_BASIC:
18854 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
18855 				basic_info->disposition = disposition;
18856 				basic_info->ref_count = 1;
18857 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
18858 				basic_info->offset = 0;
18859 				basic_info->depth = 0;
18860 
18861 				info_idx++;
18862 				break;
18863 			}
18864 			curr_s_offset += effective_page_size;
18865 			continue;
18866 		}
18867 
18868 		vm_object_reference(object);
18869 		/*
18870 		 * Shared mode -- so we can allow other readers
18871 		 * to grab the lock too.
18872 		 */
18873 		vm_object_lock_shared(object);
18874 
18875 		curr_e_offset = MIN(map_entry->vme_end, end);
18876 
18877 		vm_map_unlock_read(map);
18878 
18879 		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
18880 
18881 		curr_object = object;
18882 
18883 		for (; curr_s_offset < curr_e_offset;) {
18884 			if (object == curr_object) {
18885 				ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
18886 			} else {
18887 				ref_count = curr_object->ref_count;
18888 			}
18889 
18890 			curr_offset_in_object = offset_in_object;
18891 
18892 			for (;;) {
18893 				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
18894 
18895 				if (m != VM_PAGE_NULL) {
18896 					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18897 					break;
18898 				} else {
18899 					if (curr_object->internal &&
18900 					    curr_object->alive &&
18901 					    !curr_object->terminating &&
18902 					    curr_object->pager_ready) {
18903 						if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object))
18904 						    == VM_EXTERNAL_STATE_EXISTS) {
18905 							/* the pager has that page */
18906 							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
18907 							break;
18908 						}
18909 					}
18910 
18911 					/*
18912 					 * Go down the VM object shadow chain until we find the page
18913 					 * we're looking for.
18914 					 */
18915 
18916 					if (curr_object->shadow != VM_OBJECT_NULL) {
18917 						vm_object_t shadow = VM_OBJECT_NULL;
18918 
18919 						curr_offset_in_object += curr_object->vo_shadow_offset;
18920 						shadow = curr_object->shadow;
18921 
18922 						vm_object_lock_shared(shadow);
18923 						vm_object_unlock(curr_object);
18924 
18925 						curr_object = shadow;
18926 						depth++;
18927 						continue;
18928 					} else {
18929 						break;
18930 					}
18931 				}
18932 			}
18933 
18934 			/* The ref_count is not strictly accurate, it measures the number   */
18935 			/* of entities holding a ref on the object, they may not be mapping */
18936 			/* the object or may not be mapping the section holding the         */
18937 			/* target page but its still a ball park number and though an over- */
18938 			/* count, it picks up the copy-on-write cases                       */
18939 
18940 			/* We could also get a picture of page sharing from pmap_attributes */
18941 			/* but this would under count as only faulted-in mappings would     */
18942 			/* show up.							    */
18943 
18944 			if ((curr_object == object) && curr_object->shadow) {
18945 				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
18946 			}
18947 
18948 			if (!curr_object->internal) {
18949 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
18950 			}
18951 
18952 			if (m != VM_PAGE_NULL) {
18953 				if (m->vmp_fictitious) {
18954 					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
18955 				} else {
18956 					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
18957 						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
18958 					}
18959 
18960 					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
18961 						disposition |= VM_PAGE_QUERY_PAGE_REF;
18962 					}
18963 
18964 					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
18965 						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
18966 					}
18967 
18968 					/*
18969 					 * XXX TODO4K:
18970 					 * when this routine deals with 4k
18971 					 * pages, check the appropriate CS bit
18972 					 * here.
18973 					 */
18974 					if (m->vmp_cs_validated) {
18975 						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
18976 					}
18977 					if (m->vmp_cs_tainted) {
18978 						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
18979 					}
18980 					if (m->vmp_cs_nx) {
18981 						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
18982 					}
18983 					if (m->vmp_reusable || curr_object->all_reusable) {
18984 						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
18985 					}
18986 				}
18987 			}
18988 
18989 			switch (flavor) {
18990 			case VM_PAGE_INFO_BASIC:
18991 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
18992 				basic_info->disposition = disposition;
18993 				basic_info->ref_count = ref_count;
18994 				basic_info->object_id = (vm_object_id_t) (uintptr_t)
18995 				    VM_KERNEL_ADDRPERM(curr_object);
18996 				basic_info->offset =
18997 				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
18998 				basic_info->depth = depth;
18999 
19000 				info_idx++;
19001 				break;
19002 			}
19003 
19004 			disposition = 0;
19005 			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
19006 
19007 			/*
19008 			 * Move to next offset in the range and in our object.
19009 			 */
19010 			curr_s_offset += effective_page_size;
19011 			offset_in_object += effective_page_size;
19012 			curr_offset_in_object = offset_in_object;
19013 
19014 			if (curr_object != object) {
19015 				vm_object_unlock(curr_object);
19016 
19017 				curr_object = object;
19018 
19019 				vm_object_lock_shared(curr_object);
19020 			} else {
19021 				vm_object_lock_yield_shared(curr_object);
19022 			}
19023 		}
19024 
19025 		vm_object_unlock(curr_object);
19026 		vm_object_deallocate(curr_object);
19027 
19028 		vm_map_lock_read(map);
19029 	}
19030 
19031 	vm_map_unlock_read(map);
19032 	return retval;
19033 }
19034 
19035 /*
19036  *	vm_map_msync
19037  *
19038  *	Synchronises the memory range specified with its backing store
19039  *	image by either flushing or cleaning the contents to the appropriate
19040  *	memory manager engaging in a memory object synchronize dialog with
19041  *	the manager.  The client doesn't return until the manager issues
19042  *	m_o_s_completed message.  MIG Magically converts user task parameter
19043  *	to the task's address map.
19044  *
19045  *	interpretation of sync_flags
19046  *	VM_SYNC_INVALIDATE	- discard pages, only return precious
19047  *				  pages to manager.
19048  *
19049  *	VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
19050  *				- discard pages, write dirty or precious
19051  *				  pages back to memory manager.
19052  *
19053  *	VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
19054  *				- write dirty or precious pages back to
19055  *				  the memory manager.
19056  *
19057  *	VM_SYNC_CONTIGUOUS	- does everything normally, but if there
19058  *				  is a hole in the region, and we would
19059  *				  have returned KERN_SUCCESS, return
19060  *				  KERN_INVALID_ADDRESS instead.
19061  *
19062  *	NOTE
19063  *	The memory object attributes have not yet been implemented, this
19064  *	function will have to deal with the invalidate attribute
19065  *
19066  *	RETURNS
19067  *	KERN_INVALID_TASK		Bad task parameter
19068  *	KERN_INVALID_ARGUMENT		both sync and async were specified.
19069  *	KERN_SUCCESS			The usual.
19070  *	KERN_INVALID_ADDRESS		There was a hole in the region.
19071  */
19072 
19073 kern_return_t
vm_map_msync(vm_map_t map,vm_map_address_t address,vm_map_size_t size,vm_sync_t sync_flags)19074 vm_map_msync(
19075 	vm_map_t                map,
19076 	vm_map_address_t        address,
19077 	vm_map_size_t           size,
19078 	vm_sync_t               sync_flags)
19079 {
19080 	vm_map_entry_t          entry;
19081 	vm_map_size_t           amount_left;
19082 	vm_object_offset_t      offset;
19083 	vm_object_offset_t      start_offset, end_offset;
19084 	boolean_t               do_sync_req;
19085 	boolean_t               had_hole = FALSE;
19086 	vm_map_offset_t         pmap_offset;
19087 
19088 	if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
19089 	    (sync_flags & VM_SYNC_SYNCHRONOUS)) {
19090 		return KERN_INVALID_ARGUMENT;
19091 	}
19092 
19093 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19094 		DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
19095 	}
19096 
19097 	/*
19098 	 * align address and size on page boundaries
19099 	 */
19100 	size = (vm_map_round_page(address + size,
19101 	    VM_MAP_PAGE_MASK(map)) -
19102 	    vm_map_trunc_page(address,
19103 	    VM_MAP_PAGE_MASK(map)));
19104 	address = vm_map_trunc_page(address,
19105 	    VM_MAP_PAGE_MASK(map));
19106 
19107 	if (map == VM_MAP_NULL) {
19108 		return KERN_INVALID_TASK;
19109 	}
19110 
19111 	if (size == 0) {
19112 		return KERN_SUCCESS;
19113 	}
19114 
19115 	amount_left = size;
19116 
19117 	while (amount_left > 0) {
19118 		vm_object_size_t        flush_size;
19119 		vm_object_t             object;
19120 
19121 		vm_map_lock(map);
19122 		if (!vm_map_lookup_entry(map,
19123 		    address,
19124 		    &entry)) {
19125 			vm_map_size_t   skip;
19126 
19127 			/*
19128 			 * hole in the address map.
19129 			 */
19130 			had_hole = TRUE;
19131 
19132 			if (sync_flags & VM_SYNC_KILLPAGES) {
19133 				/*
19134 				 * For VM_SYNC_KILLPAGES, there should be
19135 				 * no holes in the range, since we couldn't
19136 				 * prevent someone else from allocating in
19137 				 * that hole and we wouldn't want to "kill"
19138 				 * their pages.
19139 				 */
19140 				vm_map_unlock(map);
19141 				break;
19142 			}
19143 
19144 			/*
19145 			 * Check for empty map.
19146 			 */
19147 			if (entry == vm_map_to_entry(map) &&
19148 			    entry->vme_next == entry) {
19149 				vm_map_unlock(map);
19150 				break;
19151 			}
19152 			/*
19153 			 * Check that we don't wrap and that
19154 			 * we have at least one real map entry.
19155 			 */
19156 			if ((map->hdr.nentries == 0) ||
19157 			    (entry->vme_next->vme_start < address)) {
19158 				vm_map_unlock(map);
19159 				break;
19160 			}
19161 			/*
19162 			 * Move up to the next entry if needed
19163 			 */
19164 			skip = (entry->vme_next->vme_start - address);
19165 			if (skip >= amount_left) {
19166 				amount_left = 0;
19167 			} else {
19168 				amount_left -= skip;
19169 			}
19170 			address = entry->vme_next->vme_start;
19171 			vm_map_unlock(map);
19172 			continue;
19173 		}
19174 
19175 		offset = address - entry->vme_start;
19176 		pmap_offset = address;
19177 
19178 		/*
19179 		 * do we have more to flush than is contained in this
19180 		 * entry ?
19181 		 */
19182 		if (amount_left + entry->vme_start + offset > entry->vme_end) {
19183 			flush_size = entry->vme_end -
19184 			    (entry->vme_start + offset);
19185 		} else {
19186 			flush_size = amount_left;
19187 		}
19188 		amount_left -= flush_size;
19189 		address += flush_size;
19190 
19191 		if (entry->is_sub_map == TRUE) {
19192 			vm_map_t        local_map;
19193 			vm_map_offset_t local_offset;
19194 
19195 			local_map = VME_SUBMAP(entry);
19196 			local_offset = VME_OFFSET(entry);
19197 			vm_map_reference(local_map);
19198 			vm_map_unlock(map);
19199 			if (vm_map_msync(
19200 				    local_map,
19201 				    local_offset,
19202 				    flush_size,
19203 				    sync_flags) == KERN_INVALID_ADDRESS) {
19204 				had_hole = TRUE;
19205 			}
19206 			vm_map_deallocate(local_map);
19207 			continue;
19208 		}
19209 		object = VME_OBJECT(entry);
19210 
19211 		/*
19212 		 * We can't sync this object if the object has not been
19213 		 * created yet
19214 		 */
19215 		if (object == VM_OBJECT_NULL) {
19216 			vm_map_unlock(map);
19217 			continue;
19218 		}
19219 		offset += VME_OFFSET(entry);
19220 
19221 		vm_object_lock(object);
19222 
19223 		if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
19224 			int kill_pages = 0;
19225 			boolean_t reusable_pages = FALSE;
19226 
19227 			if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19228 				/*
19229 				 * This is a destructive operation and so we
19230 				 * err on the side of limiting the range of
19231 				 * the operation.
19232 				 */
19233 				start_offset = vm_object_round_page(offset);
19234 				end_offset = vm_object_trunc_page(offset + flush_size);
19235 
19236 				if (end_offset <= start_offset) {
19237 					vm_object_unlock(object);
19238 					vm_map_unlock(map);
19239 					continue;
19240 				}
19241 
19242 				pmap_offset += start_offset - offset;
19243 			} else {
19244 				start_offset = offset;
19245 				end_offset = offset + flush_size;
19246 			}
19247 
19248 			if (sync_flags & VM_SYNC_KILLPAGES) {
19249 				if (((object->ref_count == 1) ||
19250 				    ((object->copy_strategy !=
19251 				    MEMORY_OBJECT_COPY_SYMMETRIC) &&
19252 				    (object->copy == VM_OBJECT_NULL))) &&
19253 				    (object->shadow == VM_OBJECT_NULL)) {
19254 					if (object->ref_count != 1) {
19255 						vm_page_stats_reusable.free_shared++;
19256 					}
19257 					kill_pages = 1;
19258 				} else {
19259 					kill_pages = -1;
19260 				}
19261 			}
19262 			if (kill_pages != -1) {
19263 				vm_object_deactivate_pages(
19264 					object,
19265 					start_offset,
19266 					(vm_object_size_t) (end_offset - start_offset),
19267 					kill_pages,
19268 					reusable_pages,
19269 					map->pmap,
19270 					pmap_offset);
19271 			}
19272 			vm_object_unlock(object);
19273 			vm_map_unlock(map);
19274 			continue;
19275 		}
19276 		/*
19277 		 * We can't sync this object if there isn't a pager.
19278 		 * Don't bother to sync internal objects, since there can't
19279 		 * be any "permanent" storage for these objects anyway.
19280 		 */
19281 		if ((object->pager == MEMORY_OBJECT_NULL) ||
19282 		    (object->internal) || (object->private)) {
19283 			vm_object_unlock(object);
19284 			vm_map_unlock(map);
19285 			continue;
19286 		}
19287 		/*
19288 		 * keep reference on the object until syncing is done
19289 		 */
19290 		vm_object_reference_locked(object);
19291 		vm_object_unlock(object);
19292 
19293 		vm_map_unlock(map);
19294 
19295 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19296 			start_offset = vm_object_trunc_page(offset);
19297 			end_offset = vm_object_round_page(offset + flush_size);
19298 		} else {
19299 			start_offset = offset;
19300 			end_offset = offset + flush_size;
19301 		}
19302 
19303 		do_sync_req = vm_object_sync(object,
19304 		    start_offset,
19305 		    (end_offset - start_offset),
19306 		    sync_flags & VM_SYNC_INVALIDATE,
19307 		    ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
19308 		    (sync_flags & VM_SYNC_ASYNCHRONOUS)),
19309 		    sync_flags & VM_SYNC_SYNCHRONOUS);
19310 
19311 		if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
19312 			/*
19313 			 * clear out the clustering and read-ahead hints
19314 			 */
19315 			vm_object_lock(object);
19316 
19317 			object->pages_created = 0;
19318 			object->pages_used = 0;
19319 			object->sequential = 0;
19320 			object->last_alloc = 0;
19321 
19322 			vm_object_unlock(object);
19323 		}
19324 		vm_object_deallocate(object);
19325 	} /* while */
19326 
19327 	/* for proper msync() behaviour */
19328 	if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
19329 		return KERN_INVALID_ADDRESS;
19330 	}
19331 
19332 	return KERN_SUCCESS;
19333 }/* vm_msync */
19334 
19335 void
vm_named_entry_associate_vm_object(vm_named_entry_t named_entry,vm_object_t object,vm_object_offset_t offset,vm_object_size_t size,vm_prot_t prot)19336 vm_named_entry_associate_vm_object(
19337 	vm_named_entry_t        named_entry,
19338 	vm_object_t             object,
19339 	vm_object_offset_t      offset,
19340 	vm_object_size_t        size,
19341 	vm_prot_t               prot)
19342 {
19343 	vm_map_copy_t copy;
19344 	vm_map_entry_t copy_entry;
19345 
19346 	assert(!named_entry->is_sub_map);
19347 	assert(!named_entry->is_copy);
19348 	assert(!named_entry->is_object);
19349 	assert(!named_entry->internal);
19350 	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
19351 
19352 	copy = vm_map_copy_allocate();
19353 	copy->type = VM_MAP_COPY_ENTRY_LIST;
19354 	copy->offset = offset;
19355 	copy->size = size;
19356 	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
19357 	vm_map_store_init(&copy->cpy_hdr);
19358 
19359 	copy_entry = vm_map_copy_entry_create(copy);
19360 	copy_entry->protection = prot;
19361 	copy_entry->max_protection = prot;
19362 	copy_entry->use_pmap = TRUE;
19363 	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
19364 	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
19365 	VME_OBJECT_SET(copy_entry, object);
19366 	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
19367 	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
19368 
19369 	named_entry->backing.copy = copy;
19370 	named_entry->is_object = TRUE;
19371 	if (object->internal) {
19372 		named_entry->internal = TRUE;
19373 	}
19374 
19375 	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
19376 	    named_entry, copy, object, offset, size, prot);
19377 }
19378 
19379 vm_object_t
vm_named_entry_to_vm_object(vm_named_entry_t named_entry)19380 vm_named_entry_to_vm_object(
19381 	vm_named_entry_t named_entry)
19382 {
19383 	vm_map_copy_t   copy;
19384 	vm_map_entry_t  copy_entry;
19385 	vm_object_t     object;
19386 
19387 	assert(!named_entry->is_sub_map);
19388 	assert(!named_entry->is_copy);
19389 	assert(named_entry->is_object);
19390 	copy = named_entry->backing.copy;
19391 	assert(copy != VM_MAP_COPY_NULL);
19392 	/*
19393 	 * Assert that the vm_map_copy is coming from the right
19394 	 * zone and hasn't been forged
19395 	 */
19396 	vm_map_copy_require(copy);
19397 	assert(copy->cpy_hdr.nentries == 1);
19398 	copy_entry = vm_map_copy_first_entry(copy);
19399 	assert(!copy_entry->is_sub_map);
19400 	object = VME_OBJECT(copy_entry);
19401 
19402 	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
19403 
19404 	return object;
19405 }
19406 
19407 /*
19408  *	Routine:	convert_port_entry_to_map
19409  *	Purpose:
19410  *		Convert from a port specifying an entry or a task
19411  *		to a map. Doesn't consume the port ref; produces a map ref,
19412  *		which may be null.  Unlike convert_port_to_map, the
19413  *		port may be task or a named entry backed.
19414  *	Conditions:
19415  *		Nothing locked.
19416  */
19417 
19418 vm_map_t
convert_port_entry_to_map(ipc_port_t port)19419 convert_port_entry_to_map(
19420 	ipc_port_t      port)
19421 {
19422 	vm_map_t map = VM_MAP_NULL;
19423 	vm_named_entry_t named_entry;
19424 
19425 	if (!IP_VALID(port)) {
19426 		return VM_MAP_NULL;
19427 	}
19428 
19429 	if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
19430 		return convert_port_to_map(port);
19431 	}
19432 
19433 	named_entry = mach_memory_entry_from_port(port);
19434 
19435 	if ((named_entry->is_sub_map) &&
19436 	    (named_entry->protection & VM_PROT_WRITE)) {
19437 		map = named_entry->backing.map;
19438 		if (map->pmap != PMAP_NULL) {
19439 			if (map->pmap == kernel_pmap) {
19440 				panic("userspace has access "
19441 				    "to a kernel map %p", map);
19442 			}
19443 			pmap_require(map->pmap);
19444 		}
19445 		vm_map_reference(map);
19446 	}
19447 
19448 	return map;
19449 }
19450 
19451 /*
19452  * Export routines to other components for the things we access locally through
19453  * macros.
19454  */
19455 #undef current_map
/* Out-of-line export of the current_map() macro (see #undef above). */
vm_map_t
current_map(void)
{
	return current_map_fast();
}
19461 
19462 /*
19463  *	vm_map_reference:
19464  *
19465  *	Takes a reference on the specified map.
19466  */
19467 void
vm_map_reference(vm_map_t map)19468 vm_map_reference(
19469 	vm_map_t        map)
19470 {
19471 	if (__probable(map != VM_MAP_NULL)) {
19472 		vm_map_require(map);
19473 		os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
19474 	}
19475 }
19476 
19477 /*
19478  *	vm_map_deallocate:
19479  *
19480  *	Removes a reference from the specified map,
19481  *	destroying it if no references remain.
19482  *	The map should not be locked.
19483  */
19484 void
vm_map_deallocate(vm_map_t map)19485 vm_map_deallocate(
19486 	vm_map_t        map)
19487 {
19488 	if (__probable(map != VM_MAP_NULL)) {
19489 		vm_map_require(map);
19490 		if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
19491 			vm_map_destroy(map);
19492 		}
19493 	}
19494 }
19495 
/* Release an inspect-only handle; it shares the underlying vm_map_t refcount. */
void
vm_map_inspect_deallocate(
	vm_map_inspect_t      map)
{
	vm_map_deallocate((vm_map_t)map);
}
19502 
/* Release a read-only handle; it shares the underlying vm_map_t refcount. */
void
vm_map_read_deallocate(
	vm_map_read_t      map)
{
	vm_map_deallocate((vm_map_t)map);
}
19509 
19510 
19511 void
vm_map_disable_NX(vm_map_t map)19512 vm_map_disable_NX(vm_map_t map)
19513 {
19514 	if (map == NULL) {
19515 		return;
19516 	}
19517 	if (map->pmap == NULL) {
19518 		return;
19519 	}
19520 
19521 	pmap_disable_NX(map->pmap);
19522 }
19523 
19524 void
vm_map_disallow_data_exec(vm_map_t map)19525 vm_map_disallow_data_exec(vm_map_t map)
19526 {
19527 	if (map == NULL) {
19528 		return;
19529 	}
19530 
19531 	map->map_disallow_data_exec = TRUE;
19532 }
19533 
19534 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
19535  * more descriptive.
19536  */
/* Cap the map's address space at the 32-bit maximum for this platform. */
void
vm_map_set_32bit(vm_map_t map)
{
#if defined(__arm__) || defined(__arm64__)
	/* On ARM the pmap layer supplies the device-specific 32-bit VA limit. */
	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
19546 
19547 
/* Cap the map's address space at the 64-bit maximum for this platform. */
void
vm_map_set_64bit(vm_map_t map)
{
#if defined(__arm__) || defined(__arm64__)
	/* On ARM the pmap layer supplies the device-specific 64-bit VA limit. */
	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
#endif
}
19557 
19558 /*
19559  * Expand the maximum size of an existing map to the maximum supported.
19560  */
void
vm_map_set_jumbo(vm_map_t map)
{
#if defined (__arm64__) && !defined(CONFIG_ARROW)
	/* Request the largest address; vm_map_set_max_addr() clamps to the
	 * pmap-supported jumbo maximum. */
	vm_map_set_max_addr(map, ~0);
#else /* arm64 */
	(void) map;
#endif
}
19570 
19571 /*
19572  * This map has a JIT entitlement
19573  */
void
vm_map_set_jit_entitled(vm_map_t map)
{
#if defined (__arm64__)
	/* Propagate the JIT entitlement down to the pmap layer. */
	pmap_set_jit_entitled(map->pmap);
#else /* arm64 */
	(void) map;
#endif
}
19583 
19584 /*
19585  * Expand the maximum size of an existing map.
19586  */
void
vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
{
#if defined(__arm64__)
	vm_map_offset_t max_supported_offset = 0;
	vm_map_offset_t old_max_offset = map->max_offset;
	/* The pmap layer bounds how far the address space may grow. */
	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);

	new_max_offset = trunc_page(new_max_offset);

	/* The address space cannot be shrunk using this routine. */
	if (old_max_offset >= new_max_offset) {
		return;
	}

	/* Clamp the request to what the pmap supports. */
	if (max_supported_offset < new_max_offset) {
		new_max_offset = max_supported_offset;
	}

	map->max_offset = new_max_offset;

	/*
	 * NOTE(review): the holes list is manipulated without an explicit
	 * map lock here — presumably callers hold it or the map is not yet
	 * shared; confirm against call sites.
	 */
	if (map->holes_list->prev->vme_end == old_max_offset) {
		/*
		 * There is already a hole at the end of the map; simply make it bigger.
		 */
		map->holes_list->prev->vme_end = map->max_offset;
	} else {
		/*
		 * There is no hole at the end, so we need to create a new hole
		 * for the new empty space we're creating.
		 */
		struct vm_map_links *new_hole = zalloc(vm_map_holes_zone);
		new_hole->start = old_max_offset;
		new_hole->end = map->max_offset;
		/* Link the new hole in as the last element of the circular list. */
		new_hole->prev = map->holes_list->prev;
		new_hole->next = (struct vm_map_entry *)map->holes_list;
		map->holes_list->prev->links.next = (struct vm_map_entry *)new_hole;
		map->holes_list->prev = (struct vm_map_entry *)new_hole;
	}
#else
	(void)map;
	(void)new_max_offset;
#endif
}
19631 
/* Return the maximum VA offset for a new map of the given bitness. */
vm_map_offset_t
vm_compute_max_offset(boolean_t is64)
{
#if defined(__arm__) || defined(__arm64__)
	/* On ARM the pmap layer knows the device-specific VA limit. */
	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
19641 
19642 void
vm_map_get_max_aslr_slide_section(vm_map_t map __unused,int64_t * max_sections,int64_t * section_size)19643 vm_map_get_max_aslr_slide_section(
19644 	vm_map_t                map __unused,
19645 	int64_t                 *max_sections,
19646 	int64_t                 *section_size)
19647 {
19648 #if defined(__arm64__)
19649 	*max_sections = 3;
19650 	*section_size = ARM_TT_TWIG_SIZE;
19651 #else
19652 	*max_sections = 1;
19653 	*section_size = 0;
19654 #endif
19655 }
19656 
19657 uint64_t
vm_map_get_max_aslr_slide_pages(vm_map_t map)19658 vm_map_get_max_aslr_slide_pages(vm_map_t map)
19659 {
19660 #if defined(__arm64__)
19661 	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
19662 	 * limited embedded address space; this is also meant to minimize pmap
19663 	 * memory usage on 16KB page systems.
19664 	 */
19665 	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
19666 #else
19667 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
19668 #endif
19669 }
19670 
19671 uint64_t
vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)19672 vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
19673 {
19674 #if defined(__arm64__)
19675 	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
19676 	 * of independent entropy on 16KB page systems.
19677 	 */
19678 	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
19679 #else
19680 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
19681 #endif
19682 }
19683 
#ifndef __arm__
/*
 * A map is considered 64-bit when its maximum offset lies beyond the
 * 32-bit VM_MAX_ADDRESS limit.  (arm provides its own definition.)
 */
boolean_t
vm_map_is_64bit(
	vm_map_t map)
{
	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
}
#endif
19692 
19693 boolean_t
vm_map_has_hard_pagezero(vm_map_t map,vm_map_offset_t pagezero_size)19694 vm_map_has_hard_pagezero(
19695 	vm_map_t        map,
19696 	vm_map_offset_t pagezero_size)
19697 {
19698 	/*
19699 	 * XXX FBDP
19700 	 * We should lock the VM map (for read) here but we can get away
19701 	 * with it for now because there can't really be any race condition:
19702 	 * the VM map's min_offset is changed only when the VM map is created
19703 	 * and when the zero page is established (when the binary gets loaded),
19704 	 * and this routine gets called only when the task terminates and the
19705 	 * VM map is being torn down, and when a new map is created via
19706 	 * load_machfile()/execve().
19707 	 */
19708 	return map->min_offset >= pagezero_size;
19709 }
19710 
19711 /*
19712  * Raise a VM map's maximun offset.
19713  */
19714 kern_return_t
vm_map_raise_max_offset(vm_map_t map,vm_map_offset_t new_max_offset)19715 vm_map_raise_max_offset(
19716 	vm_map_t        map,
19717 	vm_map_offset_t new_max_offset)
19718 {
19719 	kern_return_t   ret;
19720 
19721 	vm_map_lock(map);
19722 	ret = KERN_INVALID_ADDRESS;
19723 
19724 	if (new_max_offset >= map->max_offset) {
19725 		if (!vm_map_is_64bit(map)) {
19726 			if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
19727 				map->max_offset = new_max_offset;
19728 				ret = KERN_SUCCESS;
19729 			}
19730 		} else {
19731 			if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
19732 				map->max_offset = new_max_offset;
19733 				ret = KERN_SUCCESS;
19734 			}
19735 		}
19736 	}
19737 
19738 	vm_map_unlock(map);
19739 	return ret;
19740 }
19741 
19742 
/*
 * Raise a VM map's minimum offset.
 * To strictly enforce "page zero" reservation.
 */
kern_return_t
vm_map_raise_min_offset(
	vm_map_t        map,
	vm_map_offset_t new_min_offset)
{
	vm_map_entry_t  first_entry;

	/* round up so the reserved region covers whole map pages */
	new_min_offset = vm_map_round_page(new_min_offset,
	    VM_MAP_PAGE_MASK(map));

	vm_map_lock(map);

	if (new_min_offset < map->min_offset) {
		/*
		 * Can't move min_offset backwards, as that would expose
		 * a part of the address space that was previously, and for
		 * possibly good reasons, inaccessible.
		 */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}
	if (new_min_offset >= map->max_offset) {
		/* can't go beyond the end of the address space */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	first_entry = vm_map_first_entry(map);
	if (first_entry != vm_map_to_entry(map) &&
	    first_entry->vme_start < new_min_offset) {
		/*
		 * Some memory was already allocated below the new
		 * minimum offset.  It's too late to change it now...
		 */
		vm_map_unlock(map);
		return KERN_NO_SPACE;
	}

	map->min_offset = new_min_offset;

	/* keep the first hole consistent with the new bottom of the map */
	assert(map->holes_list);
	map->holes_list->start = new_min_offset;
	assert(new_min_offset < map->holes_list->end);

	vm_map_unlock(map);

	return KERN_SUCCESS;
}
19795 
19796 /*
19797  * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
19798  * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
19799  * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
19800  * have to reach over to the BSD data structures.
19801  */
19802 
19803 uint64_t vm_map_set_size_limit_count = 0;
19804 kern_return_t
vm_map_set_size_limit(vm_map_t map,uint64_t new_size_limit)19805 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
19806 {
19807 	kern_return_t kr;
19808 
19809 	vm_map_lock(map);
19810 	if (new_size_limit < map->size) {
19811 		/* new limit should not be lower than its current size */
19812 		DTRACE_VM2(vm_map_set_size_limit_fail,
19813 		    vm_map_size_t, map->size,
19814 		    uint64_t, new_size_limit);
19815 		kr = KERN_FAILURE;
19816 	} else if (new_size_limit == map->size_limit) {
19817 		/* no change */
19818 		kr = KERN_SUCCESS;
19819 	} else {
19820 		/* set new limit */
19821 		DTRACE_VM2(vm_map_set_size_limit,
19822 		    vm_map_size_t, map->size,
19823 		    uint64_t, new_size_limit);
19824 		if (new_size_limit != RLIM_INFINITY) {
19825 			vm_map_set_size_limit_count++;
19826 		}
19827 		map->size_limit = new_size_limit;
19828 		kr = KERN_SUCCESS;
19829 	}
19830 	vm_map_unlock(map);
19831 	return kr;
19832 }
19833 
19834 uint64_t vm_map_set_data_limit_count = 0;
19835 kern_return_t
vm_map_set_data_limit(vm_map_t map,uint64_t new_data_limit)19836 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
19837 {
19838 	kern_return_t kr;
19839 
19840 	vm_map_lock(map);
19841 	if (new_data_limit < map->size) {
19842 		/* new limit should not be lower than its current size */
19843 		DTRACE_VM2(vm_map_set_data_limit_fail,
19844 		    vm_map_size_t, map->size,
19845 		    uint64_t, new_data_limit);
19846 		kr = KERN_FAILURE;
19847 	} else if (new_data_limit == map->data_limit) {
19848 		/* no change */
19849 		kr = KERN_SUCCESS;
19850 	} else {
19851 		/* set new limit */
19852 		DTRACE_VM2(vm_map_set_data_limit,
19853 		    vm_map_size_t, map->size,
19854 		    uint64_t, new_data_limit);
19855 		if (new_data_limit != RLIM_INFINITY) {
19856 			vm_map_set_data_limit_count++;
19857 		}
19858 		map->data_limit = new_data_limit;
19859 		kr = KERN_SUCCESS;
19860 	}
19861 	vm_map_unlock(map);
19862 	return kr;
19863 }
19864 
19865 void
vm_map_set_user_wire_limit(vm_map_t map,vm_size_t limit)19866 vm_map_set_user_wire_limit(vm_map_t     map,
19867     vm_size_t    limit)
19868 {
19869 	vm_map_lock(map);
19870 	map->user_wire_limit = limit;
19871 	vm_map_unlock(map);
19872 }
19873 
19874 
19875 void
vm_map_switch_protect(vm_map_t map,boolean_t val)19876 vm_map_switch_protect(vm_map_t     map,
19877     boolean_t    val)
19878 {
19879 	vm_map_lock(map);
19880 	map->switch_protect = val;
19881 	vm_map_unlock(map);
19882 }
19883 
19884 extern int cs_process_enforcement_enable;
19885 boolean_t
vm_map_cs_enforcement(vm_map_t map)19886 vm_map_cs_enforcement(
19887 	vm_map_t map)
19888 {
19889 	if (cs_process_enforcement_enable) {
19890 		return TRUE;
19891 	}
19892 	return map->cs_enforcement;
19893 }
19894 
/*
 * Ask the pmap layer to allow invalid (unvalidated) pages in this map's
 * pmap; returns whatever pmap_cs_allow_invalid() reports.
 */
kern_return_t
vm_map_cs_wx_enable(
	vm_map_t map)
{
	return pmap_cs_allow_invalid(vm_map_pmap(map));
}
19901 
19902 void
vm_map_cs_debugged_set(vm_map_t map,boolean_t val)19903 vm_map_cs_debugged_set(
19904 	vm_map_t map,
19905 	boolean_t val)
19906 {
19907 	vm_map_lock(map);
19908 	map->cs_debugged = val;
19909 	vm_map_unlock(map);
19910 }
19911 
void
vm_map_cs_enforcement_set(
	vm_map_t map,
	boolean_t val)
{
	/*
	 * Update the map's code-signing enforcement flag and mirror it
	 * into the pmap; both writes happen under the map lock so the
	 * two stay in sync.
	 */
	vm_map_lock(map);
	map->cs_enforcement = val;
	pmap_set_vm_map_cs_enforced(map->pmap, val);
	vm_map_unlock(map);
}
19922 
19923 /*
19924  * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
19925  * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
19926  * bump both counters.
19927  */
19928 void
vm_map_iokit_mapped_region(vm_map_t map,vm_size_t bytes)19929 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
19930 {
19931 	pmap_t pmap = vm_map_pmap(map);
19932 
19933 	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
19934 	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
19935 }
19936 
19937 void
vm_map_iokit_unmapped_region(vm_map_t map,vm_size_t bytes)19938 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
19939 {
19940 	pmap_t pmap = vm_map_pmap(map);
19941 
19942 	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
19943 	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
19944 }
19945 
19946 /* Add (generate) code signature for memory range */
19947 #if CONFIG_DYNAMIC_CODE_SIGNING
/*
 * Mark every resident page in [start, end) of "map" as code-signing
 * validated, disconnecting each page from all pmaps so later
 * modifications are noticed.  Fails if any page is absent or busy.
 */
kern_return_t
vm_map_sign(vm_map_t map,
    vm_map_offset_t start,
    vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_page_t m;
	vm_object_t object;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlying object.  Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->vme_start > start) || (entry->vme_end < end)) {
		/*
		 * Map entry doesn't cover the requested range. Not handling
		 * this situation currently.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		/*
		 * Object must already be present or we can't sign.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Hold the object lock across the page walk and drop the map lock.
	 * NOTE(review): "entry" is still dereferenced below after the map
	 * lock is released — presumably safe here, but worth confirming.
	 */
	vm_object_lock(object);
	vm_map_unlock_read(map);

	while (start < end) {
		uint32_t refmod;

		m = vm_page_lookup(object,
		    start - entry->vme_start + VME_OFFSET(entry));
		if (m == VM_PAGE_NULL) {
			/* should we try to fault a page here? we can probably
			 * demand it exists and is locked for this request */
			vm_object_unlock(object);
			return KERN_FAILURE;
		}
		/* deal with special page status */
		if (m->vmp_busy ||
		    (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
			vm_object_unlock(object);
			return KERN_FAILURE;
		}

		/* Page is OK... now "validate" it */
		/* This is the place where we'll call out to create a code
		 * directory, later */
		/* XXX TODO4K: deal with 4k subpages individually? */
		m->vmp_cs_validated = VMP_CS_ALL_TRUE;

		/* The page is now "clean" for codesigning purposes. That means
		 * we don't consider it as modified (wpmapped) anymore. But
		 * we'll disconnect the page so we note any future modification
		 * attempts. */
		m->vmp_wpmapped = FALSE;
		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

		/* Pull the dirty status from the pmap, since we cleared the
		 * wpmapped bit */
		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
			SET_PAGE_DIRTY(m, FALSE);
		}

		/* On to the next page */
		start += PAGE_SIZE;
	}
	vm_object_unlock(object);

	return KERN_SUCCESS;
}
20040 #endif
20041 
/*
 * Reap anonymous ("internal") objects mapped only by this map
 * (ref_count == 1), deleting their entries and reporting how many
 * resident and compressed pages were reclaimed.
 * NOTE(review): the counters are only incremented here — the caller is
 * presumably expected to zero them first; confirm at call sites.
 */
kern_return_t
vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
{
	vm_map_entry_t  entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  next_entry;
	kern_return_t   kr = KERN_SUCCESS;
	VM_MAP_ZAP_DECLARE(zap_list);

	vm_map_lock(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = next_entry) {
		/* capture the successor first: vm_map_delete may remove "entry" */
		next_entry = entry->vme_next;

		if (VME_OBJECT(entry) &&
		    !entry->is_sub_map &&
		    (VME_OBJECT(entry)->internal == TRUE) &&
		    (VME_OBJECT(entry)->ref_count == 1)) {
			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);

			(void)vm_map_delete(map, entry->vme_start,
			    entry->vme_end,
			    VM_MAP_REMOVE_NO_YIELD,
			    &zap_list);
		}
	}

	vm_map_unlock(map);

	/* dispose of everything vm_map_delete queued, outside the map lock */
	vm_map_zap_dispose(&zap_list);

	return kr;
}
20077 
20078 
20079 #if DEVELOPMENT || DEBUG
20080 
/*
 * Remove all pmap mappings for this map (optionally unnesting shared
 * regions first), and return the number of map-sized pages that were
 * resident according to the phys_mem ledger before the teardown.
 */
int
vm_map_disconnect_page_mappings(
	vm_map_t map,
	boolean_t do_unnest)
{
	vm_map_entry_t entry;
	ledger_amount_t byte_count = 0;

	if (do_unnest == TRUE) {
#ifndef NO_NESTED_PMAP
		vm_map_lock(map);

		for (entry = vm_map_first_entry(map);
		    entry != vm_map_to_entry(map);
		    entry = entry->vme_next) {
			if (entry->is_sub_map && entry->use_pmap) {
				/*
				 * Make sure the range between the start of this entry and
				 * the end of this entry is no longer nested, so that
				 * we will only remove mappings from the pmap in use by this
				 * this task
				 */
				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
			}
		}
		vm_map_unlock(map);
#endif
	}
	vm_map_lock_read(map);

	/* snapshot the resident footprint before tearing mappings down */
	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		/* skip object-less and physically contiguous entries */
		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
		    (VME_OBJECT(entry)->phys_contiguous))) {
			continue;
		}
		/* any remaining submap entries must have been unnested above */
		if (entry->is_sub_map) {
			assert(!entry->use_pmap);
		}

		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
	}
	vm_map_unlock_read(map);

	/* convert the ledger byte balance to map pages */
	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
}
20130 
/*
 * (DEVELOPMENT/DEBUG) Inject a decompression error into the
 * compressor-backed page at "vaddr", for fault-injection testing.
 */
kern_return_t
vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
{
	vm_object_t object = NULL;
	vm_object_offset_t offset;
	vm_prot_t prot;
	boolean_t wired;
	vm_map_version_t version;
	vm_map_t real_map;
	int result = KERN_FAILURE;

	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
	vm_map_lock(map);

	/*
	 * Resolve the backing object; "map" and "real_map" may come back
	 * different when the address resolves through a submap (both are
	 * unlocked below).
	 */
	result = vm_map_lookup_locked(&map, vaddr, VM_PROT_READ,
	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
	    NULL, &real_map, NULL);
	if (object == NULL) {
		result = KERN_MEMORY_ERROR;
	} else if (object->pager) {
		/* page is in the compressor: corrupt it there */
		result = vm_compressor_pager_inject_error(object->pager,
		    offset);
	} else {
		/* nothing compressed to inject into */
		result = KERN_MEMORY_PRESENT;
	}

	if (object != NULL) {
		vm_object_unlock(object);
	}

	if (real_map != map) {
		vm_map_unlock(real_map);
	}
	vm_map_unlock(map);

	return result;
}
20168 
20169 #endif
20170 
20171 
20172 #if CONFIG_FREEZE
20173 
20174 
20175 extern struct freezer_context freezer_context_global;
20176 AbsoluteTime c_freezer_last_yield_ts = 0;
20177 
20178 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
20179 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
20180 
/*
 * Freeze a task's VM map: push eligible anonymous pages into the
 * compressor (and, with swap active, out to the freezer swap file).
 *
 * With freezer swap active this runs in two passes: an "evaluation"
 * pass that only totals private vs. shared dirty pages and enforces the
 * shared-memory thresholds, then (unless eval_only) a second pass that
 * actually purges volatile objects and pages memory out.  Returns
 * KERN_NO_SPACE when compressor/swap space runs low, KERN_FAILURE when
 * the shared-memory heuristics reject the task.
 * NOTE(review): *purgeable_count and *clean_count are zeroed here but
 * never incremented in this function — confirm whether callers expect
 * them to be populated.
 */
kern_return_t
vm_map_freeze(
	task_t       task,
	unsigned int *purgeable_count,
	unsigned int *wired_count,
	unsigned int *clean_count,
	unsigned int *dirty_count,
	unsigned int dirty_budget,
	unsigned int *shared_count,
	int          *freezer_error_code,
	boolean_t    eval_only)
{
	vm_map_entry_t  entry2 = VM_MAP_ENTRY_NULL;
	kern_return_t   kr = KERN_SUCCESS;
	boolean_t       evaluation_phase = TRUE;
	vm_object_t     cur_shared_object = NULL;
	int             cur_shared_obj_ref_cnt = 0;
	unsigned int    dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;

	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;

	/*
	 * We need the exclusive lock here so that we can
	 * block any page faults or lookups while we are
	 * in the middle of freezing this vm map.
	 */
	vm_map_t map = task->map;

	vm_map_lock(map);

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	/* bail out early if there is nowhere to put compressed pages */
	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
		if (vm_compressor_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
		}

		if (vm_swap_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
		}

		kr = KERN_NO_SPACE;
		goto done;
	}

	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
		/*
		 * In-memory compressor backing the freezer. No disk.
		 * So no need to do the evaluation phase.
		 */
		evaluation_phase = FALSE;

		if (eval_only == TRUE) {
			/*
			 * We don't support 'eval_only' mode
			 * in this non-swap config.
			 */
			*freezer_error_code = FREEZER_ERROR_GENERIC;
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);
	}
again:
	/* first time through: evaluation pass; second time: real freeze */

	for (entry2 = vm_map_first_entry(map);
	    entry2 != vm_map_to_entry(map);
	    entry2 = entry2->vme_next) {
		vm_object_t     src_object = VME_OBJECT(entry2);

		if (src_object &&
		    !entry2->is_sub_map &&
		    !src_object->phys_contiguous) {
			/* If eligible, scan the entry, moving eligible pages over to our parent object */

			if (src_object->internal == TRUE) {
				if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
					/*
					 * We skip purgeable objects during evaluation phase only.
					 * If we decide to freeze this process, we'll explicitly
					 * purge these objects before we go around again with
					 * 'evaluation_phase' set to FALSE.
					 */

					if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
						/*
						 * We want to purge objects that may not belong to this task but are mapped
						 * in this task alone. Since we already purged this task's purgeable memory
						 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
						 * on this task's purgeable objects. Hence the check for only volatile objects.
						 */
						if (evaluation_phase == FALSE &&
						    (src_object->purgable == VM_PURGABLE_VOLATILE) &&
						    (src_object->ref_count == 1)) {
							vm_object_lock(src_object);
							vm_object_purge(src_object, 0);
							vm_object_unlock(src_object);
						}
						continue;
					}

					/*
					 * Pages belonging to this object could be swapped to disk.
					 * Make sure it's not a shared object because we could end
					 * up just bringing it back in again.
					 *
					 * We try to optimize somewhat by checking for objects that are mapped
					 * more than once within our own map. But we don't do full searches,
					 * we just look at the entries following our current entry.
					 */

					if (src_object->ref_count > 1) {
						if (src_object != cur_shared_object) {
							obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
							dirty_shared_count += obj_pages_snapshot;

							cur_shared_object = src_object;
							cur_shared_obj_ref_cnt = 1;
							continue;
						} else {
							cur_shared_obj_ref_cnt++;
							if (src_object->ref_count == cur_shared_obj_ref_cnt) {
								/*
								 * Fall through to below and treat this object as private.
								 * So deduct its pages from our shared total and add it to the
								 * private total.
								 */

								dirty_shared_count -= obj_pages_snapshot;
								dirty_private_count += obj_pages_snapshot;
							} else {
								continue;
							}
						}
					}


					if (src_object->ref_count == 1) {
						dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
					}

					if (evaluation_phase == TRUE) {
						continue;
					}
				}

				/* real freeze pass: push this object's pages out */
				uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
				*wired_count += src_object->wired_page_count;

				if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
					if (vm_compressor_low_on_space()) {
						*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
					}

					if (vm_swap_low_on_space()) {
						*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
					}

					kr = KERN_NO_SPACE;
					break;
				}
				if (paged_out_count >= dirty_budget) {
					break;
				}
				dirty_budget -= paged_out_count;
			}
		}
	}

	/* report shared dirty memory in MB */
	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
	if (evaluation_phase) {
		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;

		if (dirty_shared_count > shared_pages_threshold) {
			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
			kr = KERN_FAILURE;
			goto done;
		}

		if (dirty_shared_count &&
		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
			kr = KERN_FAILURE;
			goto done;
		}

		/* evaluation passed: reset counters and do the real freeze */
		evaluation_phase = FALSE;
		dirty_shared_count = dirty_private_count = 0;

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);

		if (eval_only) {
			kr = KERN_SUCCESS;
			goto done;
		}

		vm_purgeable_purge_task_owned(task);

		goto again;
	} else {
		kr = KERN_SUCCESS;
	}

done:
	vm_map_unlock(map);

	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
		vm_object_compressed_freezer_done();
	}
	return kr;
}
20395 
20396 #endif
20397 
20398 /*
20399  * vm_map_entry_should_cow_for_true_share:
20400  *
20401  * Determines if the map entry should be clipped and setup for copy-on-write
20402  * to avoid applying "true_share" to a large VM object when only a subset is
20403  * targeted.
20404  *
20405  * For now, we target only the map entries created for the Objective C
20406  * Garbage Collector, which initially have the following properties:
20407  *	- alias == VM_MEMORY_MALLOC
20408  *      - wired_count == 0
20409  *      - !needs_copy
20410  * and a VM object with:
20411  *      - internal
20412  *      - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
20413  *      - !true_share
20414  *      - vo_size == ANON_CHUNK_SIZE
20415  *
20416  * Only non-kernel map entries.
20417  */
20418 boolean_t
vm_map_entry_should_cow_for_true_share(vm_map_entry_t entry)20419 vm_map_entry_should_cow_for_true_share(
20420 	vm_map_entry_t  entry)
20421 {
20422 	vm_object_t     object;
20423 
20424 	if (entry->is_sub_map) {
20425 		/* entry does not point at a VM object */
20426 		return FALSE;
20427 	}
20428 
20429 	if (entry->needs_copy) {
20430 		/* already set for copy_on_write: done! */
20431 		return FALSE;
20432 	}
20433 
20434 	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
20435 	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
20436 		/* not a malloc heap or Obj-C Garbage Collector heap */
20437 		return FALSE;
20438 	}
20439 
20440 	if (entry->wired_count) {
20441 		/* wired: can't change the map entry... */
20442 		vm_counters.should_cow_but_wired++;
20443 		return FALSE;
20444 	}
20445 
20446 	object = VME_OBJECT(entry);
20447 
20448 	if (object == VM_OBJECT_NULL) {
20449 		/* no object yet... */
20450 		return FALSE;
20451 	}
20452 
20453 	if (!object->internal) {
20454 		/* not an internal object */
20455 		return FALSE;
20456 	}
20457 
20458 	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
20459 		/* not the default copy strategy */
20460 		return FALSE;
20461 	}
20462 
20463 	if (object->true_share) {
20464 		/* already true_share: too late to avoid it */
20465 		return FALSE;
20466 	}
20467 
20468 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
20469 	    object->vo_size != ANON_CHUNK_SIZE) {
20470 		/* ... not an object created for the ObjC Garbage Collector */
20471 		return FALSE;
20472 	}
20473 
20474 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
20475 	    object->vo_size != 2048 * 4096) {
20476 		/* ... not a "MALLOC_SMALL" heap */
20477 		return FALSE;
20478 	}
20479 
20480 	/*
20481 	 * All the criteria match: we have a large object being targeted for "true_share".
20482 	 * To limit the adverse side-effects linked with "true_share", tell the caller to
20483 	 * try and avoid setting up the entire object for "true_share" by clipping the
20484 	 * targeted range and setting it up for copy-on-write.
20485 	 */
20486 	return TRUE;
20487 }
20488 
/*
 * Round "offset" up to the boundary described by "mask".
 */
vm_map_offset_t
vm_map_round_page_mask(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	return VM_MAP_ROUND_PAGE(offset, mask);
}
20496 
/*
 * Truncate "offset" down to the boundary described by "mask".
 */
vm_map_offset_t
vm_map_trunc_page_mask(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	return VM_MAP_TRUNC_PAGE(offset, mask);
}
20504 
20505 boolean_t
vm_map_page_aligned(vm_map_offset_t offset,vm_map_offset_t mask)20506 vm_map_page_aligned(
20507 	vm_map_offset_t offset,
20508 	vm_map_offset_t mask)
20509 {
20510 	return ((offset) & mask) == 0;
20511 }
20512 
/*
 * Accessor: the map's page shift (log2 of its page size).
 */
int
vm_map_page_shift(
	vm_map_t map)
{
	return VM_MAP_PAGE_SHIFT(map);
}
20519 
/*
 * Accessor: the map's page size in bytes.
 */
int
vm_map_page_size(
	vm_map_t map)
{
	return VM_MAP_PAGE_SIZE(map);
}
20526 
/*
 * Accessor: the map's page mask (page size minus one).
 */
vm_map_offset_t
vm_map_page_mask(
	vm_map_t map)
{
	return VM_MAP_PAGE_MASK(map);
}
20533 
20534 kern_return_t
vm_map_set_page_shift(vm_map_t map,int pageshift)20535 vm_map_set_page_shift(
20536 	vm_map_t        map,
20537 	int             pageshift)
20538 {
20539 	if (map->hdr.nentries != 0) {
20540 		/* too late to change page size */
20541 		return KERN_FAILURE;
20542 	}
20543 
20544 	map->hdr.page_shift = (uint16_t)pageshift;
20545 
20546 	return KERN_SUCCESS;
20547 }
20548 
20549 kern_return_t
vm_map_query_volatile(vm_map_t map,mach_vm_size_t * volatile_virtual_size_p,mach_vm_size_t * volatile_resident_size_p,mach_vm_size_t * volatile_compressed_size_p,mach_vm_size_t * volatile_pmap_size_p,mach_vm_size_t * volatile_compressed_pmap_size_p)20550 vm_map_query_volatile(
20551 	vm_map_t        map,
20552 	mach_vm_size_t  *volatile_virtual_size_p,
20553 	mach_vm_size_t  *volatile_resident_size_p,
20554 	mach_vm_size_t  *volatile_compressed_size_p,
20555 	mach_vm_size_t  *volatile_pmap_size_p,
20556 	mach_vm_size_t  *volatile_compressed_pmap_size_p)
20557 {
20558 	mach_vm_size_t  volatile_virtual_size;
20559 	mach_vm_size_t  volatile_resident_count;
20560 	mach_vm_size_t  volatile_compressed_count;
20561 	mach_vm_size_t  volatile_pmap_count;
20562 	mach_vm_size_t  volatile_compressed_pmap_count;
20563 	mach_vm_size_t  resident_count;
20564 	vm_map_entry_t  entry;
20565 	vm_object_t     object;
20566 
20567 	/* map should be locked by caller */
20568 
20569 	volatile_virtual_size = 0;
20570 	volatile_resident_count = 0;
20571 	volatile_compressed_count = 0;
20572 	volatile_pmap_count = 0;
20573 	volatile_compressed_pmap_count = 0;
20574 
20575 	for (entry = vm_map_first_entry(map);
20576 	    entry != vm_map_to_entry(map);
20577 	    entry = entry->vme_next) {
20578 		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;
20579 
20580 		if (entry->is_sub_map) {
20581 			continue;
20582 		}
20583 		if (!(entry->protection & VM_PROT_WRITE)) {
20584 			continue;
20585 		}
20586 		object = VME_OBJECT(entry);
20587 		if (object == VM_OBJECT_NULL) {
20588 			continue;
20589 		}
20590 		if (object->purgable != VM_PURGABLE_VOLATILE &&
20591 		    object->purgable != VM_PURGABLE_EMPTY) {
20592 			continue;
20593 		}
20594 		if (VME_OFFSET(entry)) {
20595 			/*
20596 			 * If the map entry has been split and the object now
20597 			 * appears several times in the VM map, we don't want
20598 			 * to count the object's resident_page_count more than
20599 			 * once.  We count it only for the first one, starting
20600 			 * at offset 0 and ignore the other VM map entries.
20601 			 */
20602 			continue;
20603 		}
20604 		resident_count = object->resident_page_count;
20605 		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
20606 			resident_count = 0;
20607 		} else {
20608 			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
20609 		}
20610 
20611 		volatile_virtual_size += entry->vme_end - entry->vme_start;
20612 		volatile_resident_count += resident_count;
20613 		if (object->pager) {
20614 			volatile_compressed_count +=
20615 			    vm_compressor_pager_get_count(object->pager);
20616 		}
20617 		pmap_compressed_bytes = 0;
20618 		pmap_resident_bytes =
20619 		    pmap_query_resident(map->pmap,
20620 		    entry->vme_start,
20621 		    entry->vme_end,
20622 		    &pmap_compressed_bytes);
20623 		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
20624 		volatile_compressed_pmap_count += (pmap_compressed_bytes
20625 		    / PAGE_SIZE);
20626 	}
20627 
20628 	/* map is still locked on return */
20629 
20630 	*volatile_virtual_size_p = volatile_virtual_size;
20631 	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
20632 	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
20633 	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
20634 	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;
20635 
20636 	return KERN_SUCCESS;
20637 }
20638 
20639 void
vm_map_sizes(vm_map_t map,vm_map_size_t * psize,vm_map_size_t * pfree,vm_map_size_t * plargest_free)20640 vm_map_sizes(vm_map_t map,
20641     vm_map_size_t * psize,
20642     vm_map_size_t * pfree,
20643     vm_map_size_t * plargest_free)
20644 {
20645 	vm_map_entry_t  entry;
20646 	vm_map_offset_t prev;
20647 	vm_map_size_t   free, total_free, largest_free;
20648 	boolean_t       end;
20649 
20650 	if (!map) {
20651 		*psize = *pfree = *plargest_free = 0;
20652 		return;
20653 	}
20654 	total_free = largest_free = 0;
20655 
20656 	vm_map_lock_read(map);
20657 	if (psize) {
20658 		*psize = map->max_offset - map->min_offset;
20659 	}
20660 
20661 	prev = map->min_offset;
20662 	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
20663 		end = (entry == vm_map_to_entry(map));
20664 
20665 		if (end) {
20666 			free = entry->vme_end   - prev;
20667 		} else {
20668 			free = entry->vme_start - prev;
20669 		}
20670 
20671 		total_free += free;
20672 		if (free > largest_free) {
20673 			largest_free = free;
20674 		}
20675 
20676 		if (end) {
20677 			break;
20678 		}
20679 		prev = entry->vme_end;
20680 	}
20681 	vm_map_unlock_read(map);
20682 	if (pfree) {
20683 		*pfree = total_free;
20684 	}
20685 	if (plargest_free) {
20686 		*plargest_free = largest_free;
20687 	}
20688 }
20689 
#if VM_SCAN_FOR_SHADOW_CHAIN
int vm_map_shadow_max(vm_map_t map);
/*
 * vm_map_shadow_max:
 *	returns the length of the longest VM object shadow chain among all
 *	the entries mapped in "map" (0 for a NULL map).
 *	Diagnostic only: compiled in under VM_SCAN_FOR_SHADOW_CHAIN.
 */
int
vm_map_shadow_max(
	vm_map_t map)
{
	int             shadows, shadows_max;
	vm_map_entry_t  entry;
	vm_object_t     object, next_object;

	if (map == NULL) {
		return 0;
	}

	shadows_max = 0;

	vm_map_lock_read(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			/* submap entries have no VM object / shadow chain here */
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == NULL) {
			continue;
		}
		/*
		 * Walk the shadow chain with hand-over-hand shared locking:
		 * the next object is locked before the current one is
		 * unlocked, so the chain cannot be torn down under us.
		 */
		vm_object_lock_shared(object);
		for (shadows = 0;
		    object->shadow != NULL;
		    shadows++, object = next_object) {
			next_object = object->shadow;
			vm_object_lock_shared(next_object);
			vm_object_unlock(object);
		}
		vm_object_unlock(object);
		if (shadows > shadows_max) {
			shadows_max = shadows;
		}
	}

	vm_map_unlock_read(map);

	return shadows_max;
}
#endif /* VM_SCAN_FOR_SHADOW_CHAIN */
20737 
/*
 * vm_commit_pagezero_status:
 *	advises the pmap layer of this map's lowest mappable address
 *	("min_offset") via pmap_advise_pagezero_range().
 */
void
vm_commit_pagezero_status(vm_map_t lmap)
{
	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
}
20743 
#if XNU_TARGET_OS_OSX
/*
 * vm_map_set_high_start:
 *	records "high_start" in the map's vmmap_high_start field.
 *	NOTE(review): presumably a lower bound used when choosing addresses
 *	for future allocations in this map — confirm against the readers of
 *	vmmap_high_start.
 */
void
vm_map_set_high_start(
	vm_map_t        map,
	vm_map_offset_t high_start)
{
	map->vmmap_high_start = high_start;
}
#endif /* XNU_TARGET_OS_OSX */
20753 
20754 
20755 /*
20756  * FORKED CORPSE FOOTPRINT
20757  *
20758  * A forked corpse gets a copy of the original VM map but its pmap is mostly
20759  * empty since it never ran and never got to fault in any pages.
20760  * Collecting footprint info (via "sysctl vm.self_region_footprint") for
20761  * a forked corpse would therefore return very little information.
20762  *
20763  * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
20764  * to vm_map_fork() to collect footprint information from the original VM map
20765  * and its pmap, and store it in the forked corpse's VM map.  That information
20766  * is stored in place of the VM map's "hole list" since we'll never need to
20767  * lookup for holes in the corpse's map.
20768  *
20769  * The corpse's footprint info looks like this:
20770  *
20771  * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
20772  * as follows:
20773  *                     +---------------------------------------+
20774  *            header-> | cf_size                               |
20775  *                     +-------------------+-------------------+
20776  *                     | cf_last_region    | cf_last_zeroes    |
20777  *                     +-------------------+-------------------+
20778  *           region1-> | cfr_vaddr                             |
20779  *                     +-------------------+-------------------+
20780  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
20781  *                     +---------------------------------------+
20782  *                     | d4 | d5 | ...                         |
20783  *                     +---------------------------------------+
20784  *                     | ...                                   |
20785  *                     +-------------------+-------------------+
20786  *                     | dy | dz | na | na | cfr_vaddr...      | <-region2
20787  *                     +-------------------+-------------------+
20788  *                     | cfr_vaddr (ctd)   | cfr_num_pages     |
20789  *                     +---------------------------------------+
20790  *                     | d0 | d1 ...                           |
20791  *                     +---------------------------------------+
20792  *                       ...
20793  *                     +---------------------------------------+
20794  *       last region-> | cfr_vaddr                             |
20795  *                     +---------------------------------------+
20796  *                     + cfr_num_pages     | d0 | d1 | d2 | d3 |
20797  *                     +---------------------------------------+
20798  *                       ...
20799  *                     +---------------------------------------+
20800  *                     | dx | dy | dz | na | na | na | na | na |
20801  *                     +---------------------------------------+
20802  *
20803  * where:
20804  *      cf_size:	total size of the buffer (rounded to page size)
20805  *      cf_last_region:	offset in the buffer of the last "region" sub-header
20806  *	cf_last_zeroes: number of trailing "zero" dispositions at the end
20807  *			of last region
20808  *	cfr_vaddr:	virtual address of the start of the covered "region"
20809  *	cfr_num_pages:	number of pages in the covered "region"
20810  *	d*:		disposition of the page at that virtual address
20811  * Regions in the buffer are word-aligned.
20812  *
20813  * We estimate the size of the buffer based on the number of memory regions
20814  * and the virtual size of the address space.  While copying each memory region
20815  * during vm_map_fork(), we also collect the footprint info for that region
20816  * and store it in the buffer, packing it as much as possible (coalescing
20817  * contiguous memory regions to avoid having too many region headers and
20818  * avoiding long streaks of "zero" page dispositions by splitting footprint
 * "regions"), so the number of regions in the footprint buffer might not match
20820  * the number of memory regions in the address space.
20821  *
20822  * We also have to copy the original task's "nonvolatile" ledgers since that's
20823  * part of the footprint and will need to be reported to any tool asking for
20824  * the footprint information of the forked corpse.
20825  */
20826 
/* statistics on corpse footprint collection */
uint64_t vm_map_corpse_footprint_count = 0;     /* footprints collected so far */
uint64_t vm_map_corpse_footprint_size_avg = 0;  /* running average of actual footprint size */
uint64_t vm_map_corpse_footprint_size_max = 0;  /* largest actual footprint size seen */
uint64_t vm_map_corpse_footprint_full = 0;      /* collections aborted: buffer ran out of space */
uint64_t vm_map_corpse_footprint_no_buf = 0;    /* collections aborted: buffer allocation failed */
20832 
/*
 * Header at the start of the corpse footprint buffer; it is immediately
 * followed by a sequence of variable-sized "region" records (see the
 * buffer layout diagram above).
 */
struct vm_map_corpse_footprint_header {
	vm_size_t       cf_size;        /* allocated buffer size */
	uint32_t        cf_last_region; /* offset of last region in buffer */
	union {
		/* the same word is reused for two mutually-exclusive phases: */
		uint32_t cfu_last_zeroes; /* during creation:
		                           * number of "zero" dispositions at
		                           * end of last region */
		uint32_t cfu_hint_region; /* during lookup:
		                           * offset of last looked up region */
#define cf_last_zeroes cfu.cfu_last_zeroes
#define cf_hint_region cfu.cfu_hint_region
	} cfu;
};
/* one-byte encoding of a page disposition (see vm_page_disposition_to_cf_disp()) */
typedef uint8_t cf_disp_t;
/*
 * One contiguous run of page dispositions starting at virtual address
 * "cfr_vaddr"; the per-page disposition bytes are stored inline right
 * after the fixed fields.
 */
struct vm_map_corpse_footprint_region {
	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
	cf_disp_t   cfr_disposition[0]; /* disposition of each page */
} __attribute__((packed));
20852 
20853 static cf_disp_t
vm_page_disposition_to_cf_disp(int disposition)20854 vm_page_disposition_to_cf_disp(
20855 	int disposition)
20856 {
20857 	assert(sizeof(cf_disp_t) == 1);
20858 	/* relocate bits that don't fit in a "uint8_t" */
20859 	if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
20860 		disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20861 	}
20862 	/* cast gets rid of extra bits */
20863 	return (cf_disp_t) disposition;
20864 }
20865 
20866 static int
vm_page_cf_disp_to_disposition(cf_disp_t cf_disp)20867 vm_page_cf_disp_to_disposition(
20868 	cf_disp_t cf_disp)
20869 {
20870 	int disposition;
20871 
20872 	assert(sizeof(cf_disp_t) == 1);
20873 	disposition = (int) cf_disp;
20874 	/* move relocated bits back in place */
20875 	if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
20876 		disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20877 		disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
20878 	}
20879 	return disposition;
20880 }
20881 
20882 /*
20883  * vm_map_corpse_footprint_new_region:
20884  *      closes the current footprint "region" and creates a new one
20885  *
20886  * Returns NULL if there's not enough space in the buffer for a new region.
20887  */
static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(
	struct vm_map_corpse_footprint_header *footprint_header)
{
	uintptr_t       footprint_edge;  /* first byte past the end of the buffer */
	uint32_t        new_region_offset;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;

	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);
	/* locate the current last region in the buffer */
	footprint_region = ((struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region));
	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
	    footprint_edge);

	/* get rid of trailing zeroes in the last region */
	assert(footprint_region->cfr_num_pages >=
	    footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -=
	    footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* reuse this region if it's now empty */
	if (footprint_region->cfr_num_pages == 0) {
		return footprint_region;
	}

	/* compute offset of new region: end of the last region's dispositions... */
	new_region_offset = footprint_header->cf_last_region;
	new_region_offset += sizeof(*footprint_region);
	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
	/* ... rounded up, since regions are kept word-aligned in the buffer */
	new_region_offset = roundup(new_region_offset, sizeof(int));

	/* check if we're going over the edge */
	if (((uintptr_t)footprint_header +
	    new_region_offset +
	    sizeof(*footprint_region)) >=
	    footprint_edge) {
		/* over the edge: no new region */
		return NULL;
	}

	/* adjust offset of last region in header */
	footprint_header->cf_last_region = new_region_offset;

	new_footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);
	new_footprint_region->cfr_vaddr = 0;
	new_footprint_region->cfr_num_pages = 0;
	/* caller needs to initialize new region */

	return new_footprint_region;
}
20944 
20945 /*
20946  * vm_map_corpse_footprint_collect:
20947  *	collect footprint information for "old_entry" in "old_map" and
20948  *	stores it in "new_map"'s vmmap_footprint_info.
20949  */
kern_return_t
vm_map_corpse_footprint_collect(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_map_offset_t va;
	kern_return_t   kr;
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;
	cf_disp_t       *next_disp_p;
	uintptr_t       footprint_edge;  /* first byte past the end of the buffer */
	uint32_t        num_pages_tmp;
	int             effective_page_size;

	/* collect dispositions at the smaller of the kernel and map page sizes */
	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));

	va = old_entry->vme_start;

	/* caller must hold both maps locked exclusively */
	vm_map_lock_assert_exclusive(old_map);
	vm_map_lock_assert_exclusive(new_map);

	assert(new_map->has_corpse_footprint);
	assert(!old_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    old_map->has_corpse_footprint) {
		/*
		 * This can only transfer footprint info from a
		 * map with a live pmap to a map with a corpse footprint.
		 */
		return KERN_NOT_SUPPORTED;
	}

	if (new_map->vmmap_corpse_footprint == NULL) {
		/* first entry for this corpse: allocate the footprint buffer */
		vm_offset_t     buf;
		vm_size_t       buf_size;

		buf = 0;
		/*
		 * Estimate the buffer size from the number of map entries
		 * and the virtual size of the address space (see the buffer
		 * layout described in the block comment above).
		 */
		buf_size = (sizeof(*footprint_header) +
		    (old_map->hdr.nentries
		    *
		    (sizeof(*footprint_region) +
		    +3))            /* potential alignment for each region */
		    +
		    ((old_map->size / effective_page_size)
		    *
		    sizeof(cf_disp_t)));      /* disposition for each page */
//		printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
		buf_size = round_page(buf_size);

		/* limit buffer to 1 page to validate overflow detection */
//		buf_size = PAGE_SIZE;

		/* limit size to a somewhat sane amount */
#if XNU_TARGET_OS_OSX
#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
#else /* XNU_TARGET_OS_OSX */
#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
#endif /* XNU_TARGET_OS_OSX */
		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
		}

		/*
		 * Allocate the pageable buffer (with a trailing guard page).
		 * It will be zero-filled on demand.
		 */
		kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
		    KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
		    VM_KERN_MEMORY_DIAG);
		if (kr != KERN_SUCCESS) {
			vm_map_corpse_footprint_no_buf++;
			return kr;
		}

		/* initialize header and 1st region */
		footprint_header = (struct vm_map_corpse_footprint_header *)buf;
		new_map->vmmap_corpse_footprint = footprint_header;

		footprint_header->cf_size = buf_size;
		footprint_header->cf_last_region =
		    sizeof(*footprint_header);
		footprint_header->cf_last_zeroes = 0;

		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header +
		    footprint_header->cf_last_region);
		footprint_region->cfr_vaddr = 0;
		footprint_region->cfr_num_pages = 0;
	} else {
		/* retrieve header and last region */
		footprint_header = (struct vm_map_corpse_footprint_header *)
		    new_map->vmmap_corpse_footprint;
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header +
		    footprint_header->cf_last_region);
	}
	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);

	/* does this entry continue right where the last region left off? */
	if ((footprint_region->cfr_vaddr +
	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
	    effective_page_size))
	    != old_entry->vme_start) {
		uint64_t num_pages_delta, num_pages_delta_size;
		uint32_t region_offset_delta_size;

		/*
		 * Not the next contiguous virtual address:
		 * start a new region or store "zero" dispositions for
		 * the missing pages?
		 */
		/* size of gap in actual page dispositions */
		num_pages_delta = ((old_entry->vme_start -
		    footprint_region->cfr_vaddr) / effective_page_size)
		    - footprint_region->cfr_num_pages;
		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
		/* size of gap as a new footprint region header */
		region_offset_delta_size =
		    (sizeof(*footprint_region) +
		    roundup(((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
		    sizeof(int)) -
		    ((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
//		printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
		if (region_offset_delta_size < num_pages_delta_size ||
		    os_add3_overflow(footprint_region->cfr_num_pages,
		    (uint32_t) num_pages_delta,
		    1,
		    &num_pages_tmp)) {
			/*
			 * Storing data for this gap would take more space
			 * than inserting a new footprint region header:
			 * let's start a new region and save space. If it's a
			 * tie, let's avoid using a new region, since that
			 * would require more region hops to find the right
			 * range during lookups.
			 *
			 * If the current region's cfr_num_pages would overflow
			 * if we added "zero" page dispositions for the gap,
			 * no choice but to start a new region.
			 */
//			printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
			new_footprint_region =
			    vm_map_corpse_footprint_new_region(footprint_header);
			/* check that we're not going over the edge */
			if (new_footprint_region == NULL) {
				goto over_the_edge;
			}
			footprint_region = new_footprint_region;
			/* initialize new region as empty */
			footprint_region->cfr_vaddr = old_entry->vme_start;
			footprint_region->cfr_num_pages = 0;
		} else {
			/*
			 * Store "zero" page dispositions for the missing
			 * pages.
			 */
//			printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
			for (; num_pages_delta > 0; num_pages_delta--) {
				next_disp_p = (cf_disp_t *)
				    ((uintptr_t) footprint_region +
				    sizeof(*footprint_region));
				next_disp_p += footprint_region->cfr_num_pages;
				/* check that we're not going over the edge */
				if ((uintptr_t)next_disp_p >= footprint_edge) {
					goto over_the_edge;
				}
				/* store "zero" disposition for this gap page */
				footprint_region->cfr_num_pages++;
				*next_disp_p = (cf_disp_t) 0;
				footprint_header->cf_last_zeroes++;
			}
		}
	}

	/* record one disposition byte per page of the entry's range */
	for (va = old_entry->vme_start;
	    va < old_entry->vme_end;
	    va += effective_page_size) {
		int             disposition;
		cf_disp_t       cf_disp;

		vm_map_footprint_query_page_info(old_map,
		    old_entry,
		    va,
		    &disposition);
		cf_disp = vm_page_disposition_to_cf_disp(disposition);

//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);

		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
			/*
			 * Ignore "zero" dispositions at start of
			 * region: just move start of region.
			 */
			footprint_region->cfr_vaddr += effective_page_size;
			continue;
		}

		/* would region's cfr_num_pages overflow? */
		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
		    &num_pages_tmp)) {
			/* overflow: create a new region */
			new_footprint_region =
			    vm_map_corpse_footprint_new_region(
				footprint_header);
			if (new_footprint_region == NULL) {
				goto over_the_edge;
			}
			footprint_region = new_footprint_region;
			footprint_region->cfr_vaddr = va;
			footprint_region->cfr_num_pages = 0;
		}

		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
		    sizeof(*footprint_region));
		next_disp_p += footprint_region->cfr_num_pages;
		/* check that we're not going over the edge */
		if ((uintptr_t)next_disp_p >= footprint_edge) {
			goto over_the_edge;
		}
		/* store this disposition */
		*next_disp_p = cf_disp;
		footprint_region->cfr_num_pages++;

		if (cf_disp != 0) {
			/* non-zero disp: break the current zero streak */
			footprint_header->cf_last_zeroes = 0;
			/* done */
			continue;
		}

		/* zero disp: add to the current streak of zeroes */
		footprint_header->cf_last_zeroes++;
		/*
		 * NOTE(review): this threshold compares the zero streak
		 * (plus the padding a region split would add) against
		 * sizeof(*footprint_header) — presumably a stand-in for the
		 * cost of a new region record; confirm the intended constant.
		 */
		if ((footprint_header->cf_last_zeroes +
		    roundup(((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
		    (sizeof(int) - 1),
		    sizeof(int))) <
		    (sizeof(*footprint_header))) {
			/*
			 * There are not enough trailing "zero" dispositions
			 * (+ the extra padding we would need for the previous
			 * region); creating a new region would not save space
			 * at this point, so let's keep this "zero" disposition
			 * in this region and reconsider later.
			 */
			continue;
		}
		/*
		 * Create a new region to avoid having too many consecutive
		 * "zero" dispositions.
		 */
		new_footprint_region =
		    vm_map_corpse_footprint_new_region(footprint_header);
		if (new_footprint_region == NULL) {
			goto over_the_edge;
		}
		footprint_region = new_footprint_region;
		/* initialize the new region as empty ... */
		footprint_region->cfr_num_pages = 0;
		/* ... and skip this "zero" disp */
		footprint_region->cfr_vaddr = va + effective_page_size;
	}

	return KERN_SUCCESS;

over_the_edge:
	/* ran out of buffer space: footprint info will be incomplete */
//	printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
	vm_map_corpse_footprint_full++;
	return KERN_RESOURCE_SHORTAGE;
}
21224 
21225 /*
21226  * vm_map_corpse_footprint_collect_done:
21227  *	completes the footprint collection by getting rid of any remaining
21228  *	trailing "zero" dispositions and trimming the unused part of the
21229  *	kernel buffer
21230  */
void
vm_map_corpse_footprint_collect_done(
	vm_map_t        new_map)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	vm_size_t       buf_size, actual_size;
	kern_return_t   kr;

	assert(new_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    new_map->vmmap_corpse_footprint == NULL) {
		/* no footprint buffer was collected: nothing to finalize */
		return;
	}

	footprint_header = (struct vm_map_corpse_footprint_header *)
	    new_map->vmmap_corpse_footprint;
	buf_size = footprint_header->cf_size;

	/* locate the last region in the buffer */
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);

	/* get rid of trailing zeroes in last region */
	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* number of bytes of the buffer actually in use */
	actual_size = (vm_size_t)(footprint_header->cf_last_region +
	    sizeof(*footprint_region) +
	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));

//	printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
	/* update the global footprint-size statistics */
	vm_map_corpse_footprint_size_avg =
	    (((vm_map_corpse_footprint_size_avg *
	    vm_map_corpse_footprint_count) +
	    actual_size) /
	    (vm_map_corpse_footprint_count + 1));
	vm_map_corpse_footprint_count++;
	if (actual_size > vm_map_corpse_footprint_size_max) {
		vm_map_corpse_footprint_size_max = actual_size;
	}

	/*
	 * Trim the unused tail of the buffer; the first page past
	 * "actual_size" is kept and re-protected below to serve as the
	 * new trailing guard page.
	 */
	actual_size = round_page(actual_size);
	if (buf_size > actual_size) {
		kr = vm_deallocate(kernel_map,
		    ((vm_address_t)footprint_header +
		    actual_size +
		    PAGE_SIZE),                 /* trailing guard page */
		    (buf_size - actual_size));
		assertf(kr == KERN_SUCCESS,
		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
		/* make the retained extra page inaccessible: guard page */
		kr = vm_protect(kernel_map,
		    ((vm_address_t)footprint_header +
		    actual_size),
		    PAGE_SIZE,
		    FALSE,             /* set_maximum */
		    VM_PROT_NONE);
		assertf(kr == KERN_SUCCESS,
		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
	}

	footprint_header->cf_size = actual_size;
}
21303 
21304 /*
21305  * vm_map_corpse_footprint_query_page_info:
21306  *	retrieves the disposition of the page at virtual address "vaddr"
21307  *	in the forked corpse's VM map
21308  *
21309  * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
21310  */
kern_return_t
vm_map_corpse_footprint_query_page_info(
	vm_map_t        map,
	vm_map_offset_t va,
	int             *disposition_p)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	uint32_t        footprint_region_offset;
	vm_map_offset_t region_start, region_end;
	int             disp_idx;
	kern_return_t   kr;
	int             effective_page_size;
	cf_disp_t       cf_disp;

	if (!map->has_corpse_footprint) {
		/* not a corpse map: no footprint info to query */
		*disposition_p = 0;
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	footprint_header = map->vmmap_corpse_footprint;
	if (footprint_header == NULL) {
		/* footprint was never collected (or already destroyed) */
		*disposition_p = 0;
//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	/* start looking at the hint ("cf_hint_region") */
	footprint_region_offset = footprint_header->cf_hint_region;

	/* dispositions were collected at the smaller of the page sizes */
	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

lookup_again:
	if (footprint_region_offset < sizeof(*footprint_header)) {
		/* hint too low: start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	if (footprint_region_offset >= footprint_header->cf_last_region) {
		/* hint too high: re-start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header + footprint_region_offset);
	region_start = footprint_region->cfr_vaddr;
	region_end = (region_start +
	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
	    effective_page_size));
	if (va < region_start &&
	    footprint_region_offset != sizeof(*footprint_header)) {
		/* our range starts before the hint region */

		/* reset the hint (in a racy way...) */
		footprint_header->cf_hint_region = sizeof(*footprint_header);
		/* lookup "va" again from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
		goto lookup_again;
	}

	/* linear scan forward through the regions until "va" is covered */
	while (va >= region_end) {
		if (footprint_region_offset >= footprint_header->cf_last_region) {
			break;
		}
		/* skip the region's header */
		footprint_region_offset += sizeof(*footprint_region);
		/* skip the region's page dispositions */
		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
		/* align to next word boundary */
		footprint_region_offset =
		    roundup(footprint_region_offset,
		    sizeof(int));
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header + footprint_region_offset);
		region_start = footprint_region->cfr_vaddr;
		region_end = (region_start +
		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
		    effective_page_size));
	}
	if (va < region_start || va >= region_end) {
		/* page not found */
		*disposition_p = 0;
//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_SUCCESS;
		goto done;
	}

	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
	footprint_header->cf_hint_region = footprint_region_offset;

	/* get page disposition for "va" in this region */
	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
	cf_disp = footprint_region->cfr_disposition[disp_idx];
	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
	kr = KERN_SUCCESS;
done:
//	if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
	DTRACE_VM4(footprint_query_page_info,
	    vm_map_t, map,
	    vm_map_offset_t, va,
	    int, *disposition_p,
	    kern_return_t, kr);

	return kr;
}
21417 
21418 void
vm_map_corpse_footprint_destroy(vm_map_t map)21419 vm_map_corpse_footprint_destroy(
21420 	vm_map_t        map)
21421 {
21422 	if (map->has_corpse_footprint &&
21423 	    map->vmmap_corpse_footprint != 0) {
21424 		struct vm_map_corpse_footprint_header *footprint_header;
21425 		vm_size_t buf_size;
21426 		kern_return_t kr;
21427 
21428 		footprint_header = map->vmmap_corpse_footprint;
21429 		buf_size = footprint_header->cf_size;
21430 		kr = vm_deallocate(kernel_map,
21431 		    (vm_offset_t) map->vmmap_corpse_footprint,
21432 		    ((vm_size_t) buf_size
21433 		    + PAGE_SIZE));                 /* trailing guard page */
21434 		assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
21435 		map->vmmap_corpse_footprint = 0;
21436 		map->has_corpse_footprint = FALSE;
21437 	}
21438 }
21439 
21440 /*
21441  * vm_map_copy_footprint_ledgers:
21442  *	copies any ledger that's relevant to the memory footprint of "old_task"
21443  *	into the forked corpse's task ("new_task")
21444  */
21445 void
vm_map_copy_footprint_ledgers(task_t old_task,task_t new_task)21446 vm_map_copy_footprint_ledgers(
21447 	task_t  old_task,
21448 	task_t  new_task)
21449 {
21450 	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
21451 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
21452 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
21453 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
21454 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
21455 	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
21456 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
21457 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
21458 	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
21459 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
21460 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
21461 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
21462 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
21463 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
21464 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
21465 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
21466 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
21467 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
21468 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
21469 	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
21470 }
21471 
21472 /*
21473  * vm_map_copy_ledger:
21474  *	copy a single ledger from "old_task" to "new_task"
21475  */
21476 void
vm_map_copy_ledger(task_t old_task,task_t new_task,int ledger_entry)21477 vm_map_copy_ledger(
21478 	task_t  old_task,
21479 	task_t  new_task,
21480 	int     ledger_entry)
21481 {
21482 	ledger_amount_t old_balance, new_balance, delta;
21483 
21484 	assert(new_task->map->has_corpse_footprint);
21485 	if (!new_task->map->has_corpse_footprint) {
21486 		return;
21487 	}
21488 
21489 	/* turn off sanity checks for the ledger we're about to mess with */
21490 	ledger_disable_panic_on_negative(new_task->ledger,
21491 	    ledger_entry);
21492 
21493 	/* adjust "new_task" to match "old_task" */
21494 	ledger_get_balance(old_task->ledger,
21495 	    ledger_entry,
21496 	    &old_balance);
21497 	ledger_get_balance(new_task->ledger,
21498 	    ledger_entry,
21499 	    &new_balance);
21500 	if (new_balance == old_balance) {
21501 		/* new == old: done */
21502 	} else if (new_balance > old_balance) {
21503 		/* new > old ==> new -= new - old */
21504 		delta = new_balance - old_balance;
21505 		ledger_debit(new_task->ledger,
21506 		    ledger_entry,
21507 		    delta);
21508 	} else {
21509 		/* new < old ==> new += old - new */
21510 		delta = old_balance - new_balance;
21511 		ledger_credit(new_task->ledger,
21512 		    ledger_entry,
21513 		    delta);
21514 	}
21515 }
21516 
21517 /*
21518  * vm_map_get_pmap:
21519  * returns the pmap associated with the vm_map
21520  */
21521 pmap_t
vm_map_get_pmap(vm_map_t map)21522 vm_map_get_pmap(vm_map_t map)
21523 {
21524 	return vm_map_pmap(map);
21525 }
21526 
#if MACH_ASSERT

/* boot-args controlling whether imbalanced ledgers panic (see pmap code) */
extern int pmap_ledgers_panic;
extern int pmap_ledgers_panic_leeway;

/*
 * LEDGER_DRIFT: expands to the set of drift counters kept for one ledger
 * entry: how many pmaps were found over/under balance, the cumulative
 * over/under totals, and the worst single over/under excursion seen.
 * The field names are token-pasted (__LEDGER##_over etc.) and consumed by
 * the LEDGER_CHECK_BALANCE macro in vm_map_pmap_check_ledgers().
 */
#define LEDGER_DRIFT(__LEDGER)                    \
	int             __LEDGER##_over;          \
	ledger_amount_t __LEDGER##_over_total;    \
	ledger_amount_t __LEDGER##_over_max;      \
	int             __LEDGER##_under;         \
	ledger_amount_t __LEDGER##_under_total;   \
	ledger_amount_t __LEDGER##_under_max

/*
 * Global accumulator of ledger drift across all pmaps checked at
 * destruction time; inspected via debugger/coredump, not exported.
 */
struct {
	uint64_t        num_pmaps_checked;

	LEDGER_DRIFT(phys_footprint);
	LEDGER_DRIFT(internal);
	LEDGER_DRIFT(internal_compressed);
	LEDGER_DRIFT(external);
	LEDGER_DRIFT(reusable);
	LEDGER_DRIFT(iokit_mapped);
	LEDGER_DRIFT(alternate_accounting);
	LEDGER_DRIFT(alternate_accounting_compressed);
	LEDGER_DRIFT(page_table);
	LEDGER_DRIFT(purgeable_volatile);
	LEDGER_DRIFT(purgeable_nonvolatile);
	LEDGER_DRIFT(purgeable_volatile_compressed);
	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
	LEDGER_DRIFT(tagged_nofootprint);
	LEDGER_DRIFT(tagged_footprint);
	LEDGER_DRIFT(tagged_nofootprint_compressed);
	LEDGER_DRIFT(tagged_footprint_compressed);
	LEDGER_DRIFT(network_volatile);
	LEDGER_DRIFT(network_nonvolatile);
	LEDGER_DRIFT(network_volatile_compressed);
	LEDGER_DRIFT(network_nonvolatile_compressed);
	LEDGER_DRIFT(media_nofootprint);
	LEDGER_DRIFT(media_footprint);
	LEDGER_DRIFT(media_nofootprint_compressed);
	LEDGER_DRIFT(media_footprint_compressed);
	LEDGER_DRIFT(graphics_nofootprint);
	LEDGER_DRIFT(graphics_footprint);
	LEDGER_DRIFT(graphics_nofootprint_compressed);
	LEDGER_DRIFT(graphics_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint);
	LEDGER_DRIFT(neural_footprint);
	LEDGER_DRIFT(neural_nofootprint_compressed);
	LEDGER_DRIFT(neural_footprint_compressed);
} pmap_ledgers_drift;
21577 
/*
 * vm_map_pmap_check_ledgers:
 *	sanity-check that every per-task ledger entry balances to zero for a
 *	pmap being destroyed ("pid"/"procname" identify the owner, for
 *	diagnostics).  Any non-zero balance is logged and accumulated in
 *	pmap_ledgers_drift; depending on panic_on_negative for the entry and
 *	the pmap_ledgers_panic / pmap_ledgers_panic_leeway boot-args, an
 *	imbalance may also panic.  MACH_ASSERT builds only.
 */
void
vm_map_pmap_check_ledgers(
	pmap_t          pmap,
	ledger_t        ledger,
	int             pid,
	char            *procname)
{
	ledger_amount_t bal;
	boolean_t       do_panic;

	do_panic = FALSE;

	pmap_ledgers_drift.num_pmaps_checked++;

/*
 * LEDGER_CHECK_BALANCE: for one ledger entry, read its balance; if non-zero,
 * decide whether to panic (entry marked panic_on_negative, or drift beyond
 * the configured leeway), log the imbalance, and record it in the matching
 * pmap_ledgers_drift.<entry>_{over,under}* counters via token pasting.
 * Uses "bal" and "do_panic" from the enclosing function scope.
 */
#define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
MACRO_BEGIN                                                             \
	int panic_on_negative = TRUE;                                   \
	ledger_get_balance(ledger,                                      \
	                   task_ledgers.__LEDGER,                       \
	                   &bal);                                       \
	ledger_get_panic_on_negative(ledger,                            \
	                             task_ledgers.__LEDGER,             \
	                             &panic_on_negative);               \
	if (bal != 0) {                                                 \
	        if (panic_on_negative ||                                \
	            (pmap_ledgers_panic &&                              \
	             pmap_ledgers_panic_leeway > 0 &&                   \
	             (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
	              bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
	                do_panic = TRUE;                                \
	        }                                                       \
	        printf("LEDGER BALANCE proc %d (%s) "                   \
	               "\"%s\" = %lld\n",                               \
	               pid, procname, #__LEDGER, bal);                  \
	        if (bal > 0) {                                          \
	                pmap_ledgers_drift.__LEDGER##_over++;           \
	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
	                }                                               \
	        } else if (bal < 0) {                                   \
	                pmap_ledgers_drift.__LEDGER##_under++;          \
	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
	                }                                               \
	        }                                                       \
	}                                                               \
MACRO_END

	/* check every footprint-related ledger entry */
	LEDGER_CHECK_BALANCE(phys_footprint);
	LEDGER_CHECK_BALANCE(internal);
	LEDGER_CHECK_BALANCE(internal_compressed);
	LEDGER_CHECK_BALANCE(external);
	LEDGER_CHECK_BALANCE(reusable);
	LEDGER_CHECK_BALANCE(iokit_mapped);
	LEDGER_CHECK_BALANCE(alternate_accounting);
	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
	LEDGER_CHECK_BALANCE(page_table);
	LEDGER_CHECK_BALANCE(purgeable_volatile);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(tagged_nofootprint);
	LEDGER_CHECK_BALANCE(tagged_footprint);
	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
	LEDGER_CHECK_BALANCE(network_volatile);
	LEDGER_CHECK_BALANCE(network_nonvolatile);
	LEDGER_CHECK_BALANCE(network_volatile_compressed);
	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(media_nofootprint);
	LEDGER_CHECK_BALANCE(media_footprint);
	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(media_footprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_nofootprint);
	LEDGER_CHECK_BALANCE(graphics_footprint);
	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint);
	LEDGER_CHECK_BALANCE(neural_footprint);
	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(neural_footprint_compressed);

	if (do_panic) {
		/* panic only if the pmap_ledgers_panic boot-arg is set */
		if (pmap_ledgers_panic) {
			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
			    pmap, pid, procname);
		} else {
			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
			    pmap, pid, procname);
		}
	}
}
21672 #endif /* MACH_ASSERT */
21673